[llvm] 7388520 - [GISel] Add more cases to isKnownNeverNaN
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 30 07:11:05 PDT 2022
Author: Pierre van Houtryve
Date: 2022-09-30T14:10:56Z
New Revision: 7388520d1c1810cb06fd71ce0c82dc5bf44d052a
URL: https://github.com/llvm/llvm-project/commit/7388520d1c1810cb06fd71ce0c82dc5bf44d052a
DIFF: https://github.com/llvm/llvm-project/commit/7388520d1c1810cb06fd71ce0c82dc5bf44d052a.diff
LOG: [GISel] Add more cases to isKnownNeverNaN
Bring it to parity with the SelectionDAG implementation as of D134854.
Reviewed By: arsenm, foad
Differential Revision: https://reviews.llvm.org/D134857
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/Utils.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
llvm/test/CodeGen/AMDGPU/v_pack.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 4e61fd98f79d..92368ab9beb7 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -658,6 +658,20 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
switch (DefMI->getOpcode()) {
default:
break;
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
+ if (SNaN)
+ return true;
+
+ // TODO: Need isKnownNeverInfinity
+ return false;
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE: {
if (SNaN)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index 4c4d6a4ea986..2857c916dd31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -105,8 +105,7 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0
+; GFX10-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 566db10ae284..7f8725e7f0f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -205,7 +205,6 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
; GFX10-NEXT: v_med3_f32 v0, v0, 0, 1.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
@@ -222,7 +221,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0
; GFX10-NEXT: v_max_f32_e32 v0, 0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
index 2b4ee22ad330..733be82403cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -254,8 +254,7 @@ body: |
; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16)
; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat
- ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]]
- ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FCANONICALIZE]]
+ ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]]
; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]]
; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
@@ -306,8 +305,7 @@ body: |
; CHECK-NEXT: %qnan_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %qnan_s32(s32), %undef(s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat
; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef
- ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]]
- ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FCANONICALIZE]]
+ ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]]
; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]]
; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 6520a613fae7..27a41368f1f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -453,8 +453,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -484,13 +484,11 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -532,8 +530,8 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -563,13 +561,11 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 7a8df8f636a7..ffa396021ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -241,8 +241,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -272,13 +272,11 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 2bc553146f5d..0af76dd62422 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -6,14 +6,14 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
define half @v_fdiv_f16(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16:
@@ -771,42 +771,77 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_v2f16_afn:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_afn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_afn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX9-IEEE: ; %bb.0:
+; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX11-IEEE: ; %bb.0:
+; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX11-FLUSH: ; %bb.0:
+; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -1479,42 +1514,77 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9-IEEE: ; %bb.0:
+; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11-IEEE: ; %bb.0:
+; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11-FLUSH: ; %bb.0:
+; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
}
@@ -1709,42 +1779,77 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9-IEEE: ; %bb.0:
+; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11-IEEE: ; %bb.0:
+; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11-FLUSH: ; %bb.0:
+; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index d5c51eebe820..2e9a66c579cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -146,7 +146,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
@@ -186,7 +185,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -208,7 +206,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
@@ -231,7 +228,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v2
; GFX10-NEXT: v_min_f32_e32 v2, v4, v3
@@ -251,11 +247,9 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v4, v1, v2
; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
index e1f33d0563e9..8025e8fc8129 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
@@ -242,12 +242,8 @@ body: |
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[COPY2]](s32), [[COPY3]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
%0:vgpr(s32) = COPY $vgpr0
%2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
%8:vgpr(s32) = COPY %2(s32)
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index d59ea23772b1..ac783fa72b9c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -33,8 +33,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrsp
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
@@ -83,8 +82,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half ad
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
@@ -230,8 +228,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
; GISEL-NEXT: v_sub_f16_e32 v0, 0x8000, v0
; GISEL-NEXT: v_sub_f16_e32 v1, 0x8000, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
More information about the llvm-commits
mailing list