[llvm] d825850 - [AMDGPU][GISel] Update `isCanonicalized`
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 30 07:13:44 PDT 2022
Author: Pierre van Houtryve
Date: 2022-09-30T14:13:35Z
New Revision: d8258508d49845c577db635ef14ef506df02e5e6
URL: https://github.com/llvm/llvm-project/commit/d8258508d49845c577db635ef14ef506df02e5e6
DIFF: https://github.com/llvm/llvm-project/commit/d8258508d49845c577db635ef14ef506df02e5e6.diff
LOG: [AMDGPU][GISel] Update `isCanonicalized`
Recognize more opcodes in `SITargetLowering::isCanonicalized`.
Also fixes some fdiv.f16 regressions introduced in D134857.
Depends on D134857
Reviewed By: arsenm, foad
Differential Revision: https://reviews.llvm.org/D134862
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
llvm/test/CodeGen/AMDGPU/v_pack.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b79bdec97876..f757ca16ef3e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10188,19 +10188,87 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
return false;
switch (Opcode) {
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ case AMDGPU::G_FMUL:
+ case AMDGPU::G_FCEIL:
+ case AMDGPU::G_FFLOOR:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ case AMDGPU::G_FSQRT:
+ case AMDGPU::G_FDIV:
+ case AMDGPU::G_FREM:
+ case AMDGPU::G_FPOW:
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_FLOG:
+ case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FLOG10:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+ return true;
+ case AMDGPU::G_FNEG:
+ case AMDGPU::G_FABS:
+ case AMDGPU::G_FCOPYSIGN:
+ return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE: {
if (Subtarget->supportsMinMaxDenormModes() ||
denormalsEnabledForType(MRI.getType(Reg), MF))
return true;
+
+ [[fallthrough]];
+ }
+ case AMDGPU::G_BUILD_VECTOR:
for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
return false;
return true;
- }
+ case AMDGPU::G_INTRINSIC:
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fmad_ftz:
+ case Intrinsic::amdgcn_sqrt:
+ case Intrinsic::amdgcn_fmed3:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_div_scale:
+ case Intrinsic::amdgcn_div_fmas:
+ case Intrinsic::amdgcn_div_fixup:
+ case Intrinsic::amdgcn_fract:
+ case Intrinsic::amdgcn_ldexp:
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cubema:
+ case Intrinsic::amdgcn_cubesc:
+ case Intrinsic::amdgcn_cubetc:
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_trig_preop:
+ return true;
+ default:
+ break;
+ }
+
+ [[fallthrough]];
default:
- return denormalsEnabledForType(MRI.getType(Reg), MF) &&
- isKnownNeverSNaN(Reg, MRI);
+ return false;
}
llvm_unreachable("invalid operation");
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 27a41368f1f4..b5afef282f65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -473,8 +473,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul:
@@ -506,13 +506,11 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul <4 x half> %x, %y
@@ -550,8 +548,8 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul_rhs:
@@ -583,13 +581,11 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index ffa396021ddc..0ba1a9b4002e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -261,8 +261,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -294,13 +294,11 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DENORM-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-DENORM-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
index 1fe8dd5cfe28..a8f17b86f162 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-foldable-fneg.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK
---
name: test_fminnum
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 0af76dd62422..7c07a922374e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -6,14 +6,14 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define half @v_fdiv_f16(half %a, half %b) {
; GFX6-IEEE-LABEL: v_fdiv_f16:
@@ -678,12 +678,12 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX9-NEXT: v_rcp_f32_e32 v5, v5
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
@@ -691,9 +691,8 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16:
@@ -714,8 +713,7 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16:
@@ -736,8 +734,7 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> %a, %b
ret <2 x half> %fdiv
@@ -771,77 +768,40 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX10-IEEE: ; %bb.0:
-; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX11-IEEE: ; %bb.0:
-; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_fdiv_v2f16_afn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_afn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_afn:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -943,12 +903,12 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX9-NEXT: v_rcp_f32_e32 v5, v5
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
@@ -956,9 +916,8 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_ulp25:
@@ -979,8 +938,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_ulp25:
@@ -1001,8 +959,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
@@ -1102,20 +1059,19 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16:
@@ -1134,28 +1090,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
@@ -1255,20 +1210,19 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp:
@@ -1287,28 +1241,27 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
@@ -1344,7 +1297,7 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v0
; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
@@ -1353,20 +1306,18 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rcp_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_rcp_f16_e32 v1, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
@@ -1458,7 +1409,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v0
; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_ulp25:
@@ -1467,20 +1418,18 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rcp_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_ulp25:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_rcp_f16_e32 v1, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rcp_f16_e32 v0, v0
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
ret <2 x half> %fdiv
@@ -1514,77 +1463,40 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX10-IEEE: ; %bb.0:
-; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX11-IEEE: ; %bb.0:
-; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
}
@@ -1686,12 +1598,12 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX9-NEXT: v_rcp_f32_e32 v5, v5
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
@@ -1699,9 +1611,8 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -1722,8 +1633,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -1744,8 +1654,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
@@ -1779,77 +1688,40 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX10-IEEE: ; %bb.0:
-; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-IEEE-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-IEEE-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-IEEE-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-FLUSH-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-FLUSH-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-FLUSH-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-FLUSH-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX11-IEEE: ; %bb.0:
-; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-IEEE-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-IEEE-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-FLUSH-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FLUSH-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index c28bb26ba959..e263c2e5be17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -253,7 +253,7 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16:
@@ -262,8 +262,7 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f16_e32 v1, v0
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v2f16:
@@ -273,8 +272,7 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
ret <2 x half> %roundeven
@@ -330,7 +328,7 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16_fneg:
@@ -340,8 +338,7 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: v_rndne_f16_e32 v1, v0
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v2f16_fneg:
@@ -352,8 +349,7 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
@@ -416,8 +412,8 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_rndne_f16_e32 v3, v1
; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v4f16:
@@ -425,13 +421,11 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
-; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_roundeven_v4f16:
@@ -444,10 +438,8 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_rndne_f16_e32 v2, v2
; GFX11-NEXT: v_rndne_f16_e32 v3, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
ret <4 x half> %roundeven
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index ac783fa72b9c..e17d38cff633 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -172,10 +172,7 @@ define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspa
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
More information about the llvm-commits
mailing list