[llvm] 46adccc - [AMDGPU] Improve Codegen for build_vector
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed May 12 06:17:51 PDT 2021
Author: Julien Pagès
Date: 2021-05-12T14:17:44+01:00
New Revision: 46adccc5cc1095f57b65fb2a17a4a023ccc77eb9
URL: https://github.com/llvm/llvm-project/commit/46adccc5cc1095f57b65fb2a17a4a023ccc77eb9
DIFF: https://github.com/llvm/llvm-project/commit/46adccc5cc1095f57b65fb2a17a4a023ccc77eb9.diff
LOG: [AMDGPU] Improve Codegen for build_vector
Improve the code generation of v2f16 build_vector:
when both operands are known to be canonicalized, select
a single v_pack_b32_f16 instead of v_and_b32 + v_lshl_or_b32.
Differential Revision: https://reviews.llvm.org/D98081
Patch by Julien Pagès!
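As an illustration, here is a minimal IR sketch (mirroring the new
v_pack.ll tests, not taken verbatim from the commit) of the pattern
that now selects the packed form on gfx9; the register numbers in
the before/after comments are illustrative:

define <2 x half> @pack_example(half %a, half %b) {
  ; fadd results are canonicalized, so the new is_canonicalized
  ; predicate accepts both build_vector operands
  %a.add = fadd half %a, 2.0
  %b.add = fadd half %b, 2.0
  %vec.0 = insertelement <2 x half> undef, half %a.add, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %b.add, i32 1
  ; before: v_and_b32_e32  v0, 0xffff, v0
  ;         v_lshl_or_b32  v0, v1, 16, v0
  ; after:  v_pack_b32_f16 v0, v0, v1
  ret <2 x half> %vec.1
}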
Added:
llvm/test/CodeGen/AMDGPU/v_pack.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
llvm/test/CodeGen/AMDGPU/fexp.ll
llvm/test/CodeGen/AMDGPU/fpow.ll
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
llvm/test/CodeGen/AMDGPU/frem.ll
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.round.ll
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 9b177088a2aa..fcf0d77d52da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -185,6 +185,28 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
}];
}
+class is_canonicalized<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
+
+ return Lowering.isCanonicalized(*CurDAG, N->getOperand(0)) &&
+ Lowering.isCanonicalized(*CurDAG, N->getOperand(1));
+ }]> {
+
+ // TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class
+ let GISelPredicateCode = [{
+ const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+ MF.getSubtarget().getTargetLowering());
+
+ return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
+ TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+ }];
+}
+
+
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
def smin_oneuse : HasOneUseBinOp<smin>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5eca427052bc..04c5a8733dfb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9610,19 +9610,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// Could be anything.
return false;
- case ISD::BITCAST: {
+ case ISD::BITCAST:
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::TRUNCATE: {
// Hack round the mess we make when legalizing extract_vector_elt
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::i16 &&
- Src.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncSrc = Src.getOperand(0);
+ if (Op.getValueType() == MVT::i16) {
+ SDValue TruncSrc = Op.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}
}
-
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8f6ccd825017..adf168e632c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2280,9 +2280,13 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+def : GCNPat <
+ (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+ (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+ (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
+>;
} // End SubtargetPredicate = HasVOP3PInsts
-
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index e8ec9a0c8300..1af86dda4850 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -288,8 +288,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
-; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
-; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 99251ef866ac..a62bbac8ff87 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -552,7 +552,7 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX9-NEXT: s_setpc_b64
; High bits known zero
@@ -635,9 +635,7 @@ define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
; GFX9: s_waitcnt
; GFX9-DAG: v_max_f16_e32 v0, v0, v0
-; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
-; GFX9: v_and_b32_e32 v0, 0xffff, v0
-; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
+; GFX9: v_pack_b32_f16 v0, v0, 2.0
; GFX9: s_setpc_b64
; VI: s_waitcnt
@@ -653,8 +651,7 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
; GFX9: v_max_f16_e32 v0, v0, v0
-; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
-; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
+; GFX9: v_pack_b32_f16 v0, 2.0, v0
; GFX9: s_setpc_b64
; VI: s_waitcnt
@@ -680,8 +677,8 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_setpc_b64
; VI: s_waitcnt
@@ -721,7 +718,7 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, 0
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll
index d03233b3af0c..52e177a3adcc 100644
--- a/llvm/test/CodeGen/AMDGPU/fexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fexp.ll
@@ -137,8 +137,7 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
; GFX9-NEXT: v_pk_mul_f16 v0, v0, [[SREG]] op_sel_hi:[1,0]
; GFX9-NEXT: v_exp_f16_e32 v1, v0
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x half> @llvm.exp.v2f16(<2 x half> %arg0)
ret <2 x half> %result
@@ -198,14 +197,11 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; GFX9-NEXT: v_mul_f16_sdwa [[MUL3:v[0-9]+]], v1, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_mul_f16_sdwa [[MUL4:v[0-9]+]], v0, [[SREG]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_exp_f16_e32 [[EXP1:v[0-9]+]], [[MUL1]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL2]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL4]]
-; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL3]]
-; GFX9-NEXT: v_mov_b32_e32 [[VCONST:v[0-9]+]], 0xffff
-; GFX9-NEXT: v_and_b32_e32 [[AND1:v[0-9]+]], [[VCONST]], [[EXP2]]
-; GFX9-NEXT: v_and_b32_e32 [[AND2:v[0-9]+]], [[VCONST]], [[EXP1]]
-; GFX9-NEXT: v_lshl_or_b32 v0, [[EXP3]], 16, [[AND1]]
-; GFX9-NEXT: v_lshl_or_b32 v1, [[EXP4]], 16, [[AND2]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP2:v[0-9]+]], [[MUL3]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP3:v[0-9]+]], [[MUL2]]
+; GFX9-NEXT: v_exp_f16_e32 [[EXP4:v[0-9]+]], [[MUL4]]
+; GFX9-NEXT: v_pack_b32_f16 v1, [[EXP1]], [[EXP2]]
+; GFX9-NEXT: v_pack_b32_f16 v0, [[EXP3]], [[EXP4]]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x half> @llvm.exp.v4f16(<4 x half> %arg0)
ret <4 x half> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index 2e58cdfa48ee..8c9d2c09bf58 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -190,21 +190,20 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
@@ -213,8 +212,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
ret <2 x half> %pow
@@ -274,31 +272,29 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f32_e32 v2, v2
-; GFX10-NEXT: v_log_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT: v_log_f32_e32 v2, v2
+; GFX10-NEXT: v_log_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v2
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -359,31 +355,29 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_log_f32_e32 v2, v2
-; GFX10-NEXT: v_log_f32_e32 v0, v0
-; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT: v_log_f32_e32 v2, v2
+; GFX10-NEXT: v_log_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_exp_f32_e32 v1, v2
+; GFX10-NEXT: v_exp_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%y.fneg = fneg <2 x half> %y
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -449,21 +443,20 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
@@ -472,8 +465,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%y.fneg = fneg <2 x half> %y
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index a722042d7c50..be53ffe52918 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -44,8 +44,7 @@ entry:
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index e465320da0bd..1c460656073b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1514,8 +1514,7 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v4, v4
; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1550,8 +1549,7 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
<2 x half> addrspace(1)* %in2) #0 {
@@ -1862,29 +1860,26 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
; GFX9-NEXT: v_trunc_f16_e32 v6, v6
; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
+; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX9-NEXT: v_trunc_f16_e32 v3, v3
+; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX9-NEXT: v_and_b32_e32 v5, v3, v5
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX9-NEXT: v_rcp_f32_e32 v6, v6
; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
; GFX9-NEXT: v_trunc_f16_e32 v5, v5
-; GFX9-NEXT: v_fma_f16 v5, -v5, v2, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-NEXT: v_rcp_f32_e32 v7, v7
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX9-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX9-NEXT: v_trunc_f16_e32 v6, v6
-; GFX9-NEXT: v_fma_f16 v0, -v6, v2, v0
-; GFX9-NEXT: v_and_b32_e32 v2, v3, v5
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -1919,30 +1914,27 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX10-NEXT: v_and_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v5
-; GFX10-NEXT: v_rcp_f32_e32 v6, v6
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
+; GFX10-NEXT: v_rcp_f32_e32 v5, v5
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v2
+; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX10-NEXT: v_rcp_f32_e32 v7, v7
-; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fmac_f16_e64 v0, -v5, v2
-; GFX10-NEXT: v_and_b32_e32 v2, v3, v6
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT: v_rcp_f32_e32 v6, v6
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
+; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x half> addrspace(1)* %in2) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 74791624292a..0c8d303f497a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -135,10 +135,9 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v3, v3
+; GFX9-NEXT: v_cos_f16_e32 v2, v3
; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -146,16 +145,15 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_cos_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_cos_f16_e32 v2, v3
; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
index 31933270436a..0b47c934ee79 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -55,8 +55,7 @@ entry:
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; VI-NOT: v_and_b32_e32
; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
-; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]
-; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
+; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
; SI: buffer_store_dword v[[R_F32_5]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
index 608331c56c6a..2851b270d93b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -55,8 +55,7 @@ entry:
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; VI-NOT: v_and_b32_e32
; VI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_0]], v[[R_F16_2]]
-; GFX9: v_and_b32_e32 v[[R_F32_4:[0-9]+]], 0xffff, v[[R_F32_3]]
-; GFX9: v_lshl_or_b32 v[[R_F32_5:[0-9]+]], v[[R_F32_2]], 16, v[[R_F32_4]]
+; GFX9: v_pack_b32_f16 v[[R_F32_5:[0-9]+]], v[[R_F32_3]], v[[R_F32_2]]
; SI: buffer_store_dword v[[R_F32_5]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[R_F32_5]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index de87308f685c..5b3d5137ac1d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -43,8 +43,7 @@ entry:
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index f0dcd721cb9c..ae029f56400b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -87,8 +87,7 @@ define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],
; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],
-; GFX9: v_and_b32_e32
-; GFX9: v_lshl_or_b32
+; GFX9: v_pack_b32_f16
define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {
%in = bitcast i32 %in.arg to <2 x half>
%result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index f2e444880504..3e8b8e5e8bb2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -135,10 +135,9 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v2, v3
; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -146,16 +145,15 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0x3118
+; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_f16_e32 v2, 0.15915494, v1
-; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_sin_f16_e32 v2, v2
+; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sin_f16_e32 v2, v3
; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index f08ed4843417..0cf1a5f149f0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -225,10 +225,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX9: v_cvt_f16_f32_e32 v0, v0
; GFX9: v_cvt_f16_f32_e32 v1, v3
-; GFX9: v_and_b32_e32 v0, 0xffff, v0
-; GFX9: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9: v_pack_b32_f16 v0, v0, v1
; GFX9: s_setpc_b64
define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -247,11 +246,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src0.ext = fpext <3 x half> %src0 to <3 x float>
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
new file mode 100644
index 000000000000..940dc967edc0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_movk_i32 s0, 0x4000
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+ %v0 = load volatile half, half addrspace(1)* %in0.gep
+ %v1 = load volatile half, half addrspace(1)* %in1.gep
+ %v0.add = fadd half %v0, 2.0
+ %v1.add = fadd half %v1, 2.0
+ %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32_v2f16_sub:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32_v2f16_sub:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x4000
+; GISEL-NEXT: v_add_f16_e32 v1, -2.0, v1
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+ %v0 = load volatile half, half addrspace(1)* %in0.gep
+ %v1 = load volatile half, half addrspace(1)* %in1.gep
+ %v0.add = fsub half %v0, 2.0
+ %v1.add = fadd half %v1, 2.0
+ %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+define amdgpu_kernel void @fptrunc(
+; GCN-LABEL: fptrunc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0x31016000
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: fptrunc:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GISEL-NEXT: v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-NEXT: s_endpgm
+ <2 x half> addrspace(1)* %r,
+ <2 x float> addrspace(1)* %a) {
+ %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
+ %r.val = fptrunc <2 x float> %a.val to <2 x half>
+ store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+ ret void
+}
+
+define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32.fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32.fabs:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_movk_i32 s0, 0x7fff
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_and_b32_e32 v0, s0, v0
+; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+ %v0 = load volatile half, half addrspace(1)* %in0.gep
+ %v1 = load volatile half, half addrspace(1)* %in1.gep
+ %v0.add = fadd half %v0, 2.0
+ %v1.add = fadd half %v1, 2.0
+ %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
+ %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
+ %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
+; GCN-LABEL: v_pack_b32.fneg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use v0
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_endpgm
+;
+; GISEL-LABEL: v_pack_b32.fneg:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_waitcnt_depctr 0xffe3
+; GISEL-NEXT: s_mov_b32 s0, 0x8000
+; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
+; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
+; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0
+; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: ; use v0
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
+ %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
+ %v0 = load volatile half, half addrspace(1)* %in0.gep
+ %v1 = load volatile half, half addrspace(1)* %in1.gep
+ %v0.add = fadd half %v0, 2.0
+ %v1.add = fadd half %v1, 2.0
+ %v0.fneg = fsub half -0.0, %v0.add
+ %v1.fneg = fsub half -0.0, %v1.add
+ %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
+ %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
+ %vec.i32 = bitcast <2 x half> %vec.1 to i32
+ call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+