[llvm-branch-commits] [llvm] AMDGPU: Change ABI of 16-bit element vectors on gfx6/7 (PR #175781)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 13 07:31:37 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Fix ABI on old subtargets to match new subtargets, packing
16-bit element subvectors into 32-bit registers. Previously
this would be scalarized and promoted to i32/float.
Note this only changes the vector cases. Scalar i16/half are
still promoted to i32/float for now. I've unsuccessfully tried
to make that switch in the past, so leave that for later.
This will help with removal of softPromoteHalfType.
---
Patch is 21.22 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175781.diff
157 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll (+17-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll (+74-48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll (+138-160)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+130-115)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll (+2-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll (+4-20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll (+29-21)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll (+18-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+430-359)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll (+89-53)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll (+89-89)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+183-142)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+193-151)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+23-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (+5-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+102-91)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll (+138-160)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+340-290)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll (+77-69)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll (+108-78)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+336-287)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll (+8-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+190-146)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+158-114)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll (+13-28)
- (modified) llvm/test/CodeGen/AMDGPU/abs_i16.ll (+480-518)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+44717-48398)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+2789-2023)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll (+923-650)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+1864-1301)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+1296-912)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+6057-4485)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+1674-1190)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+4865-3712)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+578-487)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+2057-1455)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+3897-2742)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+4677-3284)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+267-212)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+15986-12539)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+6571-7076)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+7768-8457)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+1321-973)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+8787-9729)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+10305-11655)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+11359-13002)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+12473-14142)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+13870-15638)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+1526-1148)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+4623-9054)
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+15-34)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+565-437)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+255-203)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+255-203)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+11-31)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+56-33)
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+5-1)
- (modified) llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll (+342-226)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+14-17)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+50-42)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+259-248)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector.ll (+4-5)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+456-473)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+1236-2026)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1113-1096)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+48-42)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+447-351)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+303-235)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+303-235)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+299-231)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+136-132)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+177-147)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+31-19)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+136-132)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+177-147)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+99-93)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.ll (+10-15)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+20-41)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+42-70)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+11-24)
- (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+52-56)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+68-56)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+40-36)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+150-509)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+117-330)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+1055-851)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+609-489)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+609-489)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+601-481)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+707-584)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll (+16-57)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll (+3-17)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll (+3-31)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll (+8-69)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll (+16-57)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll (+16-57)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+106-110)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+155-153)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+97-193)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+61-49)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+9-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+22-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+140-92)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+166-188)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+166-188)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+147-221)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+396-348)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+283-221)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+296-234)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+296-234)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+283-221)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+26-14)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+372-340)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+99-75)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+449-551)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+912-774)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+449-551)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+912-774)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+85-63)
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+80-60)
- (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+106-72)
- (modified) llvm/test/CodeGen/AMDGPU/saddsat.ll (+43-45)
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+588-521)
- (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+242-194)
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+528-921)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+24-7)
- (modified) llvm/test/CodeGen/AMDGPU/ssubsat.ll (+43-45)
- (modified) llvm/test/CodeGen/AMDGPU/strict_fpext.ll (+14-29)
- (modified) llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll (+13-4)
- (modified) llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll (+79-173)
- (modified) llvm/test/CodeGen/AMDGPU/trunc-combine.ll (+21-62)
- (modified) llvm/test/CodeGen/AMDGPU/uaddsat.ll (+31-32)
- (modified) llvm/test/CodeGen/AMDGPU/usubsat.ll (+37-38)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+68-40)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll (+66-77)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll (+96-97)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll (+96-97)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll (+88-105)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll (+96-97)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll (+88-105)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll (+96-97)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll (+66-43)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll (+64-67)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+107-109)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+107-109)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+86-94)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll (+68-69)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ed5988ee6efc3..49f5d514071e2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1113,7 +1113,7 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
if (Size == 16) {
if (Subtarget->has16BitInsts())
return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
- return VT.isInteger() ? MVT::i32 : MVT::f32;
+ return ScalarVT == MVT::f32 ? MVT::f32 : MVT::i32;
}
if (Size < 16)
@@ -1139,7 +1139,7 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
unsigned Size = ScalarVT.getSizeInBits();
// FIXME: Should probably promote 8-bit vectors to i16.
- if (Size == 16 && Subtarget->has16BitInsts())
+ if (Size == 16)
return (NumElts + 1) / 2;
if (Size <= 32)
@@ -1163,11 +1163,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will be still be
// inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts()) {
- RegisterVT = MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
- IntermediateVT = RegisterVT;
+ if (Size == 16) {
+ MVT SimpleIntermediateVT =
+ MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2));
+ IntermediateVT = SimpleIntermediateVT;
+ RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
NumIntermediates = (NumElts + 1) / 2;
- return NumIntermediates;
+ return (NumElts + 1) / 2;
}
if (Size == 32) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
index d6f1b142b36e0..5c60eb696f6b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
@@ -200,10 +200,15 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: s_add_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s16, s16, s18
-; GFX7-NEXT: s_add_i32 s17, s17, s19
-; GFX7-NEXT: v_mov_b32_e32 v0, s16
-; GFX7-NEXT: v_mov_b32_e32 v1, s17
+; GFX7-NEXT: s_lshr_b32 s4, s16, 16
+; GFX7-NEXT: s_lshr_b32 s5, s17, 16
+; GFX7-NEXT: s_add_i32 s4, s4, s5
+; GFX7-NEXT: s_add_i32 s16, s16, s17
+; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX7-NEXT: s_and_b32 s5, s16, 0xffff
+; GFX7-NEXT: s_lshl_b32 s4, s4, 16
+; GFX7-NEXT: s_or_b32 s4, s5, s4
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_add_v2i16:
@@ -278,8 +283,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
; GFX7-LABEL: v_add_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 814acc3be1fc0..244d006844a09 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -9,8 +9,14 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
; GFX7-LABEL: v_add_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16:
@@ -40,13 +46,15 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
; GFX7-LABEL: v_add_v2i16_fneg_lhs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_lhs:
@@ -79,13 +87,15 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
; GFX7-LABEL: v_add_v2i16_fneg_rhs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_rhs:
@@ -118,18 +128,16 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
@@ -165,8 +173,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -197,8 +210,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
@@ -230,8 +248,13 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
@@ -262,6 +285,7 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_sub_i32 s1, s1, 64
; GFX7-NEXT: s_sub_i32 s0, s0, 64
; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
@@ -304,6 +328,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_add_i32 s1, s1, 4
; GFX7-NEXT: s_sub_i32 s0, s0, 64
; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
@@ -346,6 +371,7 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_sub_i32 s1, s1, 64
; GFX7-NEXT: s_add_i32 s0, s0, 4
; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
@@ -388,9 +414,11 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: s_add_v2i16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_add_i32 s1, s1, s3
-; GFX7-NEXT: s_add_i32 s0, s0, s2
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT: s_lshr_b32 s2, s0, 16
+; GFX7-NEXT: s_lshr_b32 s3, s1, 16
+; GFX7-NEXT: s_add_i32 s2, s2, s3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_or_b32 s0, s0, s1
@@ -433,14 +461,12 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: s_add_v2i16_fneg_lhs:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_lshl_b32 s1, s1, 16
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
-; GFX7-NEXT: s_lshr_b32 s1, s0, 16
-; GFX7-NEXT: s_add_i32 s1, s1, s3
-; GFX7-NEXT: s_add_i32 s0, s0, s2
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT: s_lshr_b32 s2, s0, 16
+; GFX7-NEXT: s_lshr_b32 s3, s1, 16
+; GFX7-NEXT: s_add_i32 s2, s2, s3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_or_b32 s0, s0, s1
@@ -488,14 +514,12 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
; GFX7-LABEL: s_add_v2i16_fneg_rhs:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_lshl_b32 s3, s3, 16
-; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX7-NEXT: s_or_b32 s2, s3, s2
-; GFX7-NEXT: s_xor_b32 s2, s2, 0x80008000
-; GFX7-NEXT: s_lshr_b32 s3, s2, 16
-; GFX7-NEXT: s_add_i32 s1, s1, s3
-; GFX7-NEXT: s_add_i32 s0, s0, s2
-; GFX7-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000
+; GFX7-NEXT: s_lshr_b32 s2, s0, 16
+; GFX7-NEXT: s_lshr_b32 s3, s1, 16
+; GFX7-NEXT: s_add_i32 s2, s2, s3
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: s_and_b32 s1, s2, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_or_b32 s0, s0, s1
@@ -543,12 +567,6 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
; GFX7-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_lshl_b32 s1, s1, 16
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT: s_or_b32 s0, s1, s0
-; GFX7-NEXT: s_lshl_b32 s1, s3, 16
-; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX7-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX7-NEXT: s_lshr_b32 s2, s0, 16
@@ -609,7 +627,11 @@ define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) {
; GFX7-LABEL: add_inline_imm_neg1_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, -1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_inline_imm_neg1_0:
@@ -640,7 +662,11 @@ define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) {
; GFX7-LABEL: add_inline_imm_1_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_inline_imm_1_0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 22b63a7de5f89..29a688ccf280d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -513,14 +513,8 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s3, 16
-; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s5, 16
-; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_xor_b32 s1, s1, -1
-; GFX6-NEXT: s_and_b32 s0, s0, s1
+; GFX6-NEXT: s_xor_b32 s0, s3, -1
+; GFX6-NEXT: s_and_b32 s0, s2, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16:
@@ -546,14 +540,8 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s3, 16
-; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s5, 16
-; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_xor_b32 s1, s1, -1
-; GFX6-NEXT: s_and_b32 s0, s1, s0
+; GFX6-NEXT: s_xor_b32 s0, s3, -1
+; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_commute:
@@ -579,14 +567,8 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s3, 16
-; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s5, 16
-; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_xor_b32 s1, s1, -1
-; GFX6-NEXT: s_and_b32 s0, s0, s1
+; GFX6-NEXT: s_xor_b32 s1, s3, -1
+; GFX6-NEXT: s_and_b32 s0, s2, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_use:
@@ -619,18 +601,9 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s3, 16
-; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_lshl_b32 s1, s5, 16
-; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT: s_or_b32 s1, s1, s2
-; GFX6-NEXT: s_lshl_b32 s2, s7, 16
-; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_xor_b32 s2, s2, -1
-; GFX6-NEXT: s_and_b32 s0, s0, s2
-; GFX6-NEXT: s_and_b32 s1, s1, s2
+; GFX6-NEXT: s_xor_b32 s1, s4, -1
+; GFX6-NEXT: s_and_b32 s0, s2, s1
+; GFX6-NEXT: s_and_b32 s1, s3, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
@@ -662,26 +635,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg
}
define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
-; GFX6-LABEL: v_andn2_v2i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX6-NEXT: v_and_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_andn2_v2i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
-; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_andn2_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
+; GCN-NEXT: v_and_b32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_v2i16:
; GFX10PLUS: ; %bb.0:
@@ -698,19 +657,19 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v3i16:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_lshr_b32 s7, s4, 16
; GFX6-NEXT: s_mov_b32 s0, -1
-; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX6-NEXT: s_lshl_b32 s6, s6, 16
-; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX6-NEXT: s_lshl_b32 s7, s7, 16
+; GFX6-NEXT: s_lshr_b32 s6, s2, 16
; GFX6-NEXT: s_mov_b32 s1, 0xffff
-; GFX6-NEXT: s_or_b32 s6, s5, s6
-; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_or_b32 s4, s4, s7
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX6-NEXT: s_lshl_b32 s3, s3, 16
-; GFX6-NEXT: s_xor_b...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/175781
More information about the llvm-branch-commits
mailing list