[llvm] [AMDGPU] Promote uniform ops to i32 in GISel (PR #106557)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 01:34:00 PDT 2024
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/106557
>From bf5070f87de9df9cd7c5dffab4a8874bc1508393 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 29 Aug 2024 15:32:18 +0200
Subject: [PATCH] [AMDGPU] Promote uniform ops to i32 in GISel
GlobalISel counterpart of #106383
See #64591
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 31 +-
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 109 ++++++
llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll | 114 +++---
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 364 ++++++++++--------
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 338 +++++++++-------
llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll | 114 +++---
6 files changed, 663 insertions(+), 407 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..b3065f660d7581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,34 @@ def expand_promoted_fmed3 : GICombineRule<
} // End Predicates = [NotHasMed3_16]
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_SMIN, G_UMIN, G_SMAX, G_UMAX, G_ADD,
+ G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND,
+ G_XOR, G_OR, G_MUL],
+ (pattern (op $dst, $lhs, $rhs)))>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+ (outs root:$dst), (ins),
+ !foreach(op, [G_ICMP, G_SELECT],
+ (pattern (op $dst, $first, $lhs, $rhs)))>;
+
+let Predicates = [Has16BitInsts] in {
+def promote_i16_uniform_binops : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_binops_frag $dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+ (defs root:$dst),
+ (match (promote_i16_uniform_ternary_frag $dst):$mi,
+ [{ return matchPromote16to32(*${mi}); }]),
+ (apply [{ applyPromote16to32(*${mi}); }])
+>;
+}
+
// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
@@ -169,5 +197,6 @@ def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+ fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..d906d1ebaa5494 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
+ bool matchPromote16to32(MachineInstr &MI) const;
+ void applyPromote16to32(MachineInstr &MI) const;
+
private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
@@ -348,6 +351,112 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
return false;
}
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ const auto *RB = MRI.getRegBankOrNull(Dst);
+
+ // Only promote between 2 and 16 bits.
+ // For ICMP use the LHS of the comparison to get the type.
+ unsigned TyOpIdx = (MI.getOpcode() == AMDGPU::G_ICMP) ? 2 : 0;
+ LLT OpTy = MRI.getType(MI.getOperand(TyOpIdx).getReg());
+ if (OpTy.getScalarSizeInBits() < 2 || OpTy.getScalarSizeInBits() > 16)
+ return false;
+
+ // Only promote uniform instructions.
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ // TODO: Support vectors. Vectors will create illegal ops, such as
+ // 2x32 exts, that we'd need to legalize.
+ // We could just scalarize all vectors but then we don't respect
+ // the legalizer's rules. Ideally we should be able to call
+ // the legalizer here, or this should move into the legalizer
+ // if it can tell between uniform and non-uniform values at
+ // some point.
+ if (DstTy.isVector())
+ return false;
+
+ // Promote only if:
+ // - We have 16 bit insts (not true 16 bit insts).
+ // - This is already checked by the predicate on the combine rule.
+ // - We don't have packed instructions (for vector types only).
+ // TODO: For vector types, the set of packed operations is more limited, so
+ // may want to promote some anyway.
+ assert(STI.has16BitInsts());
+ return (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_ASHR:
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX:
+ return AMDGPU::G_SEXT;
+ case AMDGPU::G_LSHR:
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX:
+ return AMDGPU::G_ZEXT;
+ case AMDGPU::G_ADD:
+ case AMDGPU::G_SUB:
+ case AMDGPU::G_AND:
+ case AMDGPU::G_OR:
+ case AMDGPU::G_XOR:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_MUL:
+ // operation result won't be influenced by garbage high bits.
+ return AMDGPU::G_ANYEXT;
+ case AMDGPU::G_ICMP: {
+ return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+ : AMDGPU::G_ZEXT;
+ }
+ default:
+ llvm_unreachable("unexpected opcode!");
+ }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+
+ const bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+ const bool IsShift =
+ (Opc == AMDGPU::G_ASHR || Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_SHL);
+ Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+ Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+ assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID &&
+ MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID &&
+ MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+
+ const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+ LLT S32 = MRI.getType(LHS).changeElementSize(32);
+
+ B.setInstrAndDebugLoc(MI);
+ LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+ RHS = B.buildInstr(IsShift ? AMDGPU::G_ZEXT : ExtOpc, {S32}, {RHS}).getReg(0);
+
+ MRI.setRegBank(LHS, RB);
+ MRI.setRegBank(RHS, RB);
+
+ MachineInstr *NewInst;
+ if (IsSelectOrCmp)
+ NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+ else
+ NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+ if (Opc != AMDGPU::G_ICMP) {
+ Register Dst32 = NewInst->getOperand(0).getReg();
+ MRI.setRegBank(Dst32, RB);
+ B.buildTrunc(Dst, Dst32);
+ }
+
+ MI.eraseFromParent();
+}
+
void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
Register &Reg) const {
B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972e7..cfa93a0a301671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -349,63 +349,67 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
}
define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_andn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_andn2_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s0, s2, s3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s0, s3, -1
+; GFX9-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_andn2_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT: s_and_b32 s0, s2, s0
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %src0, %not.src1
ret i16 %and
}
define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16_commute:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_andn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_commute:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_andn2_i16_commute:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s0, s2, s3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s0, s3, -1
+; GFX9-NEXT: s_and_b32 s0, s0, s2
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_andn2_i16_commute:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_commute:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %not.src1, %src0
ret i16 %and
}
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_xor_b32 s1, s3, -1
-; GCN-NEXT: s_andn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_multi_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_xor_b32 s1, s3, -1
+; GFX6-NEXT: s_andn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_andn2_i16_multi_use:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s0, s2, s3
-; GFX10-NEXT: s_xor_b32 s1, s3, -1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s1, s3, -1
+; GFX9-NEXT: s_and_b32 s0, s2, s1
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_andn2_i16_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT: s_xor_b32 s1, s3, -1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_multi_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s1, s3, -1
+; GFX10PLUS-NEXT: s_and_b32 s0, s2, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %src0, %not.src1
%insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
@@ -414,23 +418,25 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
}
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GCN-LABEL: s_andn2_i16_multi_foldable_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_andn2_b32 s0, s2, s4
-; GCN-NEXT: s_andn2_b32 s1, s3, s4
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_andn2_b32 s0, s2, s4
+; GFX6-NEXT: s_andn2_b32 s1, s3, s4
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s0, s2, s4
-; GFX10-NEXT: s_andn2_b32 s1, s3, s4
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s1, s4, -1
+; GFX9-NEXT: s_and_b32 s0, s2, s1
+; GFX9-NEXT: s_and_b32 s1, s3, s1
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s0, s2, s4
-; GFX11-NEXT: s_and_not1_b32 s1, s3, s4
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s1, s4, -1
+; GFX10PLUS-NEXT: s_and_b32 s0, s2, s1
+; GFX10PLUS-NEXT: s_and_b32 s1, s3, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src2 = xor i16 %src2, -1
%and0 = and i16 %src0, %not.src2
%and1 = and i16 %src1, %not.src2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 3bd3486ec261d4..5e1e39d4cbaeec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -349,10 +349,11 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshl_i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s3, s2, 7
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -363,10 +364,11 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshl_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s3, s2, 7
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_xor_b32 s2, s2, -1
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
@@ -377,10 +379,11 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshl_i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s3, s2, 7
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_xor_b32 s2, s2, -1
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
@@ -391,10 +394,11 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshl_i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s3, s2, 7
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_xor_b32 s2, s2, -1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
@@ -699,11 +703,12 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8-LABEL: s_fshl_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s1, 8
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
; GFX8-NEXT: s_and_b32 s6, s2, 7
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -713,10 +718,11 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 7
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, s4, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
+; GFX8-NEXT: s_and_b32 s2, s4, 0xff
+; GFX8-NEXT: s_xor_b32 s3, s5, -1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s5
+; GFX8-NEXT: s_and_b32 s3, s3, 7
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
@@ -730,11 +736,12 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9-LABEL: s_fshl_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s4, s1, 8
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
; GFX9-NEXT: s_and_b32 s6, s2, 7
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_xor_b32 s2, s2, -1
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
@@ -744,10 +751,11 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s5, 7
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, s4, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
+; GFX9-NEXT: s_and_b32 s2, s4, 0xff
+; GFX9-NEXT: s_xor_b32 s3, s5, -1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s5
+; GFX9-NEXT: s_and_b32 s3, s3, 7
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
@@ -760,19 +768,21 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX10-LABEL: s_fshl_v2i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s4, s1, 8
; GFX10-NEXT: s_and_b32 s5, s2, 7
+; GFX10-NEXT: s_lshr_b32 s4, s1, 8
; GFX10-NEXT: s_lshr_b32 s6, s2, 8
; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_lshl_b32 s0, s0, s5
; GFX10-NEXT: s_and_b32 s5, s6, 7
+; GFX10-NEXT: s_and_b32 s4, s4, 0xff
+; GFX10-NEXT: s_xor_b32 s6, s6, -1
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_xor_b32 s2, s2, -1
; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT: s_andn2_b32 s6, 7, s6
+; GFX10-NEXT: s_and_b32 s6, s6, 7
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
@@ -791,19 +801,21 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX11-LABEL: s_fshl_v2i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
; GFX11-NEXT: s_and_b32 s5, s2, 7
+; GFX11-NEXT: s_lshr_b32 s4, s1, 8
; GFX11-NEXT: s_lshr_b32 s6, s2, 8
; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, s5
; GFX11-NEXT: s_and_b32 s5, s6, 7
+; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_xor_b32 s6, s6, -1
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_xor_b32 s2, s2, -1
; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s6
+; GFX11-NEXT: s_and_b32 s6, s6, 7
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
@@ -1025,13 +1037,14 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s6, s1, 8
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshr_b32 s8, s1, 24
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshr_b32 s9, s2, 8
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: s_and_b32 s12, s2, 7
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -1043,20 +1056,22 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s9, 7
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, s6, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
+; GFX8-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NEXT: s_xor_b32 s3, s9, -1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s9
+; GFX8-NEXT: s_and_b32 s3, s3, 7
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s10, 7
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_and_b32 s3, s7, 0xff
; GFX8-NEXT: s_lshl_b32 s2, s4, s2
+; GFX8-NEXT: s_and_b32 s3, s7, 0xff
+; GFX8-NEXT: s_xor_b32 s4, s10, -1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_andn2_b32 s4, 7, s10
+; GFX8-NEXT: s_and_b32 s4, s4, 7
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
@@ -1064,7 +1079,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_and_b32 s3, s11, 7
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, s3
-; GFX8-NEXT: s_andn2_b32 s5, 7, s11
+; GFX8-NEXT: s_xor_b32 s5, s11, -1
+; GFX8-NEXT: s_and_b32 s5, s5, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshr_b32 s4, s8, 1
; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
@@ -1086,13 +1102,14 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s6, s1, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_lshr_b32 s8, s1, 24
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshr_b32 s9, s2, 8
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
; GFX9-NEXT: s_and_b32 s12, s2, 7
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_xor_b32 s2, s2, -1
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
@@ -1104,20 +1121,22 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s9, 7
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, s6, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
+; GFX9-NEXT: s_and_b32 s2, s6, 0xff
+; GFX9-NEXT: s_xor_b32 s3, s9, -1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s9
+; GFX9-NEXT: s_and_b32 s3, s3, 7
; GFX9-NEXT: s_lshr_b32 s2, s2, 1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s10, 7
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_and_b32 s3, s7, 0xff
; GFX9-NEXT: s_lshl_b32 s2, s4, s2
+; GFX9-NEXT: s_and_b32 s3, s7, 0xff
+; GFX9-NEXT: s_xor_b32 s4, s10, -1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_andn2_b32 s4, 7, s10
+; GFX9-NEXT: s_and_b32 s4, s4, 7
; GFX9-NEXT: s_lshr_b32 s3, s3, 1
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_lshr_b32 s3, s3, s4
@@ -1125,7 +1144,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_and_b32 s3, s11, 7
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: s_andn2_b32 s5, 7, s11
+; GFX9-NEXT: s_xor_b32 s5, s11, -1
+; GFX9-NEXT: s_and_b32 s5, s5, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshr_b32 s4, s8, 1
; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
@@ -1147,58 +1167,62 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s6, s1, 8
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_lshr_b32 s9, s2, 8
+; GFX10-NEXT: s_lshr_b32 s10, s2, 16
; GFX10-NEXT: s_and_b32 s11, s2, 7
+; GFX10-NEXT: s_lshr_b32 s12, s2, 24
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_xor_b32 s2, s2, -1
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_andn2_b32 s12, 7, s2
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
-; GFX10-NEXT: s_lshr_b32 s9, s2, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, s11
-; GFX10-NEXT: s_lshr_b32 s1, s1, s12
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_and_b32 s2, s9, 7
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_and_b32 s1, s9, 7
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX10-NEXT: s_and_b32 s2, s6, 0xff
+; GFX10-NEXT: s_xor_b32 s6, s9, -1
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_and_b32 s6, s6, 7
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_andn2_b32 s9, 7, s9
-; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_lshr_b32 s6, s6, 1
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
; GFX10-NEXT: s_lshl_b32 s1, s3, s1
-; GFX10-NEXT: s_lshr_b32 s3, s6, s9
-; GFX10-NEXT: s_and_b32 s6, s10, 7
-; GFX10-NEXT: s_or_b32 s1, s1, s3
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s6, s7, 0xff
-; GFX10-NEXT: s_lshr_b32 s2, s2, 24
-; GFX10-NEXT: s_lshl_b32 s3, s4, s3
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshr_b32 s4, s4, 1
+; GFX10-NEXT: s_lshr_b32 s2, s2, s6
+; GFX10-NEXT: s_and_b32 s3, s10, 7
+; GFX10-NEXT: s_or_b32 s1, s1, s2
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX10-NEXT: s_and_b32 s3, s7, 0xff
+; GFX10-NEXT: s_xor_b32 s6, s10, -1
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: s_and_b32 s6, s6, 7
+; GFX10-NEXT: s_lshr_b32 s3, s3, 1
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s7, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX10-NEXT: s_lshl_b32 s2, s4, s2
+; GFX10-NEXT: s_lshr_b32 s3, s3, s6
+; GFX10-NEXT: s_xor_b32 s6, s12, -1
+; GFX10-NEXT: s_and_b32 s4, s12, 7
+; GFX10-NEXT: s_and_b32 s6, s6, 7
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: s_lshr_b32 s7, s8, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s5, s5, s6
-; GFX10-NEXT: s_lshr_b32 s2, s7, s2
-; GFX10-NEXT: s_or_b32 s3, s3, s4
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshl_b32 s4, s5, s4
+; GFX10-NEXT: s_lshr_b32 s5, s7, s6
+; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_or_b32 s2, s5, s2
+; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s3, s3, 0xff
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s3, 16
; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: s_lshl_b32 s1, s2, 16
+; GFX10-NEXT: s_and_b32 s2, s3, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_lshl_b32 s1, s2, 24
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
@@ -1208,58 +1232,62 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_lshr_b32 s9, s2, 8
+; GFX11-NEXT: s_lshr_b32 s10, s2, 16
; GFX11-NEXT: s_and_b32 s11, s2, 7
+; GFX11-NEXT: s_lshr_b32 s12, s2, 24
+; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_xor_b32 s2, s2, -1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s9, s2, 8
; GFX11-NEXT: s_lshl_b32 s0, s0, s11
-; GFX11-NEXT: s_lshr_b32 s1, s1, s12
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_and_b32 s2, s9, 7
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_and_b32 s1, s9, 7
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s2, s6, 0xff
+; GFX11-NEXT: s_xor_b32 s6, s9, -1
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s6, s6, 7
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_lshr_b32 s6, s6, 1
-; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
; GFX11-NEXT: s_lshl_b32 s1, s3, s1
-; GFX11-NEXT: s_lshr_b32 s3, s6, s9
-; GFX11-NEXT: s_and_b32 s6, s10, 7
-; GFX11-NEXT: s_or_b32 s1, s1, s3
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s6, s7, 0xff
-; GFX11-NEXT: s_lshr_b32 s2, s2, 24
-; GFX11-NEXT: s_lshl_b32 s3, s4, s3
-; GFX11-NEXT: s_and_b32 s4, 0xffff, s6
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_lshr_b32 s2, s2, s6
+; GFX11-NEXT: s_and_b32 s3, s10, 7
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX11-NEXT: s_and_b32 s3, s7, 0xff
+; GFX11-NEXT: s_xor_b32 s6, s10, -1
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT: s_and_b32 s6, s6, 7
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s7, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s7
+; GFX11-NEXT: s_lshl_b32 s2, s4, s2
+; GFX11-NEXT: s_lshr_b32 s3, s3, s6
+; GFX11-NEXT: s_xor_b32 s6, s12, -1
+; GFX11-NEXT: s_and_b32 s4, s12, 7
+; GFX11-NEXT: s_and_b32 s6, s6, 7
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
; GFX11-NEXT: s_lshr_b32 s7, s8, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s5, s5, s6
-; GFX11-NEXT: s_lshr_b32 s2, s7, s2
-; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_lshl_b32 s4, s5, s4
+; GFX11-NEXT: s_lshr_b32 s5, s7, s6
+; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_or_b32 s2, s5, s2
+; GFX11-NEXT: s_or_b32 s3, s4, s5
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s3, s3, 0xff
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s3, 16
; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: s_lshl_b32 s1, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s3, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, 24
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -3336,7 +3364,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-LABEL: s_fshl_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s3, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
@@ -3349,7 +3378,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-LABEL: s_fshl_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s2, 15
-; GFX9-NEXT: s_andn2_b32 s2, 15, s2
+; GFX9-NEXT: s_xor_b32 s2, s2, -1
+; GFX9-NEXT: s_and_b32 s2, s2, 15
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
@@ -3361,27 +3391,29 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX10-LABEL: s_fshl_i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s3, s2, 15
-; GFX10-NEXT: s_andn2_b32 s2, 15, s2
+; GFX10-NEXT: s_xor_b32 s3, s2, -1
+; GFX10-NEXT: s_and_b32 s2, s2, 15
+; GFX10-NEXT: s_and_b32 s3, s3, 15
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s0, s0, s3
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s3, s2, 15
-; GFX11-NEXT: s_and_not1_b32 s2, 15, s2
+; GFX11-NEXT: s_xor_b32 s3, s2, -1
+; GFX11-NEXT: s_and_b32 s2, s2, 15
+; GFX11-NEXT: s_and_b32 s3, s3, 15
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s0, s0, s3
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT: s_lshl_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s1, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -3720,7 +3752,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX8-LABEL: v_fshl_i16_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s2, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
@@ -3731,7 +3764,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX9-LABEL: v_fshl_i16_svs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 15
-; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
+; GFX9-NEXT: s_and_b32 s1, s1, 15
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
@@ -3742,8 +3776,9 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX10-LABEL: v_fshl_i16_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshrrev_b16 v0, 1, v0
-; GFX10-NEXT: s_andn2_b32 s2, 15, s1
+; GFX10-NEXT: s_xor_b32 s2, s1, -1
; GFX10-NEXT: s_and_b32 s1, s1, 15
+; GFX10-NEXT: s_and_b32 s2, s2, 15
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
@@ -3753,9 +3788,9 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX11-LABEL: v_fshl_i16_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshrrev_b16 v0, 1, v0
-; GFX11-NEXT: s_and_not1_b32 s2, 15, s1
+; GFX11-NEXT: s_xor_b32 s2, s1, -1
; GFX11-NEXT: s_and_b32 s1, s1, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s2, s2, 15
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX11-NEXT: s_lshl_b32 s0, s0, s1
@@ -3783,7 +3818,8 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-LABEL: v_fshl_i16_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s2, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -3795,7 +3831,8 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-LABEL: v_fshl_i16_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 15
-; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
+; GFX9-NEXT: s_and_b32 s1, s1, 15
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: s_lshr_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
@@ -3806,24 +3843,26 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX10-LABEL: v_fshl_i16_vss:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s2, s1, 15
-; GFX10-NEXT: s_andn2_b32 s1, 15, s1
+; GFX10-NEXT: s_xor_b32 s2, s1, -1
+; GFX10-NEXT: s_and_b32 s1, s1, 15
+; GFX10-NEXT: s_and_b32 s2, s2, 15
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT: v_lshlrev_b16 v0, s1, v0
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i16_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s2, s1, 15
-; GFX11-NEXT: s_and_not1_b32 s1, 15, s1
+; GFX11-NEXT: s_xor_b32 s2, s1, -1
+; GFX11-NEXT: s_and_b32 s1, s1, 15
+; GFX11-NEXT: s_and_b32 s2, s2, 15
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT: v_lshlrev_b16 v0, s1, v0
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3861,10 +3900,11 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_lshr_b32 s5, s2, 16
; GFX8-NEXT: s_and_b32 s6, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_lshr_b32 s4, s1, 16
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
@@ -3874,7 +3914,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
+; GFX8-NEXT: s_xor_b32 s2, s5, -1
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
; GFX8-NEXT: s_lshr_b32 s3, s4, 1
@@ -4242,9 +4283,10 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
;
; GFX8-LABEL: v_fshl_v2i16_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s4, s1, 15
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_and_b32 s4, s1, 15
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
@@ -4252,8 +4294,9 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
+; GFX8-NEXT: s_xor_b32 s1, s3, -1
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_andn2_b32 s1, 15, s3
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_lshl_b32 s0, s2, s0
@@ -4339,18 +4382,20 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
;
; GFX8-LABEL: v_fshl_v2i16_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s4, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: s_xor_b32 s1, s3, -1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s3
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4461,10 +4506,11 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
;
; GFX8-LABEL: s_fshl_v3i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_lshr_b32 s7, s2, 16
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
@@ -4474,7 +4520,8 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
+; GFX8-NEXT: s_xor_b32 s4, s8, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_lshr_b32 s6, s7, 1
@@ -4482,7 +4529,8 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, s5, 15
-; GFX8-NEXT: s_andn2_b32 s5, 15, s5
+; GFX8-NEXT: s_xor_b32 s5, s5, -1
+; GFX8-NEXT: s_and_b32 s5, s5, 15
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
@@ -4818,10 +4866,11 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
;
; GFX8-LABEL: s_fshl_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s2, 16
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
; GFX8-NEXT: s_and_b32 s12, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_lshr_b32 s8, s2, 16
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
@@ -4831,17 +4880,19 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_lshr_b32 s2, s2, s4
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s10, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s10
+; GFX8-NEXT: s_xor_b32 s4, s10, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_lshr_b32 s6, s8, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s11, s5, 16
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s4, s5, 15
+; GFX8-NEXT: s_xor_b32 s5, s5, -1
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
-; GFX8-NEXT: s_andn2_b32 s5, 15, s5
+; GFX8-NEXT: s_and_b32 s5, s5, 15
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
@@ -4849,9 +4900,10 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_xor_b32 s4, s11, -1
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s3, s11, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s11
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s5, s9, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 58304d2072d7f6..1ac89c97dec37b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -347,7 +347,8 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshr_i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_andn2_b32 s3, 7, s2
+; GFX8-NEXT: s_xor_b32 s3, s2, -1
+; GFX8-NEXT: s_and_b32 s3, s3, 7
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
@@ -361,7 +362,8 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshr_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_andn2_b32 s3, 7, s2
+; GFX9-NEXT: s_xor_b32 s3, s2, -1
+; GFX9-NEXT: s_and_b32 s3, s3, 7
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
@@ -375,8 +377,9 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshr_i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s3, 7, s2
+; GFX10-NEXT: s_xor_b32 s3, s2, -1
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_and_b32 s3, s3, 7
; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
@@ -389,8 +392,9 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshr_i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s3, 7, s2
+; GFX11-NEXT: s_xor_b32 s3, s2, -1
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_and_b32 s3, s3, 7
; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
@@ -697,17 +701,19 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s1, 8
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
-; GFX8-NEXT: s_andn2_b32 s6, 7, s2
+; GFX8-NEXT: s_xor_b32 s6, s2, -1
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_and_b32 s6, s6, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_andn2_b32 s2, 7, s5
+; GFX8-NEXT: s_xor_b32 s2, s5, -1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s6
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s3, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -728,17 +734,19 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s4, s1, 8
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
-; GFX9-NEXT: s_andn2_b32 s6, 7, s2
+; GFX9-NEXT: s_xor_b32 s6, s2, -1
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s6, s6, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_lshl_b32 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_andn2_b32 s2, 7, s5
+; GFX9-NEXT: s_xor_b32 s2, s5, -1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s6
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_lshl_b32 s1, s3, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
@@ -757,26 +765,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX10-LABEL: s_fshr_v2i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b32 s5, 7, s2
+; GFX10-NEXT: s_xor_b32 s4, s2, -1
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s4, s1, 8
+; GFX10-NEXT: s_and_b32 s4, s4, 7
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: s_lshr_b32 s6, s2, 8
-; GFX10-NEXT: s_lshl_b32 s0, s0, s5
-; GFX10-NEXT: s_andn2_b32 s5, 7, s6
-; GFX10-NEXT: s_and_b32 s4, s4, 0xff
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_xor_b32 s4, s6, -1
+; GFX10-NEXT: s_and_b32 s5, s5, 0xff
+; GFX10-NEXT: s_and_b32 s4, s4, 7
; GFX10-NEXT: s_and_b32 s6, s6, 7
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_lshl_b32 s3, s3, 1
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_lshl_b32 s3, s3, s5
-; GFX10-NEXT: s_lshr_b32 s4, s4, s6
+; GFX10-NEXT: s_lshl_b32 s3, s3, s4
+; GFX10-NEXT: s_lshr_b32 s4, s5, s6
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s2, s3, s4
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -788,26 +798,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX11-LABEL: s_fshr_v2i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s2
+; GFX11-NEXT: s_xor_b32 s4, s2, -1
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-NEXT: s_and_b32 s4, s4, 7
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
; GFX11-NEXT: s_lshr_b32 s6, s2, 8
-; GFX11-NEXT: s_lshl_b32 s0, s0, s5
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s6
-; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_lshr_b32 s5, s1, 8
+; GFX11-NEXT: s_lshl_b32 s0, s0, s4
+; GFX11-NEXT: s_xor_b32 s4, s6, -1
+; GFX11-NEXT: s_and_b32 s5, s5, 0xff
+; GFX11-NEXT: s_and_b32 s4, s4, 7
; GFX11-NEXT: s_and_b32 s6, s6, 7
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_lshl_b32 s3, s3, 1
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_lshl_b32 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s4, s4, s6
+; GFX11-NEXT: s_lshl_b32 s3, s3, s4
+; GFX11-NEXT: s_lshr_b32 s4, s5, s6
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_or_b32 s2, s3, s4
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -1027,19 +1039,21 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s9, s2, 8
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
-; GFX8-NEXT: s_andn2_b32 s12, 7, s2
+; GFX8-NEXT: s_xor_b32 s12, s2, -1
; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_and_b32 s12, s12, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_lshl_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
-; GFX8-NEXT: s_andn2_b32 s2, 7, s9
+; GFX8-NEXT: s_xor_b32 s2, s9, -1
+; GFX8-NEXT: s_lshl_b32 s0, s0, s12
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s3, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
@@ -1049,7 +1063,8 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s2, s3, s2
-; GFX8-NEXT: s_andn2_b32 s3, 7, s10
+; GFX8-NEXT: s_xor_b32 s3, s10, -1
+; GFX8-NEXT: s_and_b32 s3, s3, 7
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_lshl_b32 s2, s4, 1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
@@ -1059,7 +1074,8 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s3, s4, s3
-; GFX8-NEXT: s_andn2_b32 s4, 7, s11
+; GFX8-NEXT: s_xor_b32 s4, s11, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 7
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
@@ -1088,19 +1104,21 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s9, s2, 8
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
-; GFX9-NEXT: s_andn2_b32 s12, 7, s2
+; GFX9-NEXT: s_xor_b32 s12, s2, -1
; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s12, s12, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT: s_lshl_b32 s0, s0, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
-; GFX9-NEXT: s_andn2_b32 s2, 7, s9
+; GFX9-NEXT: s_xor_b32 s2, s9, -1
+; GFX9-NEXT: s_lshl_b32 s0, s0, s12
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_lshl_b32 s1, s3, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
@@ -1110,7 +1128,8 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s2, s3, s2
-; GFX9-NEXT: s_andn2_b32 s3, 7, s10
+; GFX9-NEXT: s_xor_b32 s3, s10, -1
+; GFX9-NEXT: s_and_b32 s3, s3, 7
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s4, 1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
@@ -1120,7 +1139,8 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s3, s4, s3
-; GFX9-NEXT: s_andn2_b32 s4, 7, s11
+; GFX9-NEXT: s_xor_b32 s4, s11, -1
+; GFX9-NEXT: s_and_b32 s4, s4, 7
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_lshl_b32 s3, s5, 1
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
@@ -1143,36 +1163,39 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX10-LABEL: s_fshr_v4i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_xor_b32 s11, s2, -1
; GFX10-NEXT: s_lshr_b32 s6, s1, 8
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_lshr_b32 s9, s2, 8
; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_lshr_b32 s11, s2, 24
-; GFX10-NEXT: s_andn2_b32 s12, 7, s2
+; GFX10-NEXT: s_lshr_b32 s12, s2, 24
+; GFX10-NEXT: s_and_b32 s11, s11, 7
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_and_b32 s2, s2, 7
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_andn2_b32 s2, 7, s9
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
-; GFX10-NEXT: s_and_b32 s9, s9, 7
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, 1
+; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s2, s3, s2
-; GFX10-NEXT: s_lshr_b32 s3, s6, s9
+; GFX10-NEXT: s_lshl_b32 s0, s0, s11
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_xor_b32 s2, s9, -1
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s2, s3
-; GFX10-NEXT: s_andn2_b32 s2, 7, s10
+; GFX10-NEXT: s_and_b32 s1, s2, 7
+; GFX10-NEXT: s_lshl_b32 s2, s3, 1
+; GFX10-NEXT: s_and_b32 s3, s6, 0xff
+; GFX10-NEXT: s_and_b32 s6, s9, 7
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshl_b32 s1, s2, s1
+; GFX10-NEXT: s_lshr_b32 s2, s3, s6
+; GFX10-NEXT: s_xor_b32 s3, s10, -1
+; GFX10-NEXT: s_or_b32 s1, s1, s2
+; GFX10-NEXT: s_and_b32 s2, s3, 7
; GFX10-NEXT: s_lshl_b32 s3, s4, 1
; GFX10-NEXT: s_and_b32 s4, s7, 0xff
; GFX10-NEXT: s_and_b32 s6, s10, 7
@@ -1181,12 +1204,13 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
; GFX10-NEXT: s_lshl_b32 s2, s3, s2
; GFX10-NEXT: s_lshr_b32 s3, s4, s6
-; GFX10-NEXT: s_lshl_b32 s4, s5, 1
-; GFX10-NEXT: s_andn2_b32 s5, 7, s11
-; GFX10-NEXT: s_and_b32 s6, s11, 7
-; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_xor_b32 s4, s12, -1
+; GFX10-NEXT: s_and_b32 s6, s12, 7
+; GFX10-NEXT: s_and_b32 s4, s4, 7
+; GFX10-NEXT: s_lshl_b32 s5, s5, 1
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_lshl_b32 s4, s4, s5
+; GFX10-NEXT: s_lshl_b32 s4, s5, s4
; GFX10-NEXT: s_lshr_b32 s5, s8, s6
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
@@ -1204,36 +1228,39 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX11-LABEL: s_fshr_v4i8:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_xor_b32 s11, s2, -1
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
; GFX11-NEXT: s_lshr_b32 s9, s2, 8
; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_lshr_b32 s11, s2, 24
-; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
+; GFX11-NEXT: s_lshr_b32 s12, s2, 24
+; GFX11-NEXT: s_and_b32 s11, s11, 7
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_and_b32 s2, s2, 7
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s9
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
-; GFX11-NEXT: s_and_b32 s9, s9, 7
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, 1
+; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
-; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s2, s3, s2
-; GFX11-NEXT: s_lshr_b32 s3, s6, s9
+; GFX11-NEXT: s_lshl_b32 s0, s0, s11
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_xor_b32 s2, s9, -1
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s10
+; GFX11-NEXT: s_and_b32 s1, s2, 7
+; GFX11-NEXT: s_lshl_b32 s2, s3, 1
+; GFX11-NEXT: s_and_b32 s3, s6, 0xff
+; GFX11-NEXT: s_and_b32 s6, s9, 7
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_lshl_b32 s1, s2, s1
+; GFX11-NEXT: s_lshr_b32 s2, s3, s6
+; GFX11-NEXT: s_xor_b32 s3, s10, -1
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_and_b32 s2, s3, 7
; GFX11-NEXT: s_lshl_b32 s3, s4, 1
; GFX11-NEXT: s_and_b32 s4, s7, 0xff
; GFX11-NEXT: s_and_b32 s6, s10, 7
@@ -1242,12 +1269,13 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
; GFX11-NEXT: s_lshl_b32 s2, s3, s2
; GFX11-NEXT: s_lshr_b32 s3, s4, s6
-; GFX11-NEXT: s_lshl_b32 s4, s5, 1
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s11
-; GFX11-NEXT: s_and_b32 s6, s11, 7
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_xor_b32 s4, s12, -1
+; GFX11-NEXT: s_and_b32 s6, s12, 7
+; GFX11-NEXT: s_and_b32 s4, s4, 7
+; GFX11-NEXT: s_lshl_b32 s5, s5, 1
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_lshl_b32 s4, s4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s5, s4
; GFX11-NEXT: s_lshr_b32 s5, s8, s6
; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
@@ -3080,7 +3108,8 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-LABEL: s_fshr_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s3, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
@@ -3093,7 +3122,8 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-LABEL: s_fshr_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s2, 15
-; GFX9-NEXT: s_andn2_b32 s2, 15, s2
+; GFX9-NEXT: s_xor_b32 s2, s2, -1
+; GFX9-NEXT: s_and_b32 s2, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
@@ -3105,27 +3135,29 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX10-LABEL: s_fshr_i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s3, s2, 15
-; GFX10-NEXT: s_andn2_b32 s2, 15, s2
+; GFX10-NEXT: s_xor_b32 s3, s2, -1
+; GFX10-NEXT: s_and_b32 s2, s2, 15
+; GFX10-NEXT: s_and_b32 s3, s3, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s3, s2, 15
-; GFX11-NEXT: s_and_not1_b32 s2, 15, s2
+; GFX11-NEXT: s_xor_b32 s3, s2, -1
+; GFX11-NEXT: s_and_b32 s2, s2, 15
+; GFX11-NEXT: s_and_b32 s3, s3, 15
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT: s_lshl_b32 s0, s0, s2
-; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -3462,7 +3494,8 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX8-LABEL: v_fshr_i16_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s2, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
@@ -3473,7 +3506,8 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX9-LABEL: v_fshr_i16_svs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 15
-; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
+; GFX9-NEXT: s_and_b32 s1, s1, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
@@ -3483,22 +3517,24 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
;
; GFX10-LABEL: v_fshr_i16_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s2, s1, 15
-; GFX10-NEXT: s_andn2_b32 s1, 15, s1
-; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
+; GFX10-NEXT: s_xor_b32 s2, s1, -1
+; GFX10-NEXT: s_and_b32 s1, s1, 15
+; GFX10-NEXT: s_and_b32 s2, s2, 15
+; GFX10-NEXT: v_lshrrev_b16 v0, s1, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i16_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s2, s1, 15
-; GFX11-NEXT: s_and_not1_b32 s1, 15, s1
-; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0
+; GFX11-NEXT: s_xor_b32 s2, s1, -1
+; GFX11-NEXT: s_and_b32 s1, s1, 15
+; GFX11-NEXT: s_and_b32 s2, s2, 15
+; GFX11-NEXT: v_lshrrev_b16 v0, s1, v0
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3526,7 +3562,8 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX8-LABEL: v_fshr_i16_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s2, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3538,7 +3575,8 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX9-LABEL: v_fshr_i16_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 15
-; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, -1
+; GFX9-NEXT: s_and_b32 s1, s1, 15
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
@@ -3550,11 +3588,12 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX10-LABEL: v_fshr_i16_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: s_andn2_b32 s2, 15, s1
+; GFX10-NEXT: s_xor_b32 s2, s1, -1
; GFX10-NEXT: s_and_b32 s1, s1, 15
+; GFX10-NEXT: s_and_b32 s2, s2, 15
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshr_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
@@ -3562,11 +3601,13 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX11-LABEL: v_fshr_i16_vss:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX11-NEXT: s_and_not1_b32 s2, 15, s1
+; GFX11-NEXT: s_xor_b32 s2, s1, -1
; GFX11-NEXT: s_and_b32 s1, s1, 15
+; GFX11-NEXT: s_and_b32 s2, s2, 15
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
@@ -3626,12 +3667,13 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_or_b32 s0, s0, s5
; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_lshr_b32 s5, s4, 15
-; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_xor_b32 s2, s2, -1
; GFX8-NEXT: s_or_b32 s3, s3, s5
+; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_lshr_b32 s5, s2, 16
; GFX8-NEXT: s_and_b32 s6, s2, 15
-; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
@@ -3641,8 +3683,9 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 15
; GFX8-NEXT: s_lshl_b32 s4, s4, 1
+; GFX8-NEXT: s_xor_b32 s2, s5, -1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_andn2_b32 s2, 15, s5
+; GFX8-NEXT: s_and_b32 s2, s2, 15
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
@@ -4076,17 +4119,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
; GFX8-NEXT: s_lshl_b32 s0, s2, 1
; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 1
; GFX8-NEXT: s_xor_b32 s0, s1, -1
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s2, s0, 15
-; GFX8-NEXT: s_andn2_b32 s0, 15, s0
+; GFX8-NEXT: s_xor_b32 s0, s0, -1
+; GFX8-NEXT: v_mov_b32_e32 v4, 1
+; GFX8-NEXT: s_and_b32 s0, s0, 15
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_lshrrev_b16_e32 v3, s0, v3
; GFX8-NEXT: s_and_b32 s0, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0
@@ -4205,12 +4250,13 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_lshr_b32 s3, s2, 15
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_xor_b32 s1, s1, -1
; GFX8-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s4, s1, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_xor_b32 s1, s1, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
@@ -4219,7 +4265,8 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: v_or_b32_e32 v1, s0, v1
; GFX8-NEXT: s_and_b32 s0, s3, 15
-; GFX8-NEXT: s_andn2_b32 s1, 15, s3
+; GFX8-NEXT: s_xor_b32 s1, s3, -1
+; GFX8-NEXT: s_and_b32 s1, s1, 15
; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0
; GFX8-NEXT: s_and_b32 s0, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s0, s0, 1
@@ -4349,12 +4396,13 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_or_b32 s0, s0, s8
; GFX8-NEXT: s_lshl_b32 s6, s6, 1
; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_xor_b32 s4, s4, -1
; GFX8-NEXT: s_or_b32 s6, s6, s8
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
@@ -4364,8 +4412,9 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
; GFX8-NEXT: s_lshl_b32 s7, s7, 1
+; GFX8-NEXT: s_xor_b32 s4, s8, -1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
; GFX8-NEXT: s_lshr_b32 s6, s6, 1
@@ -4376,10 +4425,11 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
; GFX8-NEXT: s_lshl_b32 s1, s1, 1
; GFX8-NEXT: s_lshr_b32 s4, s4, 15
; GFX8-NEXT: s_or_b32 s1, s1, s4
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_xor_b32 s4, s5, -1
+; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_and_b32 s5, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
@@ -4785,12 +4835,13 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_or_b32 s0, s0, s8
; GFX8-NEXT: s_lshl_b32 s6, s6, 1
; GFX8-NEXT: s_lshr_b32 s8, s7, 15
-; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_xor_b32 s4, s4, -1
; GFX8-NEXT: s_or_b32 s6, s6, s8
+; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_lshr_b32 s8, s4, 16
; GFX8-NEXT: s_and_b32 s9, s4, 15
-; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_and_b32 s9, 0xffff, s9
; GFX8-NEXT: s_lshr_b32 s2, s2, 1
@@ -4800,8 +4851,9 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_or_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s2, s8, 15
; GFX8-NEXT: s_lshl_b32 s7, s7, 1
+; GFX8-NEXT: s_xor_b32 s4, s8, -1
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT: s_andn2_b32 s4, 15, s8
+; GFX8-NEXT: s_and_b32 s4, s4, 15
; GFX8-NEXT: s_lshl_b32 s2, s6, s2
; GFX8-NEXT: s_and_b32 s6, 0xffff, s7
; GFX8-NEXT: s_lshr_b32 s6, s6, 1
@@ -4820,12 +4872,13 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_or_b32 s1, s1, s6
; GFX8-NEXT: s_lshl_b32 s2, s2, 1
; GFX8-NEXT: s_lshr_b32 s6, s4, 15
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_xor_b32 s5, s5, -1
; GFX8-NEXT: s_or_b32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b32 s3, s3, 1
; GFX8-NEXT: s_lshr_b32 s6, s5, 16
; GFX8-NEXT: s_and_b32 s7, s5, 15
-; GFX8-NEXT: s_andn2_b32 s5, 15, s5
+; GFX8-NEXT: s_xor_b32 s5, s5, -1
+; GFX8-NEXT: s_and_b32 s5, s5, 15
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_and_b32 s7, 0xffff, s7
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
@@ -4835,8 +4888,9 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
; GFX8-NEXT: s_or_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s3, s6, 15
; GFX8-NEXT: s_lshl_b32 s4, s4, 1
+; GFX8-NEXT: s_xor_b32 s5, s6, -1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_andn2_b32 s5, 15, s6
+; GFX8-NEXT: s_and_b32 s5, s5, 15
; GFX8-NEXT: s_lshl_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
; GFX8-NEXT: s_lshr_b32 s3, s3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06cd..2e58696518a9f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -349,63 +349,67 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
}
define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_orn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_orn2_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_orn2_b32 s0, s2, s3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s0, s3, -1
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_orn2_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT: s_or_b32 s0, s2, s0
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%or = or i16 %src0, %not.src1
ret i16 %or
}
define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16_commute:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_orn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_commute:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_orn2_i16_commute:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_orn2_b32 s0, s2, s3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s0, s3, -1
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_orn2_i16_commute:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_commute:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%or = or i16 %not.src1, %src0
ret i16 %or
}
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16_multi_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_xor_b32 s1, s3, -1
-; GCN-NEXT: s_orn2_b32 s0, s2, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_multi_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_xor_b32 s1, s3, -1
+; GFX6-NEXT: s_orn2_b32 s0, s2, s3
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_orn2_i16_multi_use:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_orn2_b32 s0, s2, s3
-; GFX10-NEXT: s_xor_b32 s1, s3, -1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s1, s3, -1
+; GFX9-NEXT: s_or_b32 s0, s2, s1
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_orn2_i16_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT: s_xor_b32 s1, s3, -1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_multi_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s1, s3, -1
+; GFX10PLUS-NEXT: s_or_b32 s0, s2, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%or = or i16 %src0, %not.src1
%insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0
@@ -414,23 +418,25 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
}
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GCN-LABEL: s_orn2_i16_multi_foldable_use:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_orn2_b32 s0, s2, s4
-; GCN-NEXT: s_orn2_b32 s1, s3, s4
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_orn2_b32 s0, s2, s4
+; GFX6-NEXT: s_orn2_b32 s1, s3, s4
+; GFX6-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_orn2_b32 s0, s2, s4
-; GFX10-NEXT: s_orn2_b32 s1, s3, s4
-; GFX10-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_xor_b32 s1, s4, -1
+; GFX9-NEXT: s_or_b32 s0, s2, s1
+; GFX9-NEXT: s_or_b32 s1, s3, s1
+; GFX9-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_or_not1_b32 s0, s2, s4
-; GFX11-NEXT: s_or_not1_b32 s1, s3, s4
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_xor_b32 s1, s4, -1
+; GFX10PLUS-NEXT: s_or_b32 s0, s2, s1
+; GFX10PLUS-NEXT: s_or_b32 s1, s3, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src2 = xor i16 %src2, -1
%or0 = or i16 %src0, %not.src2
%or1 = or i16 %src1, %not.src2
More information about the llvm-commits
mailing list