[llvm] ba4bcce - [GlobalIsel] Combine trunc of binop (#107721)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 11 06:05:00 PDT 2024
Author: Thorsten Schütt
Date: 2024-09-11T15:04:55+02:00
New Revision: ba4bcce5f5ffa9e7d4af72c20fe4f1baf97075fc
URL: https://github.com/llvm/llvm-project/commit/ba4bcce5f5ffa9e7d4af72c20fe4f1baf97075fc
DIFF: https://github.com/llvm/llvm-project/commit/ba4bcce5f5ffa9e7d4af72c20fe4f1baf97075fc.diff
LOG: [GlobalIsel] Combine trunc of binop (#107721)
trunc (binop X, C) --> binop (trunc X, trunc C) --> binop (trunc X, C')
Try to narrow the width of math or bitwise logic instructions by pulling
a truncate ahead of binary operators.
Vx and Nx cores consider 32-bit and 64-bit basic arithmetic equal in
cost.
Added:
llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
llvm/test/CodeGen/AMDGPU/constrained-shift.ll
llvm/test/CodeGen/AMDGPU/ctlz.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 9b62d6067be39c..828532dcffb7d3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -831,6 +831,12 @@ class CombinerHelper {
/// Combine ors.
bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// trunc (binop X, C) --> binop (trunc X, trunc C).
+ bool matchNarrowBinop(const MachineInstr &TruncMI,
+ const MachineInstr &BinopMI, BuildFnTy &MatchInfo);
+
+ bool matchCastOfInteger(const MachineInstr &CastMI, APInt &MatchInfo);
+
/// Combine addos.
bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 525cc815e73cef..a595a51d7b01ff 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1867,6 +1867,33 @@ class buildvector_of_opcode<Instruction castOpcode> : GICombineRule <
def buildvector_of_truncate : buildvector_of_opcode<G_TRUNC>;
+// narrow binop.
+// trunc (binop X, C) --> binop (trunc X, trunc C)
+class narrow_binop_opcode<Instruction binopOpcode> : GICombineRule <
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_CONSTANT $const, $imm),
+ (binopOpcode $binop, $x, $const):$Binop,
+ (G_TRUNC $root, $binop):$Trunc,
+ [{ return Helper.matchNarrowBinop(*${Trunc}, *${Binop}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${Trunc}, ${matchinfo}); }])>;
+
+def narrow_binop_add : narrow_binop_opcode<G_ADD>;
+def narrow_binop_sub : narrow_binop_opcode<G_SUB>;
+def narrow_binop_mul : narrow_binop_opcode<G_MUL>;
+def narrow_binop_and : narrow_binop_opcode<G_AND>;
+def narrow_binop_or : narrow_binop_opcode<G_OR>;
+def narrow_binop_xor : narrow_binop_opcode<G_XOR>;
+
+// Cast of integer.
+class integer_of_opcode<Instruction castOpcode> : GICombineRule <
+ (defs root:$root, apint_matchinfo:$matchinfo),
+ (match (G_CONSTANT $int, $imm),
+ (castOpcode $root, $int):$Cast,
+ [{ return Helper.matchCastOfInteger(*${Cast}, ${matchinfo}); }]),
+ (apply [{ Helper.replaceInstWithConstant(*${Cast}, ${matchinfo}); }])>;
+
+def integer_of_truncate : integer_of_opcode<G_TRUNC>;
+
def cast_combines: GICombineGroup<[
truncate_of_zext,
truncate_of_sext,
@@ -1881,7 +1908,14 @@ def cast_combines: GICombineGroup<[
anyext_of_anyext,
anyext_of_zext,
anyext_of_sext,
- buildvector_of_truncate
+ buildvector_of_truncate,
+ narrow_binop_add,
+ narrow_binop_sub,
+ narrow_binop_mul,
+ narrow_binop_and,
+ narrow_binop_or,
+ narrow_binop_xor,
+ integer_of_truncate
]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 8714fdabf65494..30557e6a2304e6 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -313,3 +313,49 @@ bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI,
return true;
}
+
+bool CombinerHelper::matchNarrowBinop(const MachineInstr &TruncMI,
+ const MachineInstr &BinopMI,
+ BuildFnTy &MatchInfo) {
+ const GTrunc *Trunc = cast<GTrunc>(&TruncMI);
+ const GBinOp *BinOp = cast<GBinOp>(&BinopMI);
+
+ if (!MRI.hasOneNonDBGUse(BinOp->getReg(0)))
+ return false;
+
+ Register Dst = Trunc->getReg(0);
+ LLT DstTy = MRI.getType(Dst);
+
+ // Is narrow binop legal?
+ if (!isLegalOrBeforeLegalizer({BinOp->getOpcode(), {DstTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto LHS = B.buildTrunc(DstTy, BinOp->getLHSReg());
+ auto RHS = B.buildTrunc(DstTy, BinOp->getRHSReg());
+ B.buildInstr(BinOp->getOpcode(), {Dst}, {LHS, RHS});
+ };
+
+ return true;
+}
+
+bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI,
+ APInt &MatchInfo) {
+ const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+
+ APInt Input = getIConstantFromReg(Cast->getSrcReg(), MRI);
+
+ LLT DstTy = MRI.getType(Cast->getReg(0));
+
+ if (!isConstantLegalOrBeforeLegalizer(DstTy))
+ return false;
+
+ switch (Cast->getOpcode()) {
+ case TargetOpcode::G_TRUNC: {
+ MatchInfo = Input.trunc(DstTy.getScalarSizeInBits());
+ return true;
+ }
+ default:
+ return false;
+ }
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
new file mode 100644
index 00000000000000..f207e9c149a476
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK
+
+---
+name: test_combine_trunc_xor_i64
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_xor_i64
+ ; CHECK: %lhs:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_XOR [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s64) = COPY $x0
+ %rhs:_(s64) = G_CONSTANT i64 5
+ %res:_(s64) = G_XOR %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s64)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_add_i64
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_add_i64
+ ; CHECK: %lhs:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s64) = COPY $x0
+ %rhs:_(s64) = G_CONSTANT i64 5
+ %res:_(s64) = G_ADD %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s64)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_mul_i64
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_mul_i64
+ ; CHECK: %lhs:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_MUL [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s64) = COPY $x0
+ %rhs:_(s64) = G_CONSTANT i64 5
+ %res:_(s64) = G_MUL %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s64)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_and_i64
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_and_i64
+ ; CHECK: %lhs:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_AND [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s64) = COPY $x0
+ %rhs:_(s64) = G_CONSTANT i64 5
+ %res:_(s64) = G_AND %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s64)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_or_i64
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_or_i64
+ ; CHECK: %lhs:_(s64) = COPY $x0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_OR [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s64) = COPY $x0
+ %rhs:_(s64) = G_CONSTANT i64 5
+ %res:_(s64) = G_OR %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s64)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_sub_i128
+ ; CHECK: %lhs:_(s128) = COPY $q0
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]]
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s128) = COPY $q0
+ %rhs:_(s128) = G_CONSTANT i128 5
+ %res:_(s128) = G_SUB %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s128)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_sub_i128_multi_use
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use
+ ; CHECK: %lhs:_(s128) = COPY $q0
+ ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5
+ ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs
+ ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128)
+ ; CHECK-NEXT: $q0 = COPY %res(s128)
+ ; CHECK-NEXT: $w0 = COPY %small(s32)
+ %lhs:_(s128) = COPY $q0
+ %rhs:_(s128) = G_CONSTANT i128 5
+ %res:_(s128) = G_SUB %lhs, %rhs
+ %small:_(s32) = G_TRUNC %res(s128)
+ $q0 = COPY %res(s128)
+ $w0 = COPY %small(s32)
+...
+---
+name: test_combine_trunc_xor_vector_pattern_did_not_match
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: test_combine_trunc_xor_vector_pattern_did_not_match
+ ; CHECK: %arg1:_(s64) = COPY $x0
+ ; CHECK-NEXT: %arg2:_(s64) = COPY $x0
+ ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ ; CHECK-NEXT: %res:_(<2 x s64>) = G_XOR %lhs, %rhs
+ ; CHECK-NEXT: %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>)
+ ; CHECK-NEXT: $w0 = COPY %small(<2 x s16>)
+ %arg1:_(s64) = COPY $x0
+ %arg2:_(s64) = COPY $x0
+ %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+ %res:_(<2 x s64>) = G_XOR %lhs, %rhs
+ %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>)
+ $w0 = COPY %small(<2 x s16>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index fee5afd3ddbb2a..9ed1e2d9eee3b4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -224,10 +224,10 @@ body: |
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
- ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
- ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+ ; CHECK-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1)
; CHECK-NEXT: RET_ReallyLR
%0:_(p0) = COPY $x0
%1:_(s8) = G_CONSTANT i8 64
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
index e51d9bd13163b4..a87ff305d15351 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
@@ -8,9 +8,8 @@ tracksRegLiveness: true
body: |
bb.1:
; CHECK-LABEL: name: test
- ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
- ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s16) = G_CONSTANT i16 0
%2:_(s1) = G_CONSTANT i1 true
@@ -41,9 +40,7 @@ body: |
bb.1:
; CHECK-LABEL: name: test_inverted_div_rem
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32)
- ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8)
- ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[C]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s16) = G_CONSTANT i16 0
%2:_(s1) = G_CONSTANT i1 true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
index e4f11dfa9e027e..d6135d86022be3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
@@ -193,10 +193,10 @@ body: |
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
- ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
+ ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_CONSTANT i64 255
@@ -216,10 +216,10 @@ body: |
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
- ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]]
+ ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]]
; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = G_CONSTANT i64 255
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
index 3b914df7f8f8a3..3423af64162e52 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir
@@ -12,9 +12,11 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %var:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383
- ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF
- ; GCN-NEXT: $vgpr0 = COPY %low_bits(s32)
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32)
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383
+ ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %zext(s32)
%var:_(s32) = COPY $vgpr0
%c3FFF:_(s32) = G_CONSTANT i32 16383
%low_bits:_(s32) = G_AND %var, %c3FFF
@@ -34,10 +36,8 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %var:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %cFFFFF:_(s32) = G_CONSTANT i32 1048575
- ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %cFFFFF
- ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32)
- ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16)
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32)
+ ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16)
; GCN-NEXT: $vgpr0 = COPY %zext(s32)
%var:_(s32) = COPY $vgpr0
%cFFFFF:_(s32) = G_CONSTANT i32 1048575
@@ -58,9 +58,9 @@ body: |
; GCN: liveins: $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1
- ; GCN-NEXT: %c3FFF:_(s64) = G_CONSTANT i64 16383
- ; GCN-NEXT: %low_bits:_(s64) = G_AND %var, %c3FFF
- ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s64)
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s64)
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383
+ ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]]
; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16)
; GCN-NEXT: $vgpr0 = COPY %zext(s32)
%var:_(s64) = COPY $vgpr0_vgpr1
@@ -82,9 +82,9 @@ body: |
; GCN: liveins: $vgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: %var:_(s32) = COPY $vgpr0
- ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383
- ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF
- ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32)
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32)
+ ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383
+ ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]]
; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16)
; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64)
%var:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 966a481b6594dc..bb7bc0447aea04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -238,13 +238,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
-; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $vgpr3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index afffebea451a0e..3bd3486ec261d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -350,10 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX8-LABEL: s_fshl_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_and_b32 s3, s2, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -362,10 +364,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX9-LABEL: s_fshl_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_and_b32 s3, s2, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -377,7 +381,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX10-NEXT: s_and_b32 s3, s2, 7
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -389,7 +395,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
; GFX11-NEXT: s_and_b32 s3, s2, 7
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -416,11 +424,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX8-NEXT: v_mov_b32_e32 v3, 1
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -429,11 +437,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -441,11 +449,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX10-LABEL: v_fshl_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v3, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
@@ -454,12 +462,12 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX11-LABEL: v_fshl_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v3, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1
@@ -692,22 +700,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX8: ; %bb.0:
; GFX8-NEXT: s_lshr_b32 s4, s1, 8
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
; GFX8-NEXT: s_and_b32 s6, s2, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s5, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, s4, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s3, s4, 0xff
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s3, 7, s5
+; GFX8-NEXT: s_lshr_b32 s2, s2, 1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_andn2_b32 s2, 7, s5
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
@@ -719,22 +731,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s4, s1, 8
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
; GFX9-NEXT: s_and_b32 s6, s2, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s5, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, s4, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s3, s4, 0xff
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s3, 7, s5
+; GFX9-NEXT: s_lshr_b32 s2, s2, 1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_andn2_b32 s2, 7, s5
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_lshr_b32 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
@@ -745,21 +761,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX10-LABEL: s_fshl_v2i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s4, s1, 8
-; GFX10-NEXT: s_lshr_b32 s5, s2, 8
+; GFX10-NEXT: s_and_b32 s5, s2, 7
+; GFX10-NEXT: s_lshr_b32 s6, s2, 8
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
-; GFX10-NEXT: s_and_b32 s6, s2, 7
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_and_b32 s5, s6, 7
; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
+; GFX10-NEXT: s_andn2_b32 s6, 7, s6
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_lshl_b32 s0, s0, s6
-; GFX10-NEXT: s_and_b32 s6, s5, 7
-; GFX10-NEXT: s_andn2_b32 s5, 7, s5
-; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshr_b32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
-; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_lshr_b32 s4, s4, s5
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s3, s3, s5
+; GFX10-NEXT: s_lshr_b32 s4, s4, s6
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s2, s3, s4
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -772,21 +792,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
; GFX11-LABEL: s_fshl_v2i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s4, s1, 8
-; GFX11-NEXT: s_lshr_b32 s5, s2, 8
+; GFX11-NEXT: s_and_b32 s5, s2, 7
+; GFX11-NEXT: s_lshr_b32 s6, s2, 8
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
-; GFX11-NEXT: s_and_b32 s6, s2, 7
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_lshl_b32 s0, s0, s5
+; GFX11-NEXT: s_and_b32 s5, s6, 7
; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-NEXT: s_and_not1_b32 s6, 7, s6
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_lshl_b32 s0, s0, s6
-; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
-; GFX11-NEXT: s_lshl_b32 s3, s3, s6
-; GFX11-NEXT: s_lshr_b32 s4, s4, s5
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshl_b32 s3, s3, s5
+; GFX11-NEXT: s_lshr_b32 s4, s4, s6
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_or_b32 s2, s3, s4
; GFX11-NEXT: s_or_b32 s0, s0, s1
@@ -837,20 +861,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0
; GFX8-NEXT: v_mov_b32_e32 v6, 1
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_and_b32_e32 v1, 7, v5
-; GFX8-NEXT: v_not_b32_e32 v2, v5
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3
-; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3
+; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5
+; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -863,20 +887,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0
; GFX9-NEXT: v_mov_b32_e32 v6, 1
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v1, 7, v5
-; GFX9-NEXT: v_not_b32_e32 v2, v5
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3
-; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5
+; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
@@ -886,24 +910,24 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-LABEL: v_fshl_v2i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; GFX10-NEXT: v_not_b32_e32 v7, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 7, v4
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5
+; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4
+; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3
; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -913,26 +937,26 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX11-LABEL: v_fshl_v2i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-NEXT: v_not_b32_e32 v7, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 7, v4
; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX11-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4
; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
-; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5
+; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5
; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4
+; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3
; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
@@ -1002,13 +1026,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshr_b32 s8, s1, 24
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshr_b32 s9, s2, 8
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
; GFX8-NEXT: s_and_b32 s12, s2, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_lshr_b32 s5, s0, 24
@@ -1016,29 +1042,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s9, 7
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, s6, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s3, s1
-; GFX8-NEXT: s_and_b32 s3, s6, 0xff
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_andn2_b32 s3, 7, s9
+; GFX8-NEXT: s_lshr_b32 s2, s2, 1
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_andn2_b32 s2, 7, s9
-; GFX8-NEXT: s_lshr_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s2, s3, s2
+; GFX8-NEXT: s_lshr_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s2, s10, 7
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_and_b32 s3, s7, 0xff
; GFX8-NEXT: s_lshl_b32 s2, s4, s2
-; GFX8-NEXT: s_and_b32 s4, s7, 0xff
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_andn2_b32 s4, 7, s10
+; GFX8-NEXT: s_lshr_b32 s3, s3, 1
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_andn2_b32 s3, 7, s10
-; GFX8-NEXT: s_lshr_b32 s4, s4, 1
-; GFX8-NEXT: s_lshr_b32 s3, s4, s3
+; GFX8-NEXT: s_lshr_b32 s3, s3, s4
; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: s_and_b32 s3, s11, 7
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_andn2_b32 s4, 7, s11
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, s3
-; GFX8-NEXT: s_lshr_b32 s5, s8, 1
+; GFX8-NEXT: s_andn2_b32 s5, 7, s11
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_lshr_b32 s4, s8, 1
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_lshr_b32 s4, s5, s4
+; GFX8-NEXT: s_lshr_b32 s4, s4, s5
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
; GFX8-NEXT: s_or_b32 s3, s3, s4
@@ -1055,13 +1087,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_lshr_b32 s8, s1, 24
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshr_b32 s9, s2, 8
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
; GFX9-NEXT: s_and_b32 s12, s2, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s0, 24
@@ -1069,29 +1103,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s9, 7
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, s6, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s3, s1
-; GFX9-NEXT: s_and_b32 s3, s6, 0xff
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_andn2_b32 s3, 7, s9
+; GFX9-NEXT: s_lshr_b32 s2, s2, 1
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_andn2_b32 s2, 7, s9
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_lshr_b32 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s2, s2, s3
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s2, s10, 7
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_and_b32 s3, s7, 0xff
; GFX9-NEXT: s_lshl_b32 s2, s4, s2
-; GFX9-NEXT: s_and_b32 s4, s7, 0xff
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s4, 7, s10
+; GFX9-NEXT: s_lshr_b32 s3, s3, 1
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX9-NEXT: s_andn2_b32 s3, 7, s10
-; GFX9-NEXT: s_lshr_b32 s4, s4, 1
-; GFX9-NEXT: s_lshr_b32 s3, s4, s3
+; GFX9-NEXT: s_lshr_b32 s3, s3, s4
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s11, 7
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_andn2_b32 s4, 7, s11
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: s_lshr_b32 s5, s8, 1
+; GFX9-NEXT: s_andn2_b32 s5, 7, s11
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_lshr_b32 s4, s8, 1
+; GFX9-NEXT: s_and_b32 s5, 0xffff, s5
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_lshr_b32 s4, s5, s4
+; GFX9-NEXT: s_lshr_b32 s4, s4, s5
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s2, 0xff
; GFX9-NEXT: s_or_b32 s3, s3, s4
@@ -1108,48 +1148,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshr_b32 s9, s2, 8
+; GFX10-NEXT: s_and_b32 s11, s2, 7
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_lshr_b32 s10, s2, 16
-; GFX10-NEXT: s_lshr_b32 s11, s2, 24
-; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_andn2_b32 s12, 7, s2
+; GFX10-NEXT: s_and_b32 s11, 0xffff, s11
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s1, s1, s2
-; GFX10-NEXT: s_and_b32 s2, s6, 0xff
-; GFX10-NEXT: s_and_b32 s6, s9, 7
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_andn2_b32 s9, 7, s9
-; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
-; GFX10-NEXT: s_lshl_b32 s0, s0, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, s6
-; GFX10-NEXT: s_lshr_b32 s2, s2, s9
+; GFX10-NEXT: s_lshr_b32 s9, s2, 8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s11
+; GFX10-NEXT: s_lshr_b32 s1, s1, s12
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s3, s2
-; GFX10-NEXT: s_and_b32 s2, s7, 0xff
-; GFX10-NEXT: s_and_b32 s3, s10, 7
-; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT: s_andn2_b32 s6, 7, s10
-; GFX10-NEXT: s_lshr_b32 s2, s2, 1
+; GFX10-NEXT: s_and_b32 s1, s9, 7
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_andn2_b32 s9, 7, s9
+; GFX10-NEXT: s_lshr_b32 s10, s2, 16
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_lshr_b32 s6, s6, 1
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX10-NEXT: s_lshl_b32 s1, s3, s1
+; GFX10-NEXT: s_lshr_b32 s3, s6, s9
+; GFX10-NEXT: s_and_b32 s6, s10, 7
+; GFX10-NEXT: s_or_b32 s1, s1, s3
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX10-NEXT: s_and_b32 s6, s7, 0xff
+; GFX10-NEXT: s_lshr_b32 s2, s2, 24
; GFX10-NEXT: s_lshl_b32 s3, s4, s3
-; GFX10-NEXT: s_lshr_b32 s2, s2, s6
-; GFX10-NEXT: s_and_b32 s4, s11, 7
-; GFX10-NEXT: s_andn2_b32 s6, 7, s11
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX10-NEXT: s_andn2_b32 s6, 7, s10
+; GFX10-NEXT: s_lshr_b32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_and_b32 s7, s2, 7
+; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshr_b32 s4, s4, s6
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s7
; GFX10-NEXT: s_lshr_b32 s7, s8, 1
-; GFX10-NEXT: s_lshl_b32 s4, s5, s4
-; GFX10-NEXT: s_lshr_b32 s5, s7, s6
-; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s5, s5, s6
+; GFX10-NEXT: s_lshr_b32 s2, s7, s2
+; GFX10-NEXT: s_or_b32 s3, s3, s4
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_or_b32 s3, s4, s5
+; GFX10-NEXT: s_or_b32 s2, s5, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
+; GFX10-NEXT: s_and_b32 s3, s3, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_lshl_b32 s1, s2, 16
-; GFX10-NEXT: s_and_b32 s2, s3, 0xff
+; GFX10-NEXT: s_lshl_b32 s1, s3, 16
+; GFX10-NEXT: s_and_b32 s2, s2, 0xff
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_lshl_b32 s1, s2, 24
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -1161,48 +1209,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshr_b32 s9, s2, 8
+; GFX11-NEXT: s_and_b32 s11, s2, 7
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_lshr_b32 s10, s2, 16
-; GFX11-NEXT: s_lshr_b32 s11, s2, 24
-; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
+; GFX11-NEXT: s_and_b32 s11, 0xffff, s11
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s1, s1, s2
-; GFX11-NEXT: s_and_b32 s2, s6, 0xff
-; GFX11-NEXT: s_and_b32 s6, s9, 7
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT: s_lshr_b32 s2, s2, 1
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshl_b32 s0, s0, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, s6
-; GFX11-NEXT: s_lshr_b32 s2, s2, s9
+; GFX11-NEXT: s_lshr_b32 s9, s2, 8
+; GFX11-NEXT: s_lshl_b32 s0, s0, s11
+; GFX11-NEXT: s_lshr_b32 s1, s1, s12
+; GFX11-NEXT: s_and_b32 s6, s6, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_and_b32 s2, s7, 0xff
-; GFX11-NEXT: s_and_b32 s3, s10, 7
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_and_b32 s1, s9, 7
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
+; GFX11-NEXT: s_lshr_b32 s10, s2, 16
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_lshr_b32 s6, s6, 1
+; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX11-NEXT: s_lshl_b32 s1, s3, s1
+; GFX11-NEXT: s_lshr_b32 s3, s6, s9
+; GFX11-NEXT: s_and_b32 s6, s10, 7
+; GFX11-NEXT: s_or_b32 s1, s1, s3
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s6
+; GFX11-NEXT: s_and_b32 s6, s7, 0xff
+; GFX11-NEXT: s_lshr_b32 s2, s2, 24
; GFX11-NEXT: s_lshl_b32 s3, s4, s3
-; GFX11-NEXT: s_lshr_b32 s2, s2, s6
-; GFX11-NEXT: s_and_b32 s4, s11, 7
-; GFX11-NEXT: s_and_not1_b32 s6, 7, s11
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX11-NEXT: s_and_not1_b32 s6, 7, s10
+; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_and_b32 s7, s2, 7
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_lshr_b32 s4, s4, s6
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s7
; GFX11-NEXT: s_lshr_b32 s7, s8, 1
-; GFX11-NEXT: s_lshl_b32 s4, s5, s4
-; GFX11-NEXT: s_lshr_b32 s5, s7, s6
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshl_b32 s5, s5, s6
+; GFX11-NEXT: s_lshr_b32 s2, s7, s2
+; GFX11-NEXT: s_or_b32 s3, s3, s4
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_or_b32 s3, s4, s5
+; GFX11-NEXT: s_or_b32 s2, s5, s2
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_lshl_b32 s1, s1, 8
-; GFX11-NEXT: s_and_b32 s2, s2, 0xff
+; GFX11-NEXT: s_and_b32 s3, s3, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_lshl_b32 s1, s2, 16
-; GFX11-NEXT: s_and_b32 s2, s3, 0xff
+; GFX11-NEXT: s_lshl_b32 s1, s3, 16
+; GFX11-NEXT: s_and_b32 s2, s2, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, 24
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -1271,37 +1327,38 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-LABEL: v_fshl_v4i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_not_b32_e32 v7, v2
-; GFX8-NEXT: v_mov_b32_e32 v9, 1
+; GFX8-NEXT: v_mov_b32_e32 v8, 1
+; GFX8-NEXT: v_xor_b32_e32 v10, -1, v2
; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
-; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v10, 7, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX8-NEXT: v_lshrrev_b16_e32 v9, v10, v9
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX8-NEXT: v_and_b32_e32 v7, 7, v5
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 7, v5
+; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, v9, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, 0xff
+; GFX8-NEXT: v_mov_b32_e32 v7, 0xff
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
; GFX8-NEXT: v_mov_b32_e32 v4, 7
+; GFX8-NEXT: v_mov_b32_e32 v9, -1
; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7
+; GFX8-NEXT: v_and_b32_e32 v10, 7, v10
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7
; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
@@ -1320,46 +1377,47 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-LABEL: v_fshl_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_not_b32_e32 v7, v2
-; GFX9-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-NEXT: v_xor_b32_e32 v10, -1, v2
; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
-; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v10, 7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0
-; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT: v_lshrrev_b16_e32 v9, v10, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
-; GFX9-NEXT: v_not_b32_e32 v5, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX9-NEXT: v_and_b32_e32 v9, 7, v5
+; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, v9, v3
; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4
-; GFX9-NEXT: v_mov_b32_e32 v8, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v7, 0xff
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 7
+; GFX9-NEXT: v_mov_b32_e32 v10, -1
; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10
+; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9
+; GFX9-NEXT: v_and_b32_e32 v11, 7, v11
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9
; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
-; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v9
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1
+; GFX9-NEXT: v_and_or_b32 v1, v6, v7, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
@@ -1368,41 +1426,42 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-LABEL: v_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2
-; GFX10-NEXT: v_and_b32_e32 v9, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT: v_and_b32_e32 v8, 7, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1
+; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2
+; GFX10-NEXT: v_and_b32_e32 v11, 7, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_not_b32_e32 v12, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11
-; GFX10-NEXT: v_and_b32_e32 v11, 7, v12
-; GFX10-NEXT: v_mov_b32_e32 v12, 0xff
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3
-; GFX10-NEXT: v_mov_b32_e32 v7, 7
-; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_not_b32_e32 v8, v2
-; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6
-; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v13, 7, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9
+; GFX10-NEXT: v_and_b32_e32 v9, 7, v10
+; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3
+; GFX10-NEXT: v_mov_b32_e32 v10, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v11, -1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1
+; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX10-NEXT: v_mov_b32_e32 v13, 7
+; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7
+; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v12
-; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10
-; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6
+; GFX10-NEXT: v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12
+; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
+; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4
-; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1
+; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5
-; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10
-; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9
+; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12
+; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8
; GFX10-NEXT: v_or_b32_e32 v3, v3, v6
; GFX10-NEXT: v_mov_b32_e32 v6, 8
; GFX10-NEXT: v_or_b32_e32 v1, v4, v1
@@ -1426,7 +1485,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_not_b32_e32 v13, v9
+; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2
; GFX11-NEXT: v_and_b32_e32 v9, 7, v9
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1
@@ -1434,22 +1493,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3
-; GFX11-NEXT: v_not_b32_e32 v9, v10
+; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6
-; GFX11-NEXT: v_not_b32_e32 v13, v11
+; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX11-NEXT: v_and_b32_e32 v12, 7, v2
-; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX11-NEXT: v_and_b32_e32 v10, 7, v10
-; GFX11-NEXT: v_and_b32_e32 v9, 7, v9
; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7
+; GFX11-NEXT: v_and_b32_e32 v9, 7, v9
; GFX11-NEXT: v_and_b32_e32 v11, 7, v11
-; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8
-; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4
; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7
@@ -5087,23 +5146,48 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
}
define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) {
-; GCN-LABEL: s_fshl_i64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63
-; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5]
-; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6
-; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT: ; return to shader part epilog
+; GFX6-LABEL: s_fshl_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_not_b32 s4, s4
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_not_b32 s4, s4
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_not_b32 s4, s4
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT: s_not_b32 s5, s4
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5]
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_not_b32 s5, s4
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: ; return to shader part epilog
@@ -5181,8 +5265,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
+; GFX6-NEXT: v_not_b32_e32 v4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
@@ -5194,8 +5278,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5207,8 +5291,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v4
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_not_b32_e32 v4, v4
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -5362,36 +5446,36 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
; GFX6: ; %bb.0:
; GFX6-NEXT: v_and_b32_e32 v1, 63, v0
; GFX6-NEXT: v_not_b32_e32 v0, v0
-; GFX6-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i64_ssv:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_and_b32_e32 v1, 63, v0
; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i64_ssv:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_and_b32_e32 v1, 63, v0
; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1]
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v2, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i64_ssv:
@@ -5429,10 +5513,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
; GFX6-LABEL: v_fshl_i64_svs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT: s_andn2_b32 s3, 63, s2
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s3
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
; GFX6-NEXT: ; return to shader part epilog
@@ -5440,10 +5523,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
; GFX8-LABEL: v_fshl_i64_svs:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1]
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT: s_andn2_b32 s3, 63, s2
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: ; return to shader part epilog
@@ -5451,10 +5533,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
; GFX9-LABEL: v_fshl_i64_svs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT: s_andn2_b32 s3, 63, s2
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
; GFX9-NEXT: ; return to shader part epilog
@@ -5462,10 +5543,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
; GFX10-LABEL: v_fshl_i64_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT: s_andn2_b32 s3, 63, s2
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -5473,13 +5553,12 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
; GFX11-LABEL: v_fshl_i64_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3]
-; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_not1_b32 s3, 63, s2
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, s1, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5490,10 +5569,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) {
; GFX6-LABEL: v_fshl_i64_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT: s_and_b32 s3, s2, 63
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT: s_not_b32 s2, s2
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
@@ -5501,10 +5580,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
;
; GFX8-LABEL: v_fshl_i64_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT: s_and_b32 s3, s2, 63
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT: s_not_b32 s2, s2
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
@@ -5512,10 +5591,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
;
; GFX9-LABEL: v_fshl_i64_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT: s_and_b32 s3, s2, 63
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
@@ -5523,10 +5602,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
;
; GFX10-LABEL: v_fshl_i64_vss:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: s_and_b32 s3, s2, 63
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
+; GFX10-NEXT: s_not_b32 s2, s2
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
@@ -5534,10 +5613,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
;
; GFX11-LABEL: v_fshl_i64_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3]
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX11-NEXT: s_and_b32 s3, s2, 63
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
+; GFX11-NEXT: s_not_b32 s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -5553,80 +5632,70 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
+; GFX6-NEXT: s_not_b32 s8, s8
; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX6-NEXT: s_not_b32 s6, s10
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
+; GFX8-NEXT: s_not_b32 s8, s8
; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX8-NEXT: s_not_b32 s6, s10
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
+; GFX9-NEXT: s_not_b32 s8, s8
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX9-NEXT: s_not_b32 s6, s10
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
+; GFX10-NEXT: s_not_b32 s9, s8
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX10-NEXT: s_and_b64 s[8:9], s[10:11], 63
-; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
+; GFX10-NEXT: s_not_b32 s8, s10
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9]
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
+; GFX11-NEXT: s_not_b32 s9, s8
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63
-; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
+; GFX11-NEXT: s_not_b32 s8, s10
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s10
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
; GFX11-NEXT: ; return to shader part epilog
@@ -5639,18 +5708,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1
+; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
-; GFX6-NEXT: v_not_b32_e32 v8, v10
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8
+; GFX6-NEXT: v_not_b32_e32 v4, v10
+; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_or_b32_e32 v3, v3, v7
@@ -5660,18 +5729,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
-; GFX8-NEXT: v_not_b32_e32 v8, v10
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7]
+; GFX8-NEXT: v_not_b32_e32 v4, v10
+; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
@@ -5681,18 +5750,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
+; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v8, v10
; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7]
+; GFX9-NEXT: v_not_b32_e32 v4, v10
+; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
@@ -5750,231 +5819,236 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshl_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX6-NEXT: s_sub_i32 s9, s10, 64
-; GFX6-NEXT: s_sub_i32 s11, 64, s10
-; GFX6-NEXT: s_cmp_lt_u32 s10, 64
-; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s10, 0
+; GFX6-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s11, s9, 64
+; GFX6-NEXT: s_sub_i32 s14, 64, s9
+; GFX6-NEXT: s_cmp_lt_u32 s9, 64
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10
-; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10
-; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
-; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT: s_cmp_eq_u32 s9, 0
+; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s14
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s8
+; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8
+; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_mov_b32 s12, 0
+; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s9, 0
+; GFX6-NEXT: s_mov_b32 s10, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
-; GFX6-NEXT: s_lshl_b32 s13, s6, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX6-NEXT: s_lshl_b32 s11, s6, 31
; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX6-NEXT: s_sub_i32 s12, s8, 64
-; GFX6-NEXT: s_sub_i32 s10, 64, s8
-; GFX6-NEXT: s_cmp_lt_u32 s8, 64
-; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s8
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT: s_not_b32 s9, s8
+; GFX6-NEXT: s_sub_i32 s14, s6, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s6
+; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_cselect_b32 s15, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s6, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s9
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s9
; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
-; GFX6-NEXT: s_cmp_lg_u32 s13, 0
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT: s_cmp_lg_u32 s13, 0
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX8-NEXT: s_sub_i32 s9, s10, 64
-; GFX8-NEXT: s_sub_i32 s11, 64, s10
-; GFX8-NEXT: s_cmp_lt_u32 s10, 64
-; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s10, 0
+; GFX8-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s11, s9, 64
+; GFX8-NEXT: s_sub_i32 s14, 64, s9
+; GFX8-NEXT: s_cmp_lt_u32 s9, 64
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10
-; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10
-; GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
-; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT: s_cmp_eq_u32 s9, 0
+; GFX8-NEXT: s_cselect_b32 s9, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s14
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], s8
+; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8
+; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
-; GFX8-NEXT: s_mov_b32 s12, 0
+; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s9, 0
+; GFX8-NEXT: s_mov_b32 s10, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
-; GFX8-NEXT: s_lshl_b32 s13, s6, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX8-NEXT: s_lshl_b32 s11, s6, 31
; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX8-NEXT: s_sub_i32 s12, s8, 64
-; GFX8-NEXT: s_sub_i32 s10, 64, s8
-; GFX8-NEXT: s_cmp_lt_u32 s8, 64
-; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s8
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX8-NEXT: s_not_b32 s9, s8
+; GFX8-NEXT: s_sub_i32 s14, s6, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s6
+; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_cselect_b32 s15, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s6, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s9
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s9
; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
-; GFX8-NEXT: s_cmp_lg_u32 s13, 0
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_cmp_lg_u32 s13, 0
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX9-NEXT: s_sub_i32 s9, s10, 64
-; GFX9-NEXT: s_sub_i32 s11, 64, s10
-; GFX9-NEXT: s_cmp_lt_u32 s10, 64
-; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, 0
+; GFX9-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s11, s9, 64
+; GFX9-NEXT: s_sub_i32 s14, 64, s9
+; GFX9-NEXT: s_cmp_lt_u32 s9, 64
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10
-; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10
-; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11]
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT: s_cmp_eq_u32 s9, 0
+; GFX9-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s14
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], s8
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8
+; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_mov_b32 s12, 0
+; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s9, 0
+; GFX9-NEXT: s_mov_b32 s10, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
-; GFX9-NEXT: s_lshl_b32 s13, s6, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX9-NEXT: s_lshl_b32 s11, s6, 31
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX9-NEXT: s_sub_i32 s12, s8, 64
-; GFX9-NEXT: s_sub_i32 s10, 64, s8
-; GFX9-NEXT: s_cmp_lt_u32 s8, 64
-; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s8
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX9-NEXT: s_not_b32 s9, s8
+; GFX9-NEXT: s_sub_i32 s14, s6, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s6
+; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_cselect_b32 s15, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s6, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s9
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s9
; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX10-NEXT: s_sub_i32 s9, s10, 64
-; GFX10-NEXT: s_sub_i32 s11, 64, s10
-; GFX10-NEXT: s_cmp_lt_u32 s10, 64
-; GFX10-NEXT: s_mov_b32 s12, 0
-; GFX10-NEXT: s_cselect_b32 s13, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s10, 0
+; GFX10-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX10-NEXT: s_mov_b32 s10, 0
+; GFX10-NEXT: s_sub_i32 s11, s9, 64
+; GFX10-NEXT: s_sub_i32 s12, 64, s9
+; GFX10-NEXT: s_cmp_lt_u32 s9, 64
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11
-; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
-; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
-; GFX10-NEXT: s_cmp_lg_u32 s13, 0
-; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s9, 0
+; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8
+; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
+; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
-; GFX10-NEXT: s_lshl_b32 s13, s6, 31
+; GFX10-NEXT: s_lshl_b32 s11, s6, 31
; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX10-NEXT: s_sub_i32 s14, s8, 64
-; GFX10-NEXT: s_sub_i32 s9, 64, s8
-; GFX10-NEXT: s_cmp_lt_u32 s8, 64
-; GFX10-NEXT: s_cselect_b32 s15, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s8
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT: s_not_b32 s10, s8
+; GFX10-NEXT: s_sub_i32 s12, s6, 64
+; GFX10-NEXT: s_sub_i32 s8, 64, s6
+; GFX10-NEXT: s_cmp_lt_u32 s6, 64
+; GFX10-NEXT: s_cselect_b32 s13, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s6, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
-; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
-; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s10
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT: s_cmp_lg_u32 s15, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
-; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s13, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i128:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9]
-; GFX11-NEXT: s_sub_i32 s9, s10, 64
-; GFX11-NEXT: s_sub_i32 s11, 64, s10
-; GFX11-NEXT: s_cmp_lt_u32 s10, 64
-; GFX11-NEXT: s_mov_b32 s12, 0
-; GFX11-NEXT: s_cselect_b32 s13, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, 0
+; GFX11-NEXT: s_and_b32 s9, s8, 0x7f
+; GFX11-NEXT: s_mov_b32 s10, 0
+; GFX11-NEXT: s_sub_i32 s11, s9, 64
+; GFX11-NEXT: s_sub_i32 s12, 64, s9
+; GFX11-NEXT: s_cmp_lt_u32 s9, 64
; GFX11-NEXT: s_cselect_b32 s18, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11
-; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
-; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
-; GFX11-NEXT: s_cmp_lg_u32 s13, 0
-; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s9, 0
+; GFX11-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
+; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8
+; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
; GFX11-NEXT: s_cmp_lg_u32 s18, 0
+; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
-; GFX11-NEXT: s_lshl_b32 s13, s6, 31
+; GFX11-NEXT: s_lshl_b32 s11, s6, 31
; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
-; GFX11-NEXT: s_sub_i32 s14, s8, 64
-; GFX11-NEXT: s_sub_i32 s9, 64, s8
-; GFX11-NEXT: s_cmp_lt_u32 s8, 64
-; GFX11-NEXT: s_cselect_b32 s15, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s8, 0
+; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s8
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX11-NEXT: s_not_b32 s10, s8
+; GFX11-NEXT: s_sub_i32 s12, s6, 64
+; GFX11-NEXT: s_sub_i32 s8, 64, s6
+; GFX11-NEXT: s_cmp_lt_u32 s6, 64
+; GFX11-NEXT: s_cselect_b32 s13, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s6, 0
; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
-; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
-; GFX11-NEXT: s_cmp_lg_u32 s15, 0
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s10
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s16, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s15, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
-; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -5985,143 +6059,143 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14
-; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14
-; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15
+; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15
+; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9
+; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15
+; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6
+; GFX6-NEXT: v_not_b32_e32 v4, v8
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1
-; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15
-; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15
+; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14
+; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14
; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v15
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v11, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v12, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v12, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v10, v2
; GFX6-NEXT: v_or_b32_e32 v3, v13, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14
-; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15
+; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15
+; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6
+; GFX8-NEXT: v_not_b32_e32 v4, v8
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15
-; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v11, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v12, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v12, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v10, v2
; GFX8-NEXT: v_or_b32_e32 v3, v13, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14
-; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15
+; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15
+; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc
+; GFX9-NEXT: v_not_b32_e32 v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7]
+; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1
-; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15
-; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14
+; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v11, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v12, v2
+; GFX9-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX9-NEXT: v_or_b32_e32 v1, v12, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
; GFX9-NEXT: v_or_b32_e32 v3, v13, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -6129,15 +6203,15 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX10-NEXT: v_not_b32_e32 v8, v8
+; GFX10-NEXT: v_not_b32_e32 v10, v8
; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7]
-; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v18
+; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10
; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5
; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18
-; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19
; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1]
; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
@@ -6175,43 +6249,43 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX11-LABEL: v_fshl_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8
-; GFX11-NEXT: v_not_b32_e32 v8, v8
+; GFX11-NEXT: v_not_b32_e32 v10, v8
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5]
; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18
-; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
-; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10
; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3]
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5
; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18
-; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1]
; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1]
; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18
; GFX11-NEXT: v_or_b32_e32 v10, v10, v8
; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19
; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13]
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1]
; GFX11-NEXT: v_or_b32_e32 v11, v11, v9
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19
+; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13]
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo
; GFX11-NEXT: v_or_b32_e32 v14, v14, v16
; GFX11-NEXT: v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11
; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0
; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18
+; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0
; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2
; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0
@@ -6229,173 +6303,173 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshl_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6
-; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6
-; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7
+; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7
+; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX6-NEXT: v_not_b32_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s8, 0
-; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_mov_b32_e32 v3, s2
+; GFX6-NEXT: v_mov_b32_e32 v4, s3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX6-NEXT: s_lshl_b32 s9, s6, 31
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10
; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7
+; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v9, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v9, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1]
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: s_mov_b32 s8, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX8-NEXT: s_lshl_b32 s9, s6, 31
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7
+; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v9, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v9, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1]
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
+; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
; GFX9-NEXT: s_lshl_b32 s9, s6, 31
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7
+; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v9, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i128_ssv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX10-NEXT: v_not_b32_e32 v0, v0
+; GFX10-NEXT: v_not_b32_e32 v2, v0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
; GFX10-NEXT: s_lshl_b32 s9, s6, 31
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v12
+; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1]
; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9]
@@ -6434,58 +6508,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
; GFX11-LABEL: v_fshl_i128_ssv:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0
-; GFX11-NEXT: v_not_b32_e32 v0, v0
+; GFX11-NEXT: v_not_b32_e32 v2, v0
; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
; GFX11-NEXT: s_lshl_b32 s9, s6, 31
; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1]
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v2
; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo
-; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12
-; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12
-; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo
-; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1]
-; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3]
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13
+; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12
; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9]
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7]
+; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1]
+; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13
; GFX11-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13
; GFX11-NEXT: v_or_b32_e32 v3, v3, v1
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v6, v6, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo
-; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v7, v7, v9
-; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7]
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4
; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1
-; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v2, v6, v2
; GFX11-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v1, v5, v1
+; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -6495,43 +6563,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT: s_sub_i32 s5, s6, 64
-; GFX6-NEXT: s_sub_i32 s7, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
-; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s12, s5, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6
-; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: s_sub_i32 s0, s4, 64
-; GFX6-NEXT: s_sub_i32 s1, 64, s4
+; GFX6-NEXT: s_sub_i32 s1, s0, 64
+; GFX6-NEXT: s_sub_i32 s4, 64, s0
; GFX6-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s5, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1
-; GFX6-NEXT: s_cselect_b32 s6, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX6-NEXT: s_and_b32 s0, 1, s6
+; GFX6-NEXT: s_and_b32 s0, 1, s8
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6539,51 +6607,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, s8, v0
-; GFX6-NEXT: v_or_b32_e32 v1, s9, v1
+; GFX6-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s7, v1
; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshl_i128_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT: s_sub_i32 s5, s6, 64
-; GFX8-NEXT: s_sub_i32 s7, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
-; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s12, s5, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6
-; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
-; GFX8-NEXT: s_cmp_lg_u32 s12, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
+; GFX8-NEXT: s_cselect_b32 s5, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s5, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: s_sub_i32 s0, s4, 64
-; GFX8-NEXT: s_sub_i32 s1, 64, s4
+; GFX8-NEXT: s_sub_i32 s1, s0, 64
+; GFX8-NEXT: s_sub_i32 s4, 64, s0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: s_and_b32 s0, 1, s6
+; GFX8-NEXT: s_and_b32 s0, 1, s8
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6591,50 +6659,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, s8, v0
-; GFX8-NEXT: v_or_b32_e32 v1, s9, v1
+; GFX8-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s7, v1
; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshl_i128_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT: s_sub_i32 s5, s6, 64
-; GFX9-NEXT: s_sub_i32 s7, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
-; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s12, s5, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s7
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6
-; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
+; GFX9-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: s_sub_i32 s0, s4, 64
-; GFX9-NEXT: s_sub_i32 s1, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_sub_i32 s1, s0, 64
+; GFX9-NEXT: s_sub_i32 s4, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: s_and_b32 s0, 1, s6
+; GFX9-NEXT: s_and_b32 s0, 1, s8
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -6642,50 +6710,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, s8, v0
-; GFX9-NEXT: v_or_b32_e32 v1, s9, v1
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s7, v1
; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshl_i128_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT: s_sub_i32 s5, s6, 64
-; GFX10-NEXT: s_sub_i32 s7, 64, s6
-; GFX10-NEXT: s_cmp_lt_u32 s6, 64
+; GFX10-NEXT: s_and_b32 s5, s4, 0x7f
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s6, 0
+; GFX10-NEXT: s_sub_i32 s12, s5, 64
+; GFX10-NEXT: s_sub_i32 s6, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6
-; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6
-; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
-; GFX10-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s0, 64, s4
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT: s_sub_i32 s0, s4, 64
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_andn2_b32 s0, 0x7f, s4
+; GFX10-NEXT: s_sub_i32 s1, 64, s0
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX10-NEXT: s_sub_i32 s1, s0, 64
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s1
+; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
; GFX10-NEXT: s_and_b32 s0, 1, s5
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3]
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
@@ -6695,62 +6763,62 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT: v_or_b32_e32 v0, s6, v0
-; GFX10-NEXT: v_or_b32_e32 v1, s7, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s9, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshl_i128_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT: s_sub_i32 s5, s6, 64
-; GFX11-NEXT: s_sub_i32 s7, 64, s6
-; GFX11-NEXT: s_cmp_lt_u32 s6, 64
+; GFX11-NEXT: s_and_b32 s5, s4, 0x7f
; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 0
+; GFX11-NEXT: s_sub_i32 s12, s5, 64
+; GFX11-NEXT: s_sub_i32 s6, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: s_cselect_b32 s13, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6
-; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6
-; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
-; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX11-NEXT: s_cmp_lg_u32 s13, 0
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s5, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s0, 64, s4
+; GFX11-NEXT: s_and_not1_b32 s0, 0x7f, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT: s_sub_i32 s0, s4, 64
-; GFX11-NEXT: s_cmp_lt_u32 s4, 64
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_sub_i32 s1, 64, s0
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX11-NEXT: s_sub_i32 s1, s0, 64
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX11-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
; GFX11-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s1
+; GFX11-NEXT: s_and_b32 s1, 1, s4
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
; GFX11-NEXT: s_and_b32 s0, 1, s5
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT: v_or_b32_e32 v0, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v1, s7, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX11-NEXT: v_or_b32_e32 v1, s9, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -6760,25 +6828,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshl_i128_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT: s_sub_i32 s5, s6, 64
-; GFX6-NEXT: s_sub_i32 s7, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s7, s5, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
-; GFX6-NEXT: s_mov_b32 s8, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
+; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_cselect_b32 s10, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5
-; GFX6-NEXT: s_and_b32 s5, 1, s9
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s5
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: s_lshl_b32 s9, s2, 31
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6
+; GFX6-NEXT: s_lshl_b32 s7, s2, 31
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5
+; GFX6-NEXT: s_and_b32 s5, 1, s9
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT: s_not_b32 s6, s4
+; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s4
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX6-NEXT: s_and_b32 s5, 1, s10
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_sub_i32 s10, s4, 64
; GFX6-NEXT: s_sub_i32 s8, 64, s4
@@ -6793,19 +6862,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v6
; GFX6-NEXT: v_or_b32_e32 v1, s1, v7
; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6814,25 +6883,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshl_i128_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT: s_sub_i32 s5, s6, 64
-; GFX8-NEXT: s_sub_i32 s7, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s7, s5, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
-; GFX8-NEXT: s_mov_b32 s8, 0
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
+; GFX8-NEXT: s_mov_b32 s6, 0
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX8-NEXT: s_and_b32 s5, 1, s9
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: s_lshl_b32 s9, s2, 31
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3]
+; GFX8-NEXT: s_lshl_b32 s7, s2, 31
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX8-NEXT: s_and_b32 s5, 1, s9
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_not_b32 s6, s4
+; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s4
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX8-NEXT: s_and_b32 s5, 1, s10
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_sub_i32 s10, s4, 64
; GFX8-NEXT: s_sub_i32 s8, 64, s4
@@ -6847,19 +6917,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v6
; GFX8-NEXT: v_or_b32_e32 v1, s1, v7
; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6868,25 +6938,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshl_i128_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT: s_sub_i32 s5, s6, 64
-; GFX9-NEXT: s_sub_i32 s7, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s7, s5, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
-; GFX9-NEXT: s_mov_b32 s8, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
+; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX9-NEXT: s_and_b32 s5, 1, s9
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_lshl_b32 s9, s2, 31
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3]
+; GFX9-NEXT: s_lshl_b32 s7, s2, 31
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX9-NEXT: s_and_b32 s5, 1, s9
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX9-NEXT: s_not_b32 s6, s4
+; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s4
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX9-NEXT: s_and_b32 s5, 1, s10
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_sub_i32 s10, s4, 64
; GFX9-NEXT: s_sub_i32 s8, 64, s4
@@ -6901,19 +6972,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
; GFX9-NEXT: v_or_b32_e32 v0, s0, v6
; GFX9-NEXT: v_or_b32_e32 v1, s1, v7
; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6922,53 +6993,54 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX10-LABEL: v_fshl_i128_vss:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT: s_sub_i32 s5, s6, 64
-; GFX10-NEXT: s_sub_i32 s7, 64, s6
-; GFX10-NEXT: s_cmp_lt_u32 s6, 64
+; GFX10-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX10-NEXT: s_sub_i32 s6, s5, 64
+; GFX10-NEXT: s_sub_i32 s7, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s6, 0
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3]
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX10-NEXT: s_and_b32 s6, 1, s8
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
; GFX10-NEXT: s_lshl_b32 s7, s2, 31
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT: s_and_b32 s5, 1, s9
+; GFX10-NEXT: s_and_b32 s5, 1, s8
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_sub_i32 s10, s4, 64
-; GFX10-NEXT: s_sub_i32 s8, 64, s4
+; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10-NEXT: s_and_b32 s5, 1, s9
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT: s_not_b32 s8, s4
+; GFX10-NEXT: s_sub_i32 s10, s6, 64
+; GFX10-NEXT: s_sub_i32 s7, 64, s6
+; GFX10-NEXT: s_cmp_lt_u32 s6, 64
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_cmp_eq_u32 s6, 0
; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_cselect_b32 s12, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8
+; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8
+; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v7
; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
@@ -6976,50 +7048,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX11-LABEL: v_fshl_i128_vss:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT: s_sub_i32 s5, s6, 64
-; GFX11-NEXT: s_sub_i32 s7, 64, s6
-; GFX11-NEXT: s_cmp_lt_u32 s6, 64
+; GFX11-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s6, s5, 64
+; GFX11-NEXT: s_sub_i32 s7, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
; GFX11-NEXT: s_cselect_b32 s8, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 0
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3]
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX11-NEXT: s_cselect_b32 s9, 1, 0
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1]
-; GFX11-NEXT: s_and_b32 s6, 1, s8
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1
; GFX11-NEXT: s_lshl_b32 s7, s2, 31
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT: s_and_b32 s5, 1, s9
+; GFX11-NEXT: s_and_b32 s5, 1, s8
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_sub_i32 s10, s4, 64
-; GFX11-NEXT: s_sub_i32 s8, 64, s4
+; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s4
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT: s_cmp_lt_u32 s4, 64
+; GFX11-NEXT: s_and_b32 s5, 1, s9
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
+; GFX11-NEXT: s_not_b32 s8, s4
+; GFX11-NEXT: s_sub_i32 s10, s6, 64
+; GFX11-NEXT: s_sub_i32 s7, 64, s6
+; GFX11-NEXT: s_cmp_lt_u32 s6, 64
; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
; GFX11-NEXT: s_cselect_b32 s11, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_cmp_eq_u32 s6, 0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_cselect_b32 s12, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8
+; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8
+; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0
; GFX11-NEXT: v_or_b32_e32 v1, s1, v7
; GFX11-NEXT: v_or_b32_e32 v2, s2, v2
; GFX11-NEXT: v_or_b32_e32 v3, s3, v3
@@ -7152,40 +7226,41 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshl_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
-; GFX6-NEXT: s_sub_i32 s17, s18, 64
-; GFX6-NEXT: s_sub_i32 s19, 64, s18
-; GFX6-NEXT: s_cmp_lt_u32 s18, 64
-; GFX6-NEXT: s_cselect_b32 s23, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, 0
+; GFX6-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s19, s17, 64
+; GFX6-NEXT: s_sub_i32 s21, 64, s17
+; GFX6-NEXT: s_cmp_lt_u32 s17, 64
; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18
-; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18
-; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19]
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
-; GFX6-NEXT: s_cmp_lg_u32 s23, 0
-; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0
-; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX6-NEXT: s_cmp_eq_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s17, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[24:25], s[0:1], s21
+; GFX6-NEXT: s_lshl_b64 s[26:27], s[2:3], s16
+; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16
+; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_mov_b32 s22, 0
+; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_mov_b32 s18, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
-; GFX6-NEXT: s_lshl_b32 s23, s10, 31
-; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX6-NEXT: s_lshl_b32 s19, s10, 31
; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
-; GFX6-NEXT: s_sub_i32 s23, s16, 64
-; GFX6-NEXT: s_sub_i32 s18, 64, s16
-; GFX6-NEXT: s_cmp_lt_u32 s16, 64
+; GFX6-NEXT: s_andn2_b32 s10, 0x7f, s16
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX6-NEXT: s_not_b32 s17, s16
+; GFX6-NEXT: s_sub_i32 s19, s10, 64
+; GFX6-NEXT: s_sub_i32 s21, 64, s10
+; GFX6-NEXT: s_cmp_lt_u32 s10, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, 0
+; GFX6-NEXT: s_cmp_eq_u32 s10, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
-; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s17
+; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s17
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19
; GFX6-NEXT: s_cmp_lg_u32 s26, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s27, 0
@@ -7193,86 +7268,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX6-NEXT: s_cmp_lg_u32 s26, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
-; GFX6-NEXT: s_sub_i32 s11, s8, 64
-; GFX6-NEXT: s_sub_i32 s9, 64, s8
+; GFX6-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX6-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX6-NEXT: s_sub_i32 s19, s8, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s8
; GFX6-NEXT: s_cmp_lt_u32 s8, 64
-; GFX6-NEXT: s_cselect_b32 s20, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s8, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8
-; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8
-; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_cselect_b32 s22, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s20
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT: s_cmp_lg_u32 s22, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
-; GFX6-NEXT: s_lshl_b32 s23, s14, 31
-; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
-; GFX6-NEXT: s_sub_i32 s18, s10, 64
-; GFX6-NEXT: s_sub_i32 s14, 64, s10
-; GFX6-NEXT: s_cmp_lt_u32 s10, 64
+; GFX6-NEXT: s_lshl_b32 s19, s14, 31
+; GFX6-NEXT: s_andn2_b32 s12, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19]
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
+; GFX6-NEXT: s_not_b32 s14, s20
+; GFX6-NEXT: s_sub_i32 s18, s12, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s12
+; GFX6-NEXT: s_cmp_lt_u32 s12, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s10, 0
+; GFX6-NEXT: s_cmp_eq_u32 s12, 0
; GFX6-NEXT: s_cselect_b32 s20, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
-; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
+; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s14
+; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s14
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16
+; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshl_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
-; GFX8-NEXT: s_sub_i32 s17, s18, 64
-; GFX8-NEXT: s_sub_i32 s19, 64, s18
-; GFX8-NEXT: s_cmp_lt_u32 s18, 64
-; GFX8-NEXT: s_cselect_b32 s23, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s18, 0
+; GFX8-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s19, s17, 64
+; GFX8-NEXT: s_sub_i32 s21, 64, s17
+; GFX8-NEXT: s_cmp_lt_u32 s17, 64
; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18
-; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18
-; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19]
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
-; GFX8-NEXT: s_cmp_lg_u32 s23, 0
-; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0
-; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX8-NEXT: s_cmp_eq_u32 s17, 0
+; GFX8-NEXT: s_cselect_b32 s17, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[24:25], s[0:1], s21
+; GFX8-NEXT: s_lshl_b64 s[26:27], s[2:3], s16
+; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16
+; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_mov_b32 s22, 0
+; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s17, 0
+; GFX8-NEXT: s_mov_b32 s18, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
-; GFX8-NEXT: s_lshl_b32 s23, s10, 31
-; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX8-NEXT: s_lshl_b32 s19, s10, 31
; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
-; GFX8-NEXT: s_sub_i32 s23, s16, 64
-; GFX8-NEXT: s_sub_i32 s18, 64, s16
-; GFX8-NEXT: s_cmp_lt_u32 s16, 64
+; GFX8-NEXT: s_andn2_b32 s10, 0x7f, s16
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX8-NEXT: s_not_b32 s17, s16
+; GFX8-NEXT: s_sub_i32 s19, s10, 64
+; GFX8-NEXT: s_sub_i32 s21, 64, s10
+; GFX8-NEXT: s_cmp_lt_u32 s10, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s16, 0
+; GFX8-NEXT: s_cmp_eq_u32 s10, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
-; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s17
+; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s17
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19
; GFX8-NEXT: s_cmp_lg_u32 s26, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
@@ -7280,86 +7357,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX8-NEXT: s_cmp_lg_u32 s26, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
-; GFX8-NEXT: s_sub_i32 s11, s8, 64
-; GFX8-NEXT: s_sub_i32 s9, 64, s8
+; GFX8-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX8-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX8-NEXT: s_sub_i32 s19, s8, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s8
; GFX8-NEXT: s_cmp_lt_u32 s8, 64
-; GFX8-NEXT: s_cselect_b32 s20, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s8, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8
-; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8
-; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
-; GFX8-NEXT: s_cmp_lg_u32 s20, 0
-; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_cselect_b32 s22, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s20
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT: s_cmp_lg_u32 s22, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
-; GFX8-NEXT: s_lshl_b32 s23, s14, 31
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
-; GFX8-NEXT: s_sub_i32 s18, s10, 64
-; GFX8-NEXT: s_sub_i32 s14, 64, s10
-; GFX8-NEXT: s_cmp_lt_u32 s10, 64
+; GFX8-NEXT: s_lshl_b32 s19, s14, 31
+; GFX8-NEXT: s_andn2_b32 s12, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19]
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
+; GFX8-NEXT: s_not_b32 s14, s20
+; GFX8-NEXT: s_sub_i32 s18, s12, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s12
+; GFX8-NEXT: s_cmp_lt_u32 s12, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s10, 0
+; GFX8-NEXT: s_cmp_eq_u32 s12, 0
; GFX8-NEXT: s_cselect_b32 s20, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
-; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
+; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s14
+; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s14
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16
+; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s20, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0
+; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshl_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
-; GFX9-NEXT: s_sub_i32 s17, s18, 64
-; GFX9-NEXT: s_sub_i32 s19, 64, s18
-; GFX9-NEXT: s_cmp_lt_u32 s18, 64
-; GFX9-NEXT: s_cselect_b32 s23, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s18, 0
+; GFX9-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s19, s17, 64
+; GFX9-NEXT: s_sub_i32 s21, 64, s17
+; GFX9-NEXT: s_cmp_lt_u32 s17, 64
; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18
-; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s18
-; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19]
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
-; GFX9-NEXT: s_cmp_lg_u32 s23, 0
-; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0
-; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1]
+; GFX9-NEXT: s_cmp_eq_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[0:1], s21
+; GFX9-NEXT: s_lshl_b64 s[26:27], s[2:3], s16
+; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16
+; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_mov_b32 s22, 0
+; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_mov_b32 s18, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
-; GFX9-NEXT: s_lshl_b32 s23, s10, 31
-; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX9-NEXT: s_lshl_b32 s19, s10, 31
; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
-; GFX9-NEXT: s_sub_i32 s23, s16, 64
-; GFX9-NEXT: s_sub_i32 s18, 64, s16
-; GFX9-NEXT: s_cmp_lt_u32 s16, 64
+; GFX9-NEXT: s_andn2_b32 s10, 0x7f, s16
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX9-NEXT: s_not_b32 s17, s16
+; GFX9-NEXT: s_sub_i32 s19, s10, 64
+; GFX9-NEXT: s_sub_i32 s21, 64, s10
+; GFX9-NEXT: s_cmp_lt_u32 s10, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, 0
+; GFX9-NEXT: s_cmp_eq_u32 s10, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16
-; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s17
+; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s17
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s19
; GFX9-NEXT: s_cmp_lg_u32 s26, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
@@ -7367,222 +7446,227 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
; GFX9-NEXT: s_cmp_lg_u32 s26, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
-; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
-; GFX9-NEXT: s_sub_i32 s11, s8, 64
-; GFX9-NEXT: s_sub_i32 s9, 64, s8
+; GFX9-NEXT: s_and_b32 s8, s20, 0x7f
+; GFX9-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1]
+; GFX9-NEXT: s_sub_i32 s19, s8, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s8
; GFX9-NEXT: s_cmp_lt_u32 s8, 64
-; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s8, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8
-; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8
-; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
-; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_cselect_b32 s22, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s20
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT: s_cmp_lg_u32 s22, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
-; GFX9-NEXT: s_lshl_b32 s23, s14, 31
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
-; GFX9-NEXT: s_sub_i32 s18, s10, 64
-; GFX9-NEXT: s_sub_i32 s14, 64, s10
-; GFX9-NEXT: s_cmp_lt_u32 s10, 64
+; GFX9-NEXT: s_lshl_b32 s19, s14, 31
+; GFX9-NEXT: s_andn2_b32 s12, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19]
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
+; GFX9-NEXT: s_not_b32 s14, s20
+; GFX9-NEXT: s_sub_i32 s18, s12, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s12
+; GFX9-NEXT: s_cmp_lt_u32 s12, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, 0
+; GFX9-NEXT: s_cmp_eq_u32 s12, 0
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
-; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
+; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s14
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s14
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16
+; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5]
-; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0
+; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshl_v2i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
-; GFX10-NEXT: s_sub_i32 s17, s18, 64
-; GFX10-NEXT: s_sub_i32 s19, 64, s18
-; GFX10-NEXT: s_cmp_lt_u32 s18, 64
-; GFX10-NEXT: s_mov_b32 s22, 0
-; GFX10-NEXT: s_cselect_b32 s23, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s18, 0
+; GFX10-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX10-NEXT: s_mov_b32 s18, 0
+; GFX10-NEXT: s_sub_i32 s19, s17, 64
+; GFX10-NEXT: s_sub_i32 s21, 64, s17
+; GFX10-NEXT: s_cmp_lt_u32 s17, 64
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18
-; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18
-; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
-; GFX10-NEXT: s_cmp_lg_u32 s23, 0
-; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s17, 0
+; GFX10-NEXT: s_cselect_b32 s17, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16
+; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16
+; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
+; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
-; GFX10-NEXT: s_lshl_b32 s23, s10, 31
+; GFX10-NEXT: s_lshl_b32 s19, s10, 31
; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX10-NEXT: s_sub_i32 s23, s16, 64
-; GFX10-NEXT: s_sub_i32 s17, 64, s16
-; GFX10-NEXT: s_cmp_lt_u32 s16, 64
+; GFX10-NEXT: s_andn2_b32 s10, 0x7f, s16
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX10-NEXT: s_not_b32 s19, s16
+; GFX10-NEXT: s_sub_i32 s21, s10, 64
+; GFX10-NEXT: s_sub_i32 s16, 64, s10
+; GFX10-NEXT: s_cmp_lt_u32 s10, 64
; GFX10-NEXT: s_cselect_b32 s26, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s16, 0
+; GFX10-NEXT: s_cmp_eq_u32 s10, 0
; GFX10-NEXT: s_cselect_b32 s27, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17
-; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s19
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16
+; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19
+; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21
; GFX10-NEXT: s_cmp_lg_u32 s26, 0
; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s27, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s26, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
-; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0
+; GFX10-NEXT: s_and_b32 s10, s20, 0x7f
+; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1]
-; GFX10-NEXT: s_sub_i32 s11, s8, 64
-; GFX10-NEXT: s_sub_i32 s9, 64, s8
-; GFX10-NEXT: s_cmp_lt_u32 s8, 64
-; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_sub_i32 s19, s10, 64
+; GFX10-NEXT: s_sub_i32 s8, 64, s10
+; GFX10-NEXT: s_cmp_lt_u32 s10, 64
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9
-; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
-; GFX10-NEXT: s_cmp_lg_u32 s20, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5]
+; GFX10-NEXT: s_cmp_eq_u32 s10, 0
+; GFX10-NEXT: s_cselect_b32 s22, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20
+; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT: s_cmp_lg_u32 s22, 0
; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
-; GFX10-NEXT: s_lshl_b32 s23, s14, 31
-; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1
-; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX10-NEXT: s_sub_i32 s18, s10, 64
-; GFX10-NEXT: s_sub_i32 s11, 64, s10
-; GFX10-NEXT: s_cmp_lt_u32 s10, 64
+; GFX10-NEXT: s_lshl_b32 s19, s14, 31
+; GFX10-NEXT: s_andn2_b32 s12, 0x7f, s20
+; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
+; GFX10-NEXT: s_not_b32 s16, s20
+; GFX10-NEXT: s_sub_i32 s18, s12, 64
+; GFX10-NEXT: s_sub_i32 s14, 64, s12
+; GFX10-NEXT: s_cmp_lt_u32 s12, 64
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s10, 0
+; GFX10-NEXT: s_cmp_eq_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10
-; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10
-; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18
+; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s16
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
+; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
-; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s20, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
-; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
+; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_v2i128:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17]
-; GFX11-NEXT: s_sub_i32 s17, s18, 64
-; GFX11-NEXT: s_sub_i32 s19, 64, s18
-; GFX11-NEXT: s_cmp_lt_u32 s18, 64
-; GFX11-NEXT: s_mov_b32 s22, 0
-; GFX11-NEXT: s_cselect_b32 s23, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s18, 0
+; GFX11-NEXT: s_and_b32 s17, s16, 0x7f
+; GFX11-NEXT: s_mov_b32 s18, 0
+; GFX11-NEXT: s_sub_i32 s19, s17, 64
+; GFX11-NEXT: s_sub_i32 s21, 64, s17
+; GFX11-NEXT: s_cmp_lt_u32 s17, 64
; GFX11-NEXT: s_cselect_b32 s28, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18
-; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18
-; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17
-; GFX11-NEXT: s_cmp_lg_u32 s23, 0
-; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s17, 0
+; GFX11-NEXT: s_cselect_b32 s17, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16
+; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16
+; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
-; GFX11-NEXT: s_lshl_b32 s23, s10, 31
+; GFX11-NEXT: s_lshl_b32 s19, s10, 31
; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
-; GFX11-NEXT: s_sub_i32 s23, s16, 64
-; GFX11-NEXT: s_sub_i32 s17, 64, s16
-; GFX11-NEXT: s_cmp_lt_u32 s16, 64
+; GFX11-NEXT: s_and_not1_b32 s10, 0x7f, s16
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19]
+; GFX11-NEXT: s_not_b32 s19, s16
+; GFX11-NEXT: s_sub_i32 s21, s10, 64
+; GFX11-NEXT: s_sub_i32 s16, 64, s10
+; GFX11-NEXT: s_cmp_lt_u32 s10, 64
; GFX11-NEXT: s_cselect_b32 s26, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s16, 0
+; GFX11-NEXT: s_cmp_eq_u32 s10, 0
; GFX11-NEXT: s_cselect_b32 s27, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17
-; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
-; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25]
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s19
+; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16
+; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19
+; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21
; GFX11-NEXT: s_cmp_lg_u32 s26, 0
; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX11-NEXT: s_cmp_lg_u32 s27, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
; GFX11-NEXT: s_cmp_lg_u32 s26, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
-; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0
+; GFX11-NEXT: s_and_b32 s10, s20, 0x7f
+; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1]
-; GFX11-NEXT: s_sub_i32 s11, s8, 64
-; GFX11-NEXT: s_sub_i32 s9, 64, s8
-; GFX11-NEXT: s_cmp_lt_u32 s8, 64
-; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s8, 0
+; GFX11-NEXT: s_sub_i32 s19, s10, 64
+; GFX11-NEXT: s_sub_i32 s8, 64, s10
+; GFX11-NEXT: s_cmp_lt_u32 s10, 64
; GFX11-NEXT: s_cselect_b32 s21, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9
-; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8
-; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5]
+; GFX11-NEXT: s_cmp_eq_u32 s10, 0
+; GFX11-NEXT: s_cselect_b32 s22, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20
+; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20
+; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s19
; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s22, 0
; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1
-; GFX11-NEXT: s_lshl_b32 s23, s14, 31
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1
-; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23]
-; GFX11-NEXT: s_sub_i32 s18, s10, 64
-; GFX11-NEXT: s_sub_i32 s11, 64, s10
-; GFX11-NEXT: s_cmp_lt_u32 s10, 64
+; GFX11-NEXT: s_lshl_b32 s19, s14, 31
+; GFX11-NEXT: s_and_not1_b32 s12, 0x7f, s20
+; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19]
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
+; GFX11-NEXT: s_not_b32 s16, s20
+; GFX11-NEXT: s_sub_i32 s18, s12, 64
+; GFX11-NEXT: s_sub_i32 s14, 64, s12
+; GFX11-NEXT: s_cmp_lt_u32 s12, 64
; GFX11-NEXT: s_cselect_b32 s19, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, 0
+; GFX11-NEXT: s_cmp_eq_u32 s12, 0
; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10
-; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10
-; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18
+; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s16
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14
+; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18
; GFX11-NEXT: s_cmp_lg_u32 s19, 0
-; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13]
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13]
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
; GFX11-NEXT: s_cmp_lg_u32 s19, 0
-; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
+; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX11-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
ret <2 x i128> %result
@@ -7592,56 +7676,54 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-LABEL: v_fshl_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX6-NEXT: v_not_b32_e32 v16, v16
-; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23
-; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23
-; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
-; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23
+; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19
+; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19
+; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
+; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25
-; GFX6-NEXT: v_or_b32_e32 v16, v16, v18
-; GFX6-NEXT: v_or_b32_e32 v17, v17, v19
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX6-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX6-NEXT: v_or_b32_e32 v17, v17, v21
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v22
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX6-NEXT: v_not_b32_e32 v8, v16
; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24
-; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24
+; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23
+; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23
; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10
-; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23
+; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24
; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v0, v18, v0
-; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT: v_not_b32_e32 v8, v20
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX6-NEXT: v_or_b32_e32 v2, v18, v2
+; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT: v_or_b32_e32 v1, v19, v1
-; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18
-; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18
+; GFX6-NEXT: v_or_b32_e32 v3, v19, v3
+; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18
; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8
; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18
; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20
+; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19
; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
@@ -7651,88 +7733,88 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14
+; GFX6-NEXT: v_not_b32_e32 v8, v20
; GFX6-NEXT: v_or_b32_e32 v5, v5, v6
; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1
-; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19
-; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19
+; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14
+; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14
; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10
-; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19
-; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14
+; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15
; GFX6-NEXT: v_or_b32_e32 v8, v8, v10
; GFX6-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX6-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v22, v1
; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
; GFX6-NEXT: v_or_b32_e32 v5, v17, v5
; GFX6-NEXT: v_or_b32_e32 v6, v18, v6
-; GFX6-NEXT: v_or_b32_e32 v7, v20, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v19, v7
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshl_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX8-NEXT: v_not_b32_e32 v16, v16
-; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23
-; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23
-; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19
+; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19
+; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v16, v16, v18
-; GFX8-NEXT: v_or_b32_e32 v17, v17, v19
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc
+; GFX8-NEXT: v_or_b32_e32 v17, v17, v21
+; GFX8-NEXT: v_or_b32_e32 v18, v18, v22
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10
+; GFX8-NEXT: v_not_b32_e32 v8, v16
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v24
-; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23
+; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v0, v18, v0
-; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT: v_not_b32_e32 v8, v20
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX8-NEXT: v_or_b32_e32 v2, v18, v2
+; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT: v_or_b32_e32 v1, v19, v1
-; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8
; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18
-; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18
+; GFX8-NEXT: v_or_b32_e32 v3, v19, v3
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18
; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5]
; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
@@ -7742,87 +7824,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc
; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14
+; GFX8-NEXT: v_not_b32_e32 v8, v20
; GFX8-NEXT: v_or_b32_e32 v5, v5, v6
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
-; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19
-; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5]
+; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7]
+; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v8, v8, v10
; GFX8-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX8-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v22, v1
; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
; GFX8-NEXT: v_or_b32_e32 v5, v17, v5
; GFX8-NEXT: v_or_b32_e32 v6, v18, v6
-; GFX8-NEXT: v_or_b32_e32 v7, v20, v7
+; GFX8-NEXT: v_or_b32_e32 v7, v19, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX9-NEXT: v_not_b32_e32 v16, v16
-; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23
-; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23
-; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16
+; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19
+; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1]
-; GFX9-NEXT: v_or_b32_e32 v16, v16, v18
-; GFX9-NEXT: v_or_b32_e32 v17, v17, v19
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v1, v17, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc
+; GFX9-NEXT: v_or_b32_e32 v17, v17, v21
+; GFX9-NEXT: v_or_b32_e32 v18, v18, v22
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v16, v3, vcc
+; GFX9-NEXT: v_not_b32_e32 v8, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11]
+; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8
; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1
-; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24
-; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23
+; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
+; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GFX9-NEXT: v_or_b32_e32 v0, v18, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v2, v18, v2
; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT: v_not_b32_e32 v8, v20
-; GFX9-NEXT: v_or_b32_e32 v1, v19, v1
-; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18
-; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18
+; GFX9-NEXT: v_or_b32_e32 v3, v19, v3
+; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18
; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5]
; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
@@ -7833,89 +7915,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13]
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v8, v7, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc
+; GFX9-NEXT: v_not_b32_e32 v8, v20
; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5
-; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19
-; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15]
+; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14
+; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7]
+; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v8, v8, v10
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc
-; GFX9-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v22, v3
+; GFX9-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX9-NEXT: v_or_b32_e32 v1, v22, v1
; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
; GFX9-NEXT: v_or_b32_e32 v5, v17, v5
; GFX9-NEXT: v_or_b32_e32 v6, v18, v6
-; GFX9-NEXT: v_or_b32_e32 v7, v20, v7
+; GFX9-NEXT: v_or_b32_e32 v7, v19, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshl_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX10-NEXT: v_not_b32_e32 v16, v16
+; GFX10-NEXT: v_not_b32_e32 v21, v16
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27
-; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16
+; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21
; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3]
; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28
; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27
-; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28
; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
; GFX10-NEXT: v_or_b32_e32 v18, v16, v18
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28
; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v19, v17, v19
-; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28
-; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v23, v23, v25
; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v0, v24, v26
-; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27
+; GFX10-NEXT: v_or_b32_e32 v24, v24, v26
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
+; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v23, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v19, v3, s4
; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5
+; GFX10-NEXT: v_not_b32_e32 v16, v20
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo
; GFX10-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX10-NEXT: v_not_b32_e32 v3, v20
; GFX10-NEXT: v_or_b32_e32 v1, v22, v8
; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
-; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v24
+; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v16
; GFX10-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3
; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7]
-; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24
-; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5]
+; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5]
; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22
-; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24
; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24
; GFX10-NEXT: v_or_b32_e32 v12, v10, v12
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22
@@ -7953,88 +8037,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16
-; GFX11-NEXT: v_not_b32_e32 v16, v16
+; GFX11-NEXT: v_not_b32_e32 v21, v16
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27
-; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21
+; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11]
-; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22
+; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo
; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27
; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3]
-; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28
; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27
-; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28
-; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1]
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v19, v17, v19
; GFX11-NEXT: v_or_b32_e32 v18, v16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18
+; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28
; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28
-; GFX11-NEXT: v_or_b32_e32 v19, v17, v19
-; GFX11-NEXT: v_or_b32_e32 v23, v23, v25
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9]
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
+; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11]
; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v0, v24, v26
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27
+; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v25
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v26
+; GFX11-NEXT: v_dual_cndmask_b32 v25, 0, v1 :: v_dual_cndmask_b32 v16, v16, v23
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v23, v19, v3, s0
; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11]
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1
-; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v24
-; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1
+; GFX11-NEXT: v_not_b32_e32 v16, v20
+; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo
+; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24
; GFX11-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-NEXT: v_not_b32_e32 v3, v20
; GFX11-NEXT: v_or_b32_e32 v1, v22, v8
; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13]
+; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v16
; GFX11-NEXT: v_or_b32_e32 v2, v2, v10
-; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5]
; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7]
; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5]
-; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24
-; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3
-; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5]
; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9
; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15]
-; GFX11-NEXT: v_or_b32_e32 v12, v10, v12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
-; GFX11-NEXT: v_or_b32_e32 v5, v11, v13
-; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo
; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22
+; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24
+; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22
+; GFX11-NEXT: v_or_b32_e32 v12, v10, v12
; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22
; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9]
-; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22
-; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo
; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
+; GFX11-NEXT: v_or_b32_e32 v5, v11, v13
; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15]
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15]
+; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22
-; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24
; GFX11-NEXT: v_or_b32_e32 v16, v18, v20
; GFX11-NEXT: v_or_b32_e32 v18, v19, v21
+; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15]
; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2
; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24
; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2
; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2
; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0
; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 8538dcabca924b..58304d2072d7f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -347,49 +347,57 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshr_i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_andn2_b32 s3, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
-; GFX8-NEXT: s_lshr_b32 s1, s1, s3
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s3
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i8:
; GFX9: ; %bb.0:
+; GFX9-NEXT: s_andn2_b32 s3, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s1, s1, s3
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s3
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_andn2_b32 s3, 7, s2
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i8:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_not1_b32 s3, 7, s2
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_lshl_b32 s0, s0, s2
-; GFX11-NEXT: s_lshr_b32 s1, s1, s3
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s3
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -414,33 +422,33 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX8-LABEL: v_fshr_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshr_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_not_b32_e32 v3, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
@@ -451,9 +459,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX11-LABEL: v_fshr_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_not_b32_e32 v3, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
@@ -687,25 +695,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX8-LABEL: s_fshr_v2i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s3, s0, 8
; GFX8-NEXT: s_lshr_b32 s4, s1, 8
; GFX8-NEXT: s_lshr_b32 s5, s2, 8
-; GFX8-NEXT: s_and_b32 s6, s2, 7
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_andn2_b32 s6, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_lshr_b32 s3, s0, 8
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s6, 0xffff, s6
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s6
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_andn2_b32 s2, 7, s5
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s1, s1, s6
-; GFX8-NEXT: s_lshl_b32 s2, s3, s2
-; GFX8-NEXT: s_and_b32 s3, s4, 0xff
; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s5, 7
+; GFX8-NEXT: s_lshl_b32 s1, s3, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s1, s1, s2
+; GFX8-NEXT: s_and_b32 s2, s5, 7
+; GFX8-NEXT: s_and_b32 s3, s4, 0xff
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshr_b32 s1, s3, s1
-; GFX8-NEXT: s_or_b32 s1, s2, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshr_b32 s2, s3, s2
+; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
@@ -714,25 +726,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX9-LABEL: s_fshr_v2i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s3, s0, 8
; GFX9-NEXT: s_lshr_b32 s4, s1, 8
; GFX9-NEXT: s_lshr_b32 s5, s2, 8
-; GFX9-NEXT: s_and_b32 s6, s2, 7
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_andn2_b32 s6, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_lshr_b32 s3, s0, 8
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_and_b32 s6, 0xffff, s6
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s6
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_andn2_b32 s2, 7, s5
-; GFX9-NEXT: s_lshl_b32 s3, s3, 1
-; GFX9-NEXT: s_lshr_b32 s1, s1, s6
-; GFX9-NEXT: s_lshl_b32 s2, s3, s2
-; GFX9-NEXT: s_and_b32 s3, s4, 0xff
; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s5, 7
+; GFX9-NEXT: s_lshl_b32 s1, s3, 1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, s2
+; GFX9-NEXT: s_and_b32 s2, s5, 7
+; GFX9-NEXT: s_and_b32 s3, s4, 0xff
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_lshr_b32 s1, s3, s1
-; GFX9-NEXT: s_or_b32 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshr_b32 s2, s3, s2
+; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
@@ -741,24 +757,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX10-LABEL: s_fshr_v2i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s4, s1, 8
+; GFX10-NEXT: s_andn2_b32 s5, 7, s2
; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s5, s2, 8
-; GFX10-NEXT: s_and_b32 s6, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_lshr_b32 s4, s1, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_lshr_b32 s6, s2, 8
+; GFX10-NEXT: s_lshl_b32 s0, s0, s5
+; GFX10-NEXT: s_andn2_b32 s5, 7, s6
; GFX10-NEXT: s_and_b32 s4, s4, 0xff
+; GFX10-NEXT: s_and_b32 s6, s6, 7
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_and_b32 s2, s5, 7
-; GFX10-NEXT: s_andn2_b32 s5, 7, s5
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_lshl_b32 s3, s3, 1
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_lshl_b32 s3, s3, s5
-; GFX10-NEXT: s_lshr_b32 s2, s4, s2
-; GFX10-NEXT: s_lshr_b32 s1, s1, s6
-; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_lshr_b32 s4, s4, s6
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_or_b32 s2, s3, s4
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s1, s2, 0xff
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
@@ -768,24 +788,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
;
; GFX11-LABEL: s_fshr_v2i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s4, s1, 8
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s5, s2, 8
-; GFX11-NEXT: s_and_b32 s6, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_lshr_b32 s4, s1, 8
; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_lshr_b32 s6, s2, 8
+; GFX11-NEXT: s_lshl_b32 s0, s0, s5
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s6
; GFX11-NEXT: s_and_b32 s4, s4, 0xff
+; GFX11-NEXT: s_and_b32 s6, s6, 7
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, s2
-; GFX11-NEXT: s_and_b32 s2, s5, 7
-; GFX11-NEXT: s_and_not1_b32 s5, 7, s5
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_lshl_b32 s3, s3, 1
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_lshl_b32 s3, s3, s5
-; GFX11-NEXT: s_lshr_b32 s2, s4, s2
-; GFX11-NEXT: s_lshr_b32 s1, s1, s6
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_lshr_b32 s4, s4, s6
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s2, s3, s4
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_and_b32 s1, s2, 0xff
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
@@ -832,23 +856,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX8-LABEL: v_fshr_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
+; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_not_b32_e32 v2, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_e32 v1, 7, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v3
; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v5
+; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -857,23 +881,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX9-LABEL: v_fshr_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
+; GFX9-NEXT: v_xor_b32_e32 v6, -1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_not_b32_e32 v2, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT: v_and_b32_e32 v1, 7, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v3
; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v5
+; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -885,20 +909,20 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v2
-; GFX10-NEXT: v_not_b32_e32 v2, v2
-; GFX10-NEXT: v_not_b32_e32 v7, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
-; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4
-; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1
-; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
+; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
+; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
+; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
@@ -912,22 +936,22 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT: v_and_b32_e32 v7, 7, v2
-; GFX11-NEXT: v_not_b32_e32 v2, v2
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3
; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: v_and_b32_e32 v6, 7, v6
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4
-; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1
+; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0
+; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0
; GFX11-NEXT: v_or_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
@@ -997,50 +1021,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX8-LABEL: s_fshr_v4i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s3, s0, 8
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_lshr_b32 s5, s0, 24
; GFX8-NEXT: s_lshr_b32 s6, s1, 8
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_lshr_b32 s8, s1, 24
; GFX8-NEXT: s_lshr_b32 s9, s2, 8
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_lshr_b32 s11, s2, 24
-; GFX8-NEXT: s_and_b32 s12, s2, 7
-; GFX8-NEXT: s_andn2_b32 s2, 7, s2
-; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_andn2_b32 s12, 7, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 7
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_lshr_b32 s3, s0, 8
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NEXT: s_lshr_b32 s5, s0, 24
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_and_b32 s12, 0xffff, s12
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s12
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_andn2_b32 s2, 7, s9
-; GFX8-NEXT: s_lshl_b32 s3, s3, 1
-; GFX8-NEXT: s_lshr_b32 s1, s1, s12
-; GFX8-NEXT: s_lshl_b32 s2, s3, s2
-; GFX8-NEXT: s_and_b32 s3, s6, 0xff
; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: s_and_b32 s1, s9, 7
+; GFX8-NEXT: s_lshl_b32 s1, s3, 1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s1, s1, s2
+; GFX8-NEXT: s_and_b32 s2, s9, 7
+; GFX8-NEXT: s_and_b32 s3, s6, 0xff
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT: s_lshr_b32 s1, s3, s1
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshr_b32 s2, s3, s2
; GFX8-NEXT: s_andn2_b32 s3, 7, s10
-; GFX8-NEXT: s_lshl_b32 s4, s4, 1
-; GFX8-NEXT: s_lshl_b32 s3, s4, s3
+; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s2, s4, 1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, s3
+; GFX8-NEXT: s_and_b32 s3, s10, 7
; GFX8-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NEXT: s_or_b32 s1, s2, s1
-; GFX8-NEXT: s_and_b32 s2, s10, 7
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX8-NEXT: s_lshr_b32 s2, s4, s2
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: s_and_b32 s3, s11, 7
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s3, s4, s3
; GFX8-NEXT: s_andn2_b32 s4, 7, s11
-; GFX8-NEXT: s_lshl_b32 s5, s5, 1
+; GFX8-NEXT: s_or_b32 s2, s2, s3
+; GFX8-NEXT: s_lshl_b32 s3, s5, 1
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_lshl_b32 s3, s3, s4
+; GFX8-NEXT: s_and_b32 s4, s11, 7
+; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
-; GFX8-NEXT: s_lshl_b32 s4, s5, s4
-; GFX8-NEXT: s_lshr_b32 s3, s8, s3
+; GFX8-NEXT: s_lshr_b32 s4, s8, s4
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NEXT: s_or_b32 s3, s4, s3
+; GFX8-NEXT: s_or_b32 s3, s3, s4
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s1, s3, 0xff
@@ -1050,50 +1082,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
;
; GFX9-LABEL: s_fshr_v4i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s3, s0, 8
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s5, s0, 24
; GFX9-NEXT: s_lshr_b32 s6, s1, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_lshr_b32 s8, s1, 24
; GFX9-NEXT: s_lshr_b32 s9, s2, 8
; GFX9-NEXT: s_lshr_b32 s10, s2, 16
; GFX9-NEXT: s_lshr_b32 s11, s2, 24
-; GFX9-NEXT: s_and_b32 s12, s2, 7
-; GFX9-NEXT: s_andn2_b32 s2, 7, s2
-; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_andn2_b32 s12, 7, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 7
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_lshr_b32 s3, s0, 8
+; GFX9-NEXT: s_lshr_b32 s4, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s0, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_and_b32 s12, 0xffff, s12
; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshl_b32 s0, s0, s12
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_andn2_b32 s2, 7, s9
-; GFX9-NEXT: s_lshl_b32 s3, s3, 1
-; GFX9-NEXT: s_lshr_b32 s1, s1, s12
-; GFX9-NEXT: s_lshl_b32 s2, s3, s2
-; GFX9-NEXT: s_and_b32 s3, s6, 0xff
; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: s_and_b32 s1, s9, 7
+; GFX9-NEXT: s_lshl_b32 s1, s3, 1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, s2
+; GFX9-NEXT: s_and_b32 s2, s9, 7
+; GFX9-NEXT: s_and_b32 s3, s6, 0xff
; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT: s_lshr_b32 s1, s3, s1
+; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT: s_lshr_b32 s2, s3, s2
; GFX9-NEXT: s_andn2_b32 s3, 7, s10
-; GFX9-NEXT: s_lshl_b32 s4, s4, 1
-; GFX9-NEXT: s_lshl_b32 s3, s4, s3
+; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_lshl_b32 s2, s4, 1
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_lshl_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s3, s10, 7
; GFX9-NEXT: s_and_b32 s4, s7, 0xff
-; GFX9-NEXT: s_or_b32 s1, s2, s1
-; GFX9-NEXT: s_and_b32 s2, s10, 7
; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
-; GFX9-NEXT: s_lshr_b32 s2, s4, s2
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_or_b32 s2, s3, s2
-; GFX9-NEXT: s_and_b32 s3, s11, 7
+; GFX9-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_lshr_b32 s3, s4, s3
; GFX9-NEXT: s_andn2_b32 s4, 7, s11
-; GFX9-NEXT: s_lshl_b32 s5, s5, 1
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s3, s5, 1
+; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX9-NEXT: s_lshl_b32 s3, s3, s4
+; GFX9-NEXT: s_and_b32 s4, s11, 7
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s4, 0xffff, s4
; GFX9-NEXT: s_and_b32 s0, s0, 0xff
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
-; GFX9-NEXT: s_lshl_b32 s4, s5, s4
-; GFX9-NEXT: s_lshr_b32 s3, s8, s3
+; GFX9-NEXT: s_lshr_b32 s4, s8, s4
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s2, 0xff
-; GFX9-NEXT: s_or_b32 s3, s4, s3
+; GFX9-NEXT: s_or_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s1, s1, 16
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s1, s3, 0xff
@@ -1104,43 +1144,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX10-LABEL: s_fshr_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s6, s1, 8
-; GFX10-NEXT: s_lshr_b32 s3, s0, 8
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_lshr_b32 s7, s1, 16
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_lshr_b32 s9, s2, 8
; GFX10-NEXT: s_lshr_b32 s10, s2, 16
; GFX10-NEXT: s_lshr_b32 s11, s2, 24
-; GFX10-NEXT: s_and_b32 s12, s2, 7
-; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_andn2_b32 s12, 7, s2
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 1
-; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_and_b32 s2, s2, 7
; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2
-; GFX10-NEXT: s_and_b32 s2, s9, 7
-; GFX10-NEXT: s_andn2_b32 s9, 7, s9
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_lshr_b32 s3, s0, 8
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
+; GFX10-NEXT: s_andn2_b32 s2, 7, s9
+; GFX10-NEXT: s_and_b32 s6, s6, 0xff
+; GFX10-NEXT: s_and_b32 s9, s9, 7
+; GFX10-NEXT: s_lshr_b32 s4, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s0, 24
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_and_b32 s12, 0xffff, s12
; GFX10-NEXT: s_lshl_b32 s3, s3, 1
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_lshr_b32 s1, s1, s12
-; GFX10-NEXT: s_lshl_b32 s3, s3, s9
-; GFX10-NEXT: s_lshr_b32 s2, s6, s2
-; GFX10-NEXT: s_and_b32 s6, s7, 0xff
+; GFX10-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX10-NEXT: s_lshl_b32 s0, s0, s12
+; GFX10-NEXT: s_lshl_b32 s2, s3, s2
+; GFX10-NEXT: s_lshr_b32 s3, s6, s9
; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: s_or_b32 s1, s3, s2
-; GFX10-NEXT: s_and_b32 s2, s10, 7
-; GFX10-NEXT: s_andn2_b32 s3, 7, s10
-; GFX10-NEXT: s_lshl_b32 s4, s4, 1
+; GFX10-NEXT: s_or_b32 s1, s2, s3
+; GFX10-NEXT: s_andn2_b32 s2, 7, s10
+; GFX10-NEXT: s_lshl_b32 s3, s4, 1
+; GFX10-NEXT: s_and_b32 s4, s7, 0xff
+; GFX10-NEXT: s_and_b32 s6, s10, 7
+; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT: s_lshl_b32 s3, s4, s3
-; GFX10-NEXT: s_lshr_b32 s2, s6, s2
-; GFX10-NEXT: s_andn2_b32 s4, 7, s11
-; GFX10-NEXT: s_lshl_b32 s5, s5, 1
+; GFX10-NEXT: s_lshl_b32 s2, s3, s2
+; GFX10-NEXT: s_lshr_b32 s3, s4, s6
+; GFX10-NEXT: s_lshl_b32 s4, s5, 1
+; GFX10-NEXT: s_andn2_b32 s5, 7, s11
; GFX10-NEXT: s_and_b32 s6, s11, 7
-; GFX10-NEXT: s_lshl_b32 s4, s5, s4
+; GFX10-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT: s_lshl_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s5, s8, s6
-; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xff
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
@@ -1157,43 +1205,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
; GFX11-LABEL: s_fshr_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
-; GFX11-NEXT: s_lshr_b32 s3, s0, 8
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 24
; GFX11-NEXT: s_lshr_b32 s7, s1, 16
; GFX11-NEXT: s_lshr_b32 s8, s1, 24
; GFX11-NEXT: s_lshr_b32 s9, s2, 8
; GFX11-NEXT: s_lshr_b32 s10, s2, 16
; GFX11-NEXT: s_lshr_b32 s11, s2, 24
-; GFX11-NEXT: s_and_b32 s12, s2, 7
-; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_not1_b32 s12, 7, s2
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
-; GFX11-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_and_b32 s2, s2, 7
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: s_lshl_b32 s0, s0, s2
-; GFX11-NEXT: s_and_b32 s2, s9, 7
-; GFX11-NEXT: s_and_not1_b32 s9, 7, s9
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_lshr_b32 s3, s0, 8
+; GFX11-NEXT: s_lshr_b32 s1, s1, s2
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s9
+; GFX11-NEXT: s_and_b32 s6, s6, 0xff
+; GFX11-NEXT: s_and_b32 s9, s9, 7
+; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_lshr_b32 s5, s0, 24
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_and_b32 s12, 0xffff, s12
; GFX11-NEXT: s_lshl_b32 s3, s3, 1
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_lshr_b32 s1, s1, s12
-; GFX11-NEXT: s_lshl_b32 s3, s3, s9
-; GFX11-NEXT: s_lshr_b32 s2, s6, s2
-; GFX11-NEXT: s_and_b32 s6, s7, 0xff
+; GFX11-NEXT: s_and_b32 s9, 0xffff, s9
+; GFX11-NEXT: s_lshl_b32 s0, s0, s12
+; GFX11-NEXT: s_lshl_b32 s2, s3, s2
+; GFX11-NEXT: s_lshr_b32 s3, s6, s9
; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: s_or_b32 s1, s3, s2
-; GFX11-NEXT: s_and_b32 s2, s10, 7
-; GFX11-NEXT: s_and_not1_b32 s3, 7, s10
-; GFX11-NEXT: s_lshl_b32 s4, s4, 1
+; GFX11-NEXT: s_or_b32 s1, s2, s3
+; GFX11-NEXT: s_and_not1_b32 s2, 7, s10
+; GFX11-NEXT: s_lshl_b32 s3, s4, 1
+; GFX11-NEXT: s_and_b32 s4, s7, 0xff
+; GFX11-NEXT: s_and_b32 s6, s10, 7
+; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s4, 0xffff, s4
; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT: s_lshl_b32 s3, s4, s3
-; GFX11-NEXT: s_lshr_b32 s2, s6, s2
-; GFX11-NEXT: s_and_not1_b32 s4, 7, s11
-; GFX11-NEXT: s_lshl_b32 s5, s5, 1
+; GFX11-NEXT: s_lshl_b32 s2, s3, s2
+; GFX11-NEXT: s_lshr_b32 s3, s4, s6
+; GFX11-NEXT: s_lshl_b32 s4, s5, 1
+; GFX11-NEXT: s_and_not1_b32 s5, 7, s11
; GFX11-NEXT: s_and_b32 s6, s11, 7
-; GFX11-NEXT: s_lshl_b32 s4, s5, s4
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT: s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT: s_lshl_b32 s4, s4, s5
; GFX11-NEXT: s_lshr_b32 s5, s8, s6
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_or_b32 s3, s4, s5
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
@@ -1272,40 +1328,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-LABEL: v_fshr_v4i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_not_b32_e32 v7, v2
-; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0
; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8
-; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX8-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX8-NEXT: v_and_b32_e32 v7, 7, v5
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3
+; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_mov_b32_e32 v7, -1
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, 7
+; GFX8-NEXT: v_mov_b32_e32 v4, 1
+; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_and_b32_e32 v9, 7, v9
; GFX8-NEXT: v_mov_b32_e32 v8, 0xff
-; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mov_b32_e32 v9, 1
-; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b16_e32 v5, v9, v5
+; GFX8-NEXT: v_mov_b32_e32 v9, 7
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX8-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, v10, v8
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v8
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -1321,40 +1378,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-LABEL: v_fshr_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_not_b32_e32 v7, v2
-; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v2
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, 1, v0
; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v7, v6
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
-; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
-; GFX9-NEXT: v_not_b32_e32 v5, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3
-; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3
+; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_mov_b32_e32 v7, -1
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, 7
-; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-NEXT: v_mov_b32_e32 v4, 1
+; GFX9-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v9, 7, v9
; GFX9-NEXT: v_mov_b32_e32 v8, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10
-; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v5, v9, v5
+; GFX9-NEXT: v_mov_b32_e32 v9, 7
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v11, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 7, v4
+; GFX9-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b16_e32 v10, v10, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v10
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -1371,45 +1429,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX10-NEXT: v_not_b32_e32 v8, v2
+; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX10-NEXT: v_not_b32_e32 v10, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT: v_mov_b32_e32 v3, 7
; GFX10-NEXT: v_and_b32_e32 v10, 7, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1
+; GFX10-NEXT: v_mov_b32_e32 v14, 0xff
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v9
; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4
-; GFX10-NEXT: v_mov_b32_e32 v10, 0xff
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1
+; GFX10-NEXT: v_xor_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v10, 7
+; GFX10-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v12, 7, v2
; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9
-; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
-; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_e32 v10, 7, v14
+; GFX10-NEXT: v_and_b32_e32 v9, 7, v9
+; GFX10-NEXT: v_and_b32_sdwa v15, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9
-; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6
+; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8
+; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1
-; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7
+; GFX10-NEXT: v_lshlrev_b16 v3, v3, v7
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11
; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13
-; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, 8
-; GFX10-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, 8
+; GFX10-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX10-NEXT: v_or_b32_e32 v2, v3, v2
; GFX10-NEXT: v_or_b32_e32 v0, v0, v7
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3
@@ -1427,29 +1486,29 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT: v_not_b32_e32 v12, v7
+; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7
; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v12, 7, v12
; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3
-; GFX11-NEXT: v_not_b32_e32 v14, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 7, v12
+; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11
; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6
-; GFX11-NEXT: v_not_b32_e32 v7, v13
+; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1
-; GFX11-NEXT: v_not_b32_e32 v10, v2
+; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2
; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3
-; GFX11-NEXT: v_and_b32_e32 v11, 7, v11
-; GFX11-NEXT: v_and_b32_e32 v12, 7, v14
; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4
+; GFX11-NEXT: v_and_b32_e32 v12, 7, v14
+; GFX11-NEXT: v_and_b32_e32 v11, 7, v11
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5
+; GFX11-NEXT: v_and_b32_e32 v7, 7, v7
; GFX11-NEXT: v_and_b32_e32 v13, 7, v13
-; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT: v_and_b32_e32 v10, 7, v10
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v10, 7, v10
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT: v_or_b32_e32 v3, v3, v6
; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4
@@ -5112,51 +5171,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) {
; GFX6-LABEL: s_fshr_i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63
-; GFX6-NEXT: s_andn2_b64 s[4:5], 63, s[4:5]
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
-; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
+; GFX6-NEXT: s_not_b32 s5, s4
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63
-; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
-; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
+; GFX8-NEXT: s_not_b32 s5, s4
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63
-; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
+; GFX9-NEXT: s_not_b32 s5, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5]
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6
+; GFX10-NEXT: s_not_b32 s5, s4
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5]
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6
+; GFX11-NEXT: s_not_b32 s5, s4
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: ; return to shader part epilog
@@ -5233,12 +5287,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX6-LABEL: v_fshr_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX6-NEXT: v_not_b32_e32 v4, v4
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: v_not_b32_e32 v5, v4
+; GFX6-NEXT: v_and_b32_e32 v5, 63, v5
; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -5246,12 +5300,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX8-LABEL: v_fshr_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: v_not_b32_e32 v5, v4
+; GFX8-NEXT: v_and_b32_e32 v5, 63, v5
; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -5259,12 +5313,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
; GFX9-LABEL: v_fshr_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v5, 63, v4
-; GFX9-NEXT: v_not_b32_e32 v4, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_not_b32_e32 v5, v4
+; GFX9-NEXT: v_and_b32_e32 v5, 63, v5
; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5410,38 +5464,38 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
; GFX6-LABEL: v_fshr_i64_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
-; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: v_not_b32_e32 v1, v0
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0
-; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1
+; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0
+; GFX6-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i64_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i64_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v2, 63, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 63, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3]
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v2, v4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i64_ssv:
@@ -5478,43 +5532,43 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64
define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) {
; GFX6-LABEL: v_fshr_i64_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT: s_not_b32 s3, s2
+; GFX6-NEXT: s_and_b32 s2, s2, 63
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i64_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT: s_not_b32 s3, s2
+; GFX8-NEXT: s_and_b32 s2, s2, 63
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i64_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT: s_not_b32 s3, s2
+; GFX9-NEXT: s_and_b32 s2, s2, 63
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i64_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: s_and_b32 s3, s2, 63
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
+; GFX10-NEXT: s_not_b32 s2, s2
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
@@ -5522,10 +5576,10 @@ define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg
;
; GFX11-LABEL: v_fshr_i64_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3]
-; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX11-NEXT: s_and_b32 s3, s2, 63
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1]
+; GFX11-NEXT: s_not_b32 s2, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -5542,10 +5596,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX6-LABEL: v_fshr_i64_vss:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT: s_andn2_b32 s3, 63, s2
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
; GFX6-NEXT: ; return to shader part epilog
@@ -5553,10 +5606,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX8-LABEL: v_fshr_i64_vss:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1]
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT: s_andn2_b32 s3, 63, s2
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-NEXT: ; return to shader part epilog
@@ -5564,10 +5616,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX9-LABEL: v_fshr_i64_vss:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
-; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1]
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT: s_andn2_b32 s3, 63, s2
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
; GFX9-NEXT: ; return to shader part epilog
@@ -5575,10 +5626,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX10-LABEL: v_fshr_i64_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT: s_andn2_b32 s3, 63, s2
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -5586,13 +5636,12 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
; GFX11-LABEL: v_fshr_i64_vss:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3]
-; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_not1_b32 s3, 63, s2
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1]
; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, s1, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
@@ -5603,63 +5652,55 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg
define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX6-NEXT: s_not_b32 s9, s8
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX6-NEXT: s_not_b32 s4, s10
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX8-NEXT: s_not_b32 s9, s8
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX8-NEXT: s_not_b32 s4, s10
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX9-NEXT: s_not_b32 s9, s8
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63
-; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX9-NEXT: s_not_b32 s4, s10
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s10
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9]
-; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX10-NEXT: s_not_b32 s9, s8
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT: s_not_b32 s9, s10
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s9
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
@@ -5667,15 +5708,13 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
;
; GFX11-LABEL: s_fshr_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9]
-; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11]
+; GFX11-NEXT: s_not_b32 s9, s8
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX11-NEXT: s_not_b32 s9, s10
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s9
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
@@ -5688,18 +5727,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX6-LABEL: v_fshr_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: v_not_b32_e32 v9, v8
+; GFX6-NEXT: v_and_b32_e32 v9, 63, v9
; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
-; GFX6-NEXT: v_not_b32_e32 v8, v10
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_not_b32_e32 v4, v10
+; GFX6-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4
; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5709,18 +5748,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX8-LABEL: v_fshr_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: v_not_b32_e32 v9, v8
+; GFX8-NEXT: v_and_b32_e32 v9, 63, v9
; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX8-NEXT: v_not_b32_e32 v8, v10
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_not_b32_e32 v4, v10
+; GFX8-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5730,18 +5769,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
; GFX9-LABEL: v_fshr_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_not_b32_e32 v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v9
; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
-; GFX9-NEXT: v_not_b32_e32 v8, v10
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_not_b32_e32 v4, v10
+; GFX9-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
-; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
@@ -5800,231 +5839,237 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: s_fshr_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_sub_i32 s11, s8, 64
-; GFX6-NEXT: s_sub_i32 s9, 64, s8
-; GFX6-NEXT: s_cmp_lt_u32 s8, 64
-; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX6-NEXT: s_not_b32 s9, s8
+; GFX6-NEXT: s_sub_i32 s16, s2, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s2
+; GFX6-NEXT: s_cmp_lt_u32 s2, 64
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cselect_b32 s18, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX6-NEXT: s_sub_i32 s14, s10, 64
-; GFX6-NEXT: s_sub_i32 s12, 64, s10
-; GFX6-NEXT: s_cmp_lt_u32 s10, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX6-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX6-NEXT: s_sub_i32 s14, s0, 64
+; GFX6-NEXT: s_sub_i32 s12, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s15, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s10, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
-; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s15, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_sub_i32 s11, s8, 64
-; GFX8-NEXT: s_sub_i32 s9, 64, s8
-; GFX8-NEXT: s_cmp_lt_u32 s8, 64
-; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX8-NEXT: s_not_b32 s9, s8
+; GFX8-NEXT: s_sub_i32 s16, s2, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s2
+; GFX8-NEXT: s_cmp_lt_u32 s2, 64
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX8-NEXT: s_cmp_lg_u32 s16, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX8-NEXT: s_sub_i32 s14, s10, 64
-; GFX8-NEXT: s_sub_i32 s12, 64, s10
-; GFX8-NEXT: s_cmp_lt_u32 s10, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX8-NEXT: s_cmp_lg_u32 s18, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX8-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX8-NEXT: s_sub_i32 s14, s0, 64
+; GFX8-NEXT: s_sub_i32 s12, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s15, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s10, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
-; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s15, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_sub_i32 s11, s8, 64
-; GFX9-NEXT: s_sub_i32 s9, 64, s8
-; GFX9-NEXT: s_cmp_lt_u32 s8, 64
-; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8
+; GFX9-NEXT: s_not_b32 s9, s8
+; GFX9-NEXT: s_sub_i32 s16, s2, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s2
+; GFX9-NEXT: s_cmp_lt_u32 s2, 64
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8
-; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
-; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13]
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s18, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12
+; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9
+; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
-; GFX9-NEXT: s_sub_i32 s14, s10, 64
-; GFX9-NEXT: s_sub_i32 s12, 64, s10
-; GFX9-NEXT: s_cmp_lt_u32 s10, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
+; GFX9-NEXT: s_cmp_lg_u32 s18, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11]
+; GFX9-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX9-NEXT: s_sub_i32 s14, s0, 64
+; GFX9-NEXT: s_sub_i32 s12, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s15, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8
; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12
-; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
-; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s15, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9]
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s12, s1, 31
-; GFX10-NEXT: s_mov_b32 s13, 0
+; GFX10-NEXT: s_lshr_b32 s10, s1, 31
+; GFX10-NEXT: s_mov_b32 s11, 0
+; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
-; GFX10-NEXT: s_sub_i32 s11, s8, 64
-; GFX10-NEXT: s_sub_i32 s9, 64, s8
-; GFX10-NEXT: s_cmp_lt_u32 s8, 64
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_not_b32 s14, s8
+; GFX10-NEXT: s_sub_i32 s16, s9, 64
+; GFX10-NEXT: s_sub_i32 s10, 64, s9
+; GFX10-NEXT: s_cmp_lt_u32 s9, 64
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9
-; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
-; GFX10-NEXT: s_cmp_lg_u32 s16, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s9, 0
+; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
+; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
+; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s14, s10, 64
-; GFX10-NEXT: s_sub_i32 s11, 64, s10
-; GFX10-NEXT: s_cmp_lt_u32 s10, 64
+; GFX10-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX10-NEXT: s_sub_i32 s14, s0, 64
+; GFX10-NEXT: s_sub_i32 s9, 64, s0
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s10, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10
-; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s8
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s15, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
-; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_i128:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9]
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s12, s1, 31
-; GFX11-NEXT: s_mov_b32 s13, 0
+; GFX11-NEXT: s_lshr_b32 s10, s1, 31
+; GFX11-NEXT: s_mov_b32 s11, 0
+; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13]
-; GFX11-NEXT: s_sub_i32 s11, s8, 64
-; GFX11-NEXT: s_sub_i32 s9, 64, s8
-; GFX11-NEXT: s_cmp_lt_u32 s8, 64
-; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s8, 0
+; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX11-NEXT: s_not_b32 s14, s8
+; GFX11-NEXT: s_sub_i32 s16, s9, 64
+; GFX11-NEXT: s_sub_i32 s10, 64, s9
+; GFX11-NEXT: s_cmp_lt_u32 s9, 64
; GFX11-NEXT: s_cselect_b32 s17, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9
-; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
-; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11
-; GFX11-NEXT: s_cmp_lg_u32 s16, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s9, 0
+; GFX11-NEXT: s_cselect_b32 s9, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14
+; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14
+; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16
; GFX11-NEXT: s_cmp_lg_u32 s17, 0
+; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s9, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s14, s10, 64
-; GFX11-NEXT: s_sub_i32 s11, 64, s10
-; GFX11-NEXT: s_cmp_lt_u32 s10, 64
+; GFX11-NEXT: s_and_b32 s0, s8, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s14, s0, 64
+; GFX11-NEXT: s_sub_i32 s9, 64, s0
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
; GFX11-NEXT: s_cselect_b32 s15, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10
-; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13]
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s8
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; GFX11-NEXT: s_cmp_lg_u32 s15, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX11-NEXT: s_cmp_lg_u32 s16, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s15, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0
-; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0
+; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1]
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
@@ -6035,29 +6080,29 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX6-NEXT: v_not_b32_e32 v8, v8
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX6-NEXT: v_not_b32_e32 v0, v8
+; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0
+; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15
; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15
-; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15
-; GFX6-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX6-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[8:9], v16
+; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15
+; GFX6-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX6-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14
; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14
; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2
@@ -6074,38 +6119,38 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v10, v0
; GFX6-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX6-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v12, v3
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fshr_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX8-NEXT: v_not_b32_e32 v8, v8
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: v_not_b32_e32 v0, v8
+; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
+; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15
-; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
-; GFX8-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX8-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10]
+; GFX8-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX8-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
@@ -6122,39 +6167,39 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v10, v0
; GFX8-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v12, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
-; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT: v_not_b32_e32 v0, v8
+; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10]
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3]
; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15
-; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v10, v0, v10
-; GFX9-NEXT: v_or_b32_e32 v11, v1, v11
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10]
+; GFX9-NEXT: v_or_b32_e32 v11, v0, v11
+; GFX9-NEXT: v_or_b32_e32 v12, v1, v12
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc
+; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14
@@ -6170,10 +6215,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX9-NEXT: v_or_b32_e32 v0, v10, v0
; GFX9-NEXT: v_or_b32_e32 v1, v13, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v12, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fshr_i128:
@@ -6282,158 +6327,158 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
; GFX6-LABEL: v_fshr_i128_ssv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX6-NEXT: v_not_b32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s9, 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX6-NEXT: v_not_b32_e32 v1, v0
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s8, s1, 31
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0
-; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7
+; GFX6-NEXT: s_lshr_b32 s0, s1, 31
+; GFX6-NEXT: s_mov_b32 s1, 0
+; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7
+; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1
+; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7
; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7
-; GFX6-NEXT: v_lshl_b64 v[4:5], s[10:11], v7
-; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX6-NEXT: v_lshl_b64 v[0:1], s[10:11], v8
+; GFX6-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8
; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_mov_b32_e32 v3, s0
+; GFX6-NEXT: v_mov_b32_e32 v4, s1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6
-; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6
+; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10
; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2
-; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6
+; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11
-; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX6-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v6, v1
; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v9, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_ssv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX8-NEXT: v_not_b32_e32 v0, v0
-; GFX8-NEXT: s_mov_b32 s9, 0
-; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX8-NEXT: v_not_b32_e32 v1, v0
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s8, s1, 31
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX8-NEXT: s_lshr_b32 s0, s1, 31
+; GFX8-NEXT: s_mov_b32 s1, 0
+; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7
+; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11]
-; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11]
+; GFX8-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9]
; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6
+; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v6, v1
; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v9, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_ssv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0
-; GFX9-NEXT: v_not_b32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s9, 0
-; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0
+; GFX9-NEXT: v_not_b32_e32 v1, v0
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s8, s1, 31
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX9-NEXT: s_lshr_b32 s0, s1, 31
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7
+; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11]
-; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9]
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11]
+; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
-; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX9-NEXT: v_or_b32_e32 v1, v6, v1
; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v9, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_ssv:
@@ -6543,40 +6588,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs,
define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_svs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX6-NEXT: s_lshr_b32 s0, s1, 31
; GFX6-NEXT: s_mov_b32 s1, 0
; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX6-NEXT: s_sub_i32 s7, s4, 64
-; GFX6-NEXT: s_sub_i32 s5, 64, s4
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
-; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX6-NEXT: s_not_b32 s5, s4
+; GFX6-NEXT: s_sub_i32 s12, s2, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s2
+; GFX6-NEXT: s_cmp_lt_u32 s2, 64
; GFX6-NEXT: s_cselect_b32 s13, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cselect_b32 s14, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
-; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX6-NEXT: s_sub_i32 s0, s6, 64
-; GFX6-NEXT: s_sub_i32 s1, 64, s6
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
-; GFX6-NEXT: s_cselect_b32 s7, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
-; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX6-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s1, s0, 64
+; GFX6-NEXT: s_sub_i32 s4, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6
-; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0
-; GFX6-NEXT: s_and_b32 s0, 1, s7
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1
+; GFX6-NEXT: s_and_b32 s0, 1, s5
; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6590,46 +6636,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX6-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX6-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s7, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_fshr_i128_svs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX8-NEXT: s_lshr_b32 s0, s1, 31
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT: s_sub_i32 s7, s4, 64
-; GFX8-NEXT: s_sub_i32 s5, 64, s4
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
-; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX8-NEXT: s_not_b32 s5, s4
+; GFX8-NEXT: s_sub_i32 s12, s2, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s2
+; GFX8-NEXT: s_cmp_lt_u32 s2, 64
; GFX8-NEXT: s_cselect_b32 s13, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX8-NEXT: s_cmp_lg_u32 s12, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b32 s14, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX8-NEXT: s_sub_i32 s0, s6, 64
-; GFX8-NEXT: s_sub_i32 s1, 64, s6
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
-; GFX8-NEXT: s_cselect_b32 s7, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u32 s14, 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s1, s0, 64
+; GFX8-NEXT: s_sub_i32 s4, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
+; GFX8-NEXT: s_cselect_b32 s5, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s7
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
+; GFX8-NEXT: s_and_b32 s0, 1, s5
; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6643,46 +6690,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX8-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s7, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: v_fshr_i128_svs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
; GFX9-NEXT: s_lshr_b32 s0, s1, 31
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT: s_sub_i32 s7, s4, 64
-; GFX9-NEXT: s_sub_i32 s5, 64, s4
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
-; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4
+; GFX9-NEXT: s_not_b32 s5, s4
+; GFX9-NEXT: s_sub_i32 s12, s2, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s2
+; GFX9-NEXT: s_cmp_lt_u32 s2, 64
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
-; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s14, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5
+; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
-; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
-; GFX9-NEXT: s_sub_i32 s0, s6, 64
-; GFX9-NEXT: s_sub_i32 s1, 64, s6
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
-; GFX9-NEXT: s_cselect_b32 s7, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
-; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u32 s14, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
+; GFX9-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s1, s0, 64
+; GFX9-NEXT: s_sub_i32 s4, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
+; GFX9-NEXT: s_cselect_b32 s5, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s7
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3]
+; GFX9-NEXT: s_and_b32 s0, 1, s5
; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
@@ -6696,50 +6744,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX9-NEXT: v_or_b32_e32 v2, s4, v2
-; GFX9-NEXT: v_or_b32_e32 v3, s5, v3
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s7, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_fshr_i128_svs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s8, s1, 31
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: s_lshr_b32 s6, s1, 31
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_sub_i32 s7, s4, 64
-; GFX10-NEXT: s_sub_i32 s5, 64, s4
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
-; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_not_b32 s10, s4
+; GFX10-NEXT: s_sub_i32 s12, s5, 64
+; GFX10-NEXT: s_sub_i32 s6, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: s_cselect_b32 s13, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7
-; GFX10-NEXT: s_cmp_lg_u32 s12, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s0, 64, s6
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX10-NEXT: s_sub_i32 s0, s6, 64
-; GFX10-NEXT: s_cmp_lt_u32 s6, 64
-; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s6, 0
+; GFX10-NEXT: s_and_b32 s0, s4, 0x7f
+; GFX10-NEXT: s_sub_i32 s1, 64, s0
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX10-NEXT: s_sub_i32 s1, s0, 64
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
+; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX10-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s1
+; GFX10-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10-NEXT: s_and_b32 s1, 1, s4
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s7
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX10-NEXT: s_and_b32 s0, 1, s5
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo
@@ -6749,64 +6798,65 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX10-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX10-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s9, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_fshr_i128_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5]
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s8, s1, 31
-; GFX11-NEXT: s_mov_b32 s9, 0
+; GFX11-NEXT: s_lshr_b32 s6, s1, 31
+; GFX11-NEXT: s_mov_b32 s7, 0
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_sub_i32 s7, s4, 64
-; GFX11-NEXT: s_sub_i32 s5, 64, s4
-; GFX11-NEXT: s_cmp_lt_u32 s4, 64
-; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX11-NEXT: s_not_b32 s10, s4
+; GFX11-NEXT: s_sub_i32 s12, s5, 64
+; GFX11-NEXT: s_sub_i32 s6, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: s_cselect_b32 s13, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
-; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s5, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s0, 64, s6
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7f
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
-; GFX11-NEXT: s_sub_i32 s0, s6, 64
-; GFX11-NEXT: s_cmp_lt_u32 s6, 64
-; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3]
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 0
+; GFX11-NEXT: s_sub_i32 s1, 64, s0
+; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3]
+; GFX11-NEXT: s_sub_i32 s1, s0, 64
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
+; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3]
+; GFX11-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
-; GFX11-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s1
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_and_b32 s1, 1, s4
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s7
-; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3]
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX11-NEXT: s_and_b32 s0, 1, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v3, s3, v3
-; GFX11-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, s8, v0
+; GFX11-NEXT: v_or_b32_e32 v1, s9, v1
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
%cast.result = bitcast i128 %result to <4 x float>
@@ -6816,51 +6866,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i
define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
; GFX6-LABEL: v_fshr_i128_vss:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: s_sub_i32 s5, s4, 64
-; GFX6-NEXT: s_sub_i32 s7, 64, s4
; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX6-NEXT: s_sub_i32 s6, s5, 64
+; GFX6-NEXT: s_sub_i32 s7, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7
-; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4
-; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4
-; GFX6-NEXT: s_and_b32 s4, 1, s8
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX6-NEXT: s_and_b32 s4, 1, s9
-; GFX6-NEXT: s_sub_i32 s10, s6, 64
-; GFX6-NEXT: s_sub_i32 s8, 64, s6
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s5
; GFX6-NEXT: v_or_b32_e32 v6, v0, v6
; GFX6-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5
-; GFX6-NEXT: s_cmp_lt_u32 s6, 64
-; GFX6-NEXT: s_cselect_b32 s11, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s6, 0
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s6
+; GFX6-NEXT: s_and_b32 s5, 1, s8
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_and_b32 s5, 1, s9
; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX6-NEXT: s_sub_i32 s10, s5, 64
+; GFX6-NEXT: s_sub_i32 s8, 64, s5
+; GFX6-NEXT: s_cmp_lt_u32 s5, 64
+; GFX6-NEXT: s_cselect_b32 s11, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s5, 0
; GFX6-NEXT: s_cselect_b32 s12, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX6-NEXT: v_or_b32_e32 v0, s0, v4
; GFX6-NEXT: v_or_b32_e32 v1, s1, v5
; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6869,51 +6919,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX8-LABEL: v_fshr_i128_vss:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: s_sub_i32 s5, s4, 64
-; GFX8-NEXT: s_sub_i32 s7, 64, s4
; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX8-NEXT: s_sub_i32 s6, s5, 64
+; GFX8-NEXT: s_sub_i32 s7, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
-; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX8-NEXT: s_and_b32 s4, 1, s8
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: s_and_b32 s4, 1, s9
-; GFX8-NEXT: s_sub_i32 s10, s6, 64
-; GFX8-NEXT: s_sub_i32 s8, 64, s6
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5]
; GFX8-NEXT: v_or_b32_e32 v6, v0, v6
; GFX8-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX8-NEXT: s_cmp_lt_u32 s6, 64
-; GFX8-NEXT: s_cselect_b32 s11, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s6, 0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5]
+; GFX8-NEXT: s_and_b32 s5, 1, s8
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_and_b32 s5, 1, s9
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX8-NEXT: s_sub_i32 s10, s5, 64
+; GFX8-NEXT: s_sub_i32 s8, 64, s5
+; GFX8-NEXT: s_cmp_lt_u32 s5, 64
+; GFX8-NEXT: s_cselect_b32 s11, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s5, 0
; GFX8-NEXT: s_cselect_b32 s12, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX8-NEXT: v_or_b32_e32 v0, s0, v4
; GFX8-NEXT: v_or_b32_e32 v1, s1, v5
; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6922,51 +6972,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
;
; GFX9-LABEL: v_fshr_i128_vss:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: s_sub_i32 s5, s4, 64
-; GFX9-NEXT: s_sub_i32 s7, 64, s4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
-; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_andn2_b32 s5, 0x7f, s4
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
+; GFX9-NEXT: s_sub_i32 s6, s5, 64
+; GFX9-NEXT: s_sub_i32 s7, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
-; GFX9-NEXT: s_and_b32 s4, 1, s8
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: s_and_b32 s4, 1, s9
-; GFX9-NEXT: s_sub_i32 s10, s6, 64
-; GFX9-NEXT: s_sub_i32 s8, 64, s6
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5]
; GFX9-NEXT: v_or_b32_e32 v6, v0, v6
; GFX9-NEXT: v_or_b32_e32 v7, v1, v7
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5]
-; GFX9-NEXT: s_cmp_lt_u32 s6, 64
-; GFX9-NEXT: s_cselect_b32 s11, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s6, 0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5]
+; GFX9-NEXT: s_and_b32 s5, 1, s8
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_and_b32 s5, 1, s9
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_and_b32 s5, s4, 0x7f
+; GFX9-NEXT: s_sub_i32 s10, s5, 64
+; GFX9-NEXT: s_sub_i32 s8, 64, s5
+; GFX9-NEXT: s_cmp_lt_u32 s5, 64
+; GFX9-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s5, 0
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6
-; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
-; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
; GFX9-NEXT: v_or_b32_e32 v0, s0, v4
; GFX9-NEXT: v_or_b32_e32 v1, s1, v5
; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
@@ -6978,49 +7028,49 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5]
-; GFX10-NEXT: s_sub_i32 s7, 64, s4
+; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4
+; GFX10-NEXT: s_sub_i32 s6, s5, 64
; GFX10-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX10-NEXT: s_sub_i32 s5, s4, 64
-; GFX10-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10-NEXT: s_sub_i32 s7, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
-; GFX10-NEXT: s_and_b32 s4, 1, s8
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX10-NEXT: s_and_b32 s5, 1, s8
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX10-NEXT: s_and_b32 s5, s4, 0x7f
; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX10-NEXT: s_and_b32 s4, 1, s9
-; GFX10-NEXT: s_sub_i32 s10, s6, 64
-; GFX10-NEXT: s_sub_i32 s7, 64, s6
-; GFX10-NEXT: s_cmp_lt_u32 s6, 64
+; GFX10-NEXT: s_and_b32 s6, 1, s9
+; GFX10-NEXT: s_sub_i32 s10, s5, 64
+; GFX10-NEXT: s_sub_i32 s8, 64, s5
+; GFX10-NEXT: s_cmp_lt_u32 s5, 64
; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s6, 0
+; GFX10-NEXT: s_cmp_eq_u32 s5, 0
; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6
-; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7
-; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6
-; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
; GFX10-NEXT: v_or_b32_e32 v0, s0, v6
-; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
; GFX10-NEXT: v_or_b32_e32 v1, s1, v7
; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
@@ -7031,47 +7081,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5]
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s7, 64, s4
+; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_sub_i32 s6, s5, 64
; GFX11-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX11-NEXT: s_sub_i32 s5, s4, 64
-; GFX11-NEXT: s_cmp_lt_u32 s4, 64
+; GFX11-NEXT: s_sub_i32 s7, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1]
; GFX11-NEXT: s_cselect_b32 s8, 1, 0
-; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3]
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
; GFX11-NEXT: s_cselect_b32 s9, 1, 0
-; GFX11-NEXT: s_and_b32 s4, 1, s8
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1]
+; GFX11-NEXT: s_and_b32 s5, 1, s8
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
+; GFX11-NEXT: s_and_b32 s5, s4, 0x7f
; GFX11-NEXT: v_or_b32_e32 v4, v4, v6
; GFX11-NEXT: v_or_b32_e32 v5, v5, v7
-; GFX11-NEXT: s_and_b32 s4, 1, s9
-; GFX11-NEXT: s_sub_i32 s10, s6, 64
-; GFX11-NEXT: s_sub_i32 s7, 64, s6
-; GFX11-NEXT: s_cmp_lt_u32 s6, 64
+; GFX11-NEXT: s_and_b32 s6, 1, s9
+; GFX11-NEXT: s_sub_i32 s10, s5, 64
+; GFX11-NEXT: s_sub_i32 s8, 64, s5
+; GFX11-NEXT: s_cmp_lt_u32 s5, 64
; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9
; GFX11-NEXT: s_cselect_b32 s11, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 0
+; GFX11-NEXT: s_cmp_eq_u32 s5, 0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6
; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6
-; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7
-; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6
-; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3
-; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
; GFX11-NEXT: v_or_b32_e32 v0, s0, v6
-; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 0
+; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
; GFX11-NEXT: v_or_b32_e32 v1, s1, v7
; GFX11-NEXT: v_or_b32_e32 v2, s2, v2
; GFX11-NEXT: v_or_b32_e32 v3, s3, v3
@@ -7209,435 +7259,447 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) {
; GFX6-LABEL: s_fshr_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX6-NEXT: s_lshr_b32 s24, s1, 31
-; GFX6-NEXT: s_mov_b32 s25, 0
-; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25]
-; GFX6-NEXT: s_sub_i32 s19, s16, 64
-; GFX6-NEXT: s_sub_i32 s17, 64, s16
-; GFX6-NEXT: s_cmp_lt_u32 s16, 64
-; GFX6-NEXT: s_cselect_b32 s24, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, 0
+; GFX6-NEXT: s_lshr_b32 s22, s1, 31
+; GFX6-NEXT: s_mov_b32 s23, 0
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
+; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX6-NEXT: s_not_b32 s17, s16
+; GFX6-NEXT: s_sub_i32 s21, s2, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s2
+; GFX6-NEXT: s_cmp_lt_u32 s2, 64
; GFX6-NEXT: s_cselect_b32 s28, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16
-; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16
-; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19
-; GFX6-NEXT: s_cmp_lg_u32 s24, 0
-; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX6-NEXT: s_cmp_eq_u32 s2, 0
+; GFX6-NEXT: s_cselect_b32 s29, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
+; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
+; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
; GFX6-NEXT: s_cmp_lg_u32 s28, 0
-; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX6-NEXT: s_sub_i32 s24, s18, 64
-; GFX6-NEXT: s_sub_i32 s22, 64, s18
-; GFX6-NEXT: s_cmp_lt_u32 s18, 64
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
+; GFX6-NEXT: s_cmp_lg_u32 s29, 0
+; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
+; GFX6-NEXT: s_and_b32 s0, s16, 0x7f
+; GFX6-NEXT: s_sub_i32 s21, s0, 64
+; GFX6-NEXT: s_sub_i32 s22, 64, s0
+; GFX6-NEXT: s_cmp_lt_u32 s0, 64
; GFX6-NEXT: s_cselect_b32 s26, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, 0
+; GFX6-NEXT: s_cmp_eq_u32 s0, 0
; GFX6-NEXT: s_cselect_b32 s27, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18
-; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18
-; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
-; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
+; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
; GFX6-NEXT: s_cmp_lg_u32 s26, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s27, 0
; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s26, 0
; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX6-NEXT: s_lshr_b32 s24, s5, 31
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25]
-; GFX6-NEXT: s_sub_i32 s9, s10, 64
-; GFX6-NEXT: s_sub_i32 s11, 64, s10
-; GFX6-NEXT: s_cmp_lt_u32 s10, 64
-; GFX6-NEXT: s_cselect_b32 s20, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s10, 0
+; GFX6-NEXT: s_lshr_b32 s22, s5, 31
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
+; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
+; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20
+; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX6-NEXT: s_not_b32 s16, s20
+; GFX6-NEXT: s_sub_i32 s18, s6, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, s6
+; GFX6-NEXT: s_cmp_lt_u32 s6, 64
+; GFX6-NEXT: s_cselect_b32 s19, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s6, 0
; GFX6-NEXT: s_cselect_b32 s21, 1, 0
-; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10
-; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11
-; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
-; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX6-NEXT: s_cmp_lg_u32 s21, 0
-; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX6-NEXT: s_sub_i32 s18, s8, 64
-; GFX6-NEXT: s_sub_i32 s16, 64, s8
-; GFX6-NEXT: s_cmp_lt_u32 s8, 64
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
+; GFX6-NEXT: s_and_b32 s4, s20, 0x7f
+; GFX6-NEXT: s_sub_i32 s18, s4, 64
+; GFX6-NEXT: s_sub_i32 s16, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s20, 1, 0
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8
-; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s21, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
+; GFX6-NEXT: s_cmp_lg_u32 s21, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
+; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fshr_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX8-NEXT: s_lshr_b32 s24, s1, 31
-; GFX8-NEXT: s_mov_b32 s25, 0
-; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25]
-; GFX8-NEXT: s_sub_i32 s19, s16, 64
-; GFX8-NEXT: s_sub_i32 s17, 64, s16
-; GFX8-NEXT: s_cmp_lt_u32 s16, 64
-; GFX8-NEXT: s_cselect_b32 s24, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s16, 0
+; GFX8-NEXT: s_lshr_b32 s22, s1, 31
+; GFX8-NEXT: s_mov_b32 s23, 0
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
+; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX8-NEXT: s_not_b32 s17, s16
+; GFX8-NEXT: s_sub_i32 s21, s2, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s2
+; GFX8-NEXT: s_cmp_lt_u32 s2, 64
; GFX8-NEXT: s_cselect_b32 s28, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16
-; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16
-; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19
-; GFX8-NEXT: s_cmp_lg_u32 s24, 0
-; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b32 s29, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
+; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
+; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
; GFX8-NEXT: s_cmp_lg_u32 s28, 0
-; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX8-NEXT: s_sub_i32 s24, s18, 64
-; GFX8-NEXT: s_sub_i32 s22, 64, s18
-; GFX8-NEXT: s_cmp_lt_u32 s18, 64
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
+; GFX8-NEXT: s_cmp_lg_u32 s29, 0
+; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
+; GFX8-NEXT: s_and_b32 s0, s16, 0x7f
+; GFX8-NEXT: s_sub_i32 s21, s0, 64
+; GFX8-NEXT: s_sub_i32 s22, 64, s0
+; GFX8-NEXT: s_cmp_lt_u32 s0, 64
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s18, 0
+; GFX8-NEXT: s_cmp_eq_u32 s0, 0
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18
-; GFX8-NEXT: s_lshr_b64 s[18:19], s[8:9], s18
-; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
-; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
+; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
; GFX8-NEXT: s_cmp_lg_u32 s26, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s26, 0
; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX8-NEXT: s_lshr_b32 s24, s5, 31
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25]
-; GFX8-NEXT: s_sub_i32 s9, s10, 64
-; GFX8-NEXT: s_sub_i32 s11, 64, s10
-; GFX8-NEXT: s_cmp_lt_u32 s10, 64
-; GFX8-NEXT: s_cselect_b32 s20, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s10, 0
+; GFX8-NEXT: s_lshr_b32 s22, s5, 31
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
+; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
+; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20
+; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX8-NEXT: s_not_b32 s16, s20
+; GFX8-NEXT: s_sub_i32 s18, s6, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, s6
+; GFX8-NEXT: s_cmp_lt_u32 s6, 64
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s6, 0
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
-; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10
-; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11
-; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
-; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9
-; GFX8-NEXT: s_cmp_lg_u32 s20, 0
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
+; GFX8-NEXT: s_cmp_lg_u32 s19, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
-; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX8-NEXT: s_sub_i32 s18, s8, 64
-; GFX8-NEXT: s_sub_i32 s16, 64, s8
-; GFX8-NEXT: s_cmp_lt_u32 s8, 64
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
+; GFX8-NEXT: s_and_b32 s4, s20, 0x7f
+; GFX8-NEXT: s_sub_i32 s18, s4, 64
+; GFX8-NEXT: s_sub_i32 s16, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
-; GFX8-NEXT: s_cmp_eq_u32 s8, 0
-; GFX8-NEXT: s_cselect_b32 s20, 1, 0
-; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8
-; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cselect_b32 s21, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX8-NEXT: s_cmp_lg_u32 s20, 0
-; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
+; GFX8-NEXT: s_cmp_lg_u32 s21, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
+; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fshr_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX9-NEXT: s_lshr_b32 s24, s1, 31
-; GFX9-NEXT: s_mov_b32 s25, 0
-; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25]
-; GFX9-NEXT: s_sub_i32 s19, s16, 64
-; GFX9-NEXT: s_sub_i32 s17, 64, s16
-; GFX9-NEXT: s_cmp_lt_u32 s16, 64
-; GFX9-NEXT: s_cselect_b32 s24, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, 0
+; GFX9-NEXT: s_lshr_b32 s22, s1, 31
+; GFX9-NEXT: s_mov_b32 s23, 0
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23]
+; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16
+; GFX9-NEXT: s_not_b32 s17, s16
+; GFX9-NEXT: s_sub_i32 s21, s2, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s2
+; GFX9-NEXT: s_cmp_lt_u32 s2, 64
; GFX9-NEXT: s_cselect_b32 s28, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16
-; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16
-; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17]
-; GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19
-; GFX9-NEXT: s_cmp_lg_u32 s24, 0
-; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
-; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23]
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s29, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22
+; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17
+; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
+; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21
; GFX9-NEXT: s_cmp_lg_u32 s28, 0
-; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17]
-; GFX9-NEXT: s_sub_i32 s24, s18, 64
-; GFX9-NEXT: s_sub_i32 s22, 64, s18
-; GFX9-NEXT: s_cmp_lt_u32 s18, 64
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19]
+; GFX9-NEXT: s_cmp_lg_u32 s29, 0
+; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19]
+; GFX9-NEXT: s_and_b32 s0, s16, 0x7f
+; GFX9-NEXT: s_sub_i32 s21, s0, 64
+; GFX9-NEXT: s_sub_i32 s22, 64, s0
+; GFX9-NEXT: s_cmp_lt_u32 s0, 64
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s18, 0
+; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18
-; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18
-; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22
-; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23]
-; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16
+; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16
+; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22
+; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25]
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21
; GFX9-NEXT: s_cmp_lg_u32 s26, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s26, 0
; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0
-; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
-; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11]
-; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX9-NEXT: s_lshr_b32 s24, s5, 31
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25]
-; GFX9-NEXT: s_sub_i32 s9, s10, 64
-; GFX9-NEXT: s_sub_i32 s11, 64, s10
-; GFX9-NEXT: s_cmp_lt_u32 s10, 64
-; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, 0
+; GFX9-NEXT: s_lshr_b32 s22, s5, 31
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1
+; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23]
+; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20
+; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11]
+; GFX9-NEXT: s_not_b32 s16, s20
+; GFX9-NEXT: s_sub_i32 s18, s6, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, s6
+; GFX9-NEXT: s_cmp_lt_u32 s6, 64
+; GFX9-NEXT: s_cselect_b32 s19, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s6, 0
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10
-; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11
-; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
-; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11]
-; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9
-; GFX9-NEXT: s_cmp_lg_u32 s20, 0
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17]
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9]
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
-; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11]
-; GFX9-NEXT: s_sub_i32 s18, s8, 64
-; GFX9-NEXT: s_sub_i32 s16, 64, s8
-; GFX9-NEXT: s_cmp_lt_u32 s8, 64
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9]
+; GFX9-NEXT: s_and_b32 s4, s20, 0x7f
+; GFX9-NEXT: s_sub_i32 s18, s4, 64
+; GFX9-NEXT: s_sub_i32 s16, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s8, 0
-; GFX9-NEXT: s_cselect_b32 s20, 1, 0
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8
-; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cselect_b32 s21, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20
; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16
-; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17]
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17]
; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15]
-; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15]
+; GFX9-NEXT: s_cmp_lg_u32 s21, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0
-; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9]
-; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13]
+; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11]
+; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fshr_v2i128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17]
; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX10-NEXT: s_lshr_b32 s22, s1, 31
-; GFX10-NEXT: s_mov_b32 s23, 0
+; GFX10-NEXT: s_lshr_b32 s18, s1, 31
+; GFX10-NEXT: s_mov_b32 s19, 0
+; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16
; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23]
-; GFX10-NEXT: s_sub_i32 s19, s16, 64
-; GFX10-NEXT: s_sub_i32 s17, 64, s16
-; GFX10-NEXT: s_cmp_lt_u32 s16, 64
-; GFX10-NEXT: s_cselect_b32 s22, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s16, 0
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
+; GFX10-NEXT: s_not_b32 s18, s16
+; GFX10-NEXT: s_sub_i32 s21, s17, 64
+; GFX10-NEXT: s_sub_i32 s22, 64, s17
+; GFX10-NEXT: s_cmp_lt_u32 s17, 64
; GFX10-NEXT: s_cselect_b32 s28, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17
-; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16
-; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16
-; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
-; GFX10-NEXT: s_cmp_lg_u32 s22, 0
-; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s17, 0
+; GFX10-NEXT: s_cselect_b32 s17, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
+; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
+; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
+; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
; GFX10-NEXT: s_cmp_lg_u32 s28, 0
+; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX10-NEXT: s_sub_i32 s22, s18, 64
-; GFX10-NEXT: s_sub_i32 s19, 64, s18
-; GFX10-NEXT: s_cmp_lt_u32 s18, 64
+; GFX10-NEXT: s_and_b32 s0, s16, 0x7f
+; GFX10-NEXT: s_sub_i32 s18, s0, 64
+; GFX10-NEXT: s_sub_i32 s17, 64, s0
+; GFX10-NEXT: s_cmp_lt_u32 s0, 64
+; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cselect_b32 s26, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s18, 0
-; GFX10-NEXT: s_cselect_b32 s27, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18
-; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19
-; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22
-; GFX10-NEXT: s_cmp_lg_u32 s26, 0
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
+; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
+; GFX10-NEXT: s_cmp_lg_u32 s21, 0
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX10-NEXT: s_cmp_lg_u32 s27, 0
-; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s26, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0
-; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21]
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX10-NEXT: s_lshr_b32 s22, s5, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1]
+; GFX10-NEXT: s_lshr_b32 s18, s5, 31
+; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20
+; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23]
-; GFX10-NEXT: s_sub_i32 s9, s10, 64
-; GFX10-NEXT: s_sub_i32 s11, 64, s10
-; GFX10-NEXT: s_cmp_lt_u32 s10, 64
-; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_cmp_eq_u32 s10, 0
-; GFX10-NEXT: s_cselect_b32 s21, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11
-; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10
-; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
-; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX10-NEXT: s_cmp_lg_u32 s20, 0
-; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX10-NEXT: s_cmp_lg_u32 s21, 0
-; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
+; GFX10-NEXT: s_not_b32 s16, s20
; GFX10-NEXT: s_sub_i32 s18, s8, 64
; GFX10-NEXT: s_sub_i32 s9, 64, s8
; GFX10-NEXT: s_cmp_lt_u32 s8, 64
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s8, 0
-; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8
-; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9
-; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8
-; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
+; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18
+; GFX10-NEXT: s_cmp_lg_u32 s19, 0
+; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX10-NEXT: s_cmp_lg_u32 s21, 0
+; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT: s_and_b32 s4, s20, 0x7f
+; GFX10-NEXT: s_sub_i32 s18, s4, 64
+; GFX10-NEXT: s_sub_i32 s8, 64, s4
+; GFX10-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10-NEXT: s_cselect_b32 s19, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_cselect_b32 s21, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20
+; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
-; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX10-NEXT: s_cmp_lg_u32 s20, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX10-NEXT: s_cmp_lg_u32 s21, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
-; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshr_v2i128:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f
-; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17]
; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
-; GFX11-NEXT: s_lshr_b32 s22, s1, 31
-; GFX11-NEXT: s_mov_b32 s23, 0
+; GFX11-NEXT: s_lshr_b32 s18, s1, 31
+; GFX11-NEXT: s_mov_b32 s19, 0
+; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
-; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23]
-; GFX11-NEXT: s_sub_i32 s19, s16, 64
-; GFX11-NEXT: s_sub_i32 s17, 64, s16
-; GFX11-NEXT: s_cmp_lt_u32 s16, 64
-; GFX11-NEXT: s_cselect_b32 s22, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s16, 0
+; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19]
+; GFX11-NEXT: s_not_b32 s18, s16
+; GFX11-NEXT: s_sub_i32 s21, s17, 64
+; GFX11-NEXT: s_sub_i32 s22, 64, s17
+; GFX11-NEXT: s_cmp_lt_u32 s17, 64
; GFX11-NEXT: s_cselect_b32 s28, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17
-; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16
-; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16
-; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27]
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19
-; GFX11-NEXT: s_cmp_lg_u32 s22, 0
-; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1]
+; GFX11-NEXT: s_cmp_eq_u32 s17, 0
+; GFX11-NEXT: s_cselect_b32 s17, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22
+; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18
+; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18
+; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21
; GFX11-NEXT: s_cmp_lg_u32 s28, 0
+; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s17, 0
; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1]
-; GFX11-NEXT: s_sub_i32 s22, s18, 64
-; GFX11-NEXT: s_sub_i32 s19, 64, s18
-; GFX11-NEXT: s_cmp_lt_u32 s18, 64
+; GFX11-NEXT: s_and_b32 s0, s16, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s18, s0, 64
+; GFX11-NEXT: s_sub_i32 s17, 64, s0
+; GFX11-NEXT: s_cmp_lt_u32 s0, 64
+; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cselect_b32 s26, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s18, 0
-; GFX11-NEXT: s_cselect_b32 s27, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18
-; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19
-; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18
-; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25]
-; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22
-; GFX11-NEXT: s_cmp_lg_u32 s26, 0
+; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16
+; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17
+; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23]
+; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
+; GFX11-NEXT: s_cmp_lg_u32 s21, 0
; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
-; GFX11-NEXT: s_cmp_lg_u32 s27, 0
-; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX11-NEXT: s_cmp_lg_u32 s26, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0
-; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21]
+; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1
-; GFX11-NEXT: s_lshr_b32 s22, s5, 31
; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f
-; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1]
+; GFX11-NEXT: s_lshr_b32 s18, s5, 31
+; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20
+; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1]
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23]
-; GFX11-NEXT: s_sub_i32 s9, s10, 64
-; GFX11-NEXT: s_sub_i32 s11, 64, s10
-; GFX11-NEXT: s_cmp_lt_u32 s10, 64
-; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s10, 0
-; GFX11-NEXT: s_cselect_b32 s21, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11
-; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10
-; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10
-; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19]
-; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
-; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s21, 0
-; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19]
+; GFX11-NEXT: s_not_b32 s16, s20
; GFX11-NEXT: s_sub_i32 s18, s8, 64
; GFX11-NEXT: s_sub_i32 s9, 64, s8
; GFX11-NEXT: s_cmp_lt_u32 s8, 64
; GFX11-NEXT: s_cselect_b32 s19, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s8, 0
-; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8
-; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9
-; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8
-; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17]
-; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18
+; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9
+; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16
+; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16
+; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18
+; GFX11-NEXT: s_cmp_lg_u32 s19, 0
+; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5]
+; GFX11-NEXT: s_cmp_lg_u32 s21, 0
+; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5]
+; GFX11-NEXT: s_and_b32 s4, s20, 0x7f
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_i32 s18, s4, 64
+; GFX11-NEXT: s_sub_i32 s8, 64, s4
+; GFX11-NEXT: s_cmp_lt_u32 s4, 64
+; GFX11-NEXT: s_cselect_b32 s19, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_cselect_b32 s21, 1, 0
+; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20
+; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8
+; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20
+; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18
; GFX11-NEXT: s_cmp_lg_u32 s19, 0
-; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15]
-; GFX11-NEXT: s_cmp_lg_u32 s20, 0
+; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX11-NEXT: s_cmp_lg_u32 s21, 0
; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5]
; GFX11-NEXT: s_cmp_lg_u32 s19, 0
-; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0
; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
; GFX11-NEXT: ; return to shader part epilog
@@ -7649,68 +7711,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-LABEL: v_fshr_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX6-NEXT: v_not_b32_e32 v16, v16
; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1
+; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0
-; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24
-; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24
-; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24
-; GFX6-NEXT: v_or_b32_e32 v18, v0, v18
-; GFX6-NEXT: v_or_b32_e32 v19, v1, v19
-; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23
-; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23
+; GFX6-NEXT: v_not_b32_e32 v0, v16
+; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0
+; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19
+; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19
+; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19
+; GFX6-NEXT: v_or_b32_e32 v21, v0, v21
+; GFX6-NEXT: v_or_b32_e32 v22, v1, v22
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16
+; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22
; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2
-; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23
+; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22
; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24
-; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX6-NEXT: v_not_b32_e32 v8, v20
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX6-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8
; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22
; GFX6-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19
+; GFX6-NEXT: v_not_b32_e32 v4, v20
+; GFX6-NEXT: v_or_b32_e32 v0, v18, v0
+; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18
; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4
-; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19
-; GFX6-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19
-; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18
+; GFX6-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18
+; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18
; GFX6-NEXT: v_or_b32_e32 v10, v4, v10
; GFX6-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20
-; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20
; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18
@@ -7729,8 +7791,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX6-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX6-NEXT: v_or_b32_e32 v1, v22, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v23, v1
+; GFX6-NEXT: v_or_b32_e32 v3, v21, v3
; GFX6-NEXT: v_or_b32_e32 v4, v16, v4
; GFX6-NEXT: v_or_b32_e32 v5, v17, v5
; GFX6-NEXT: v_or_b32_e32 v6, v10, v6
@@ -7740,68 +7802,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-LABEL: v_fshr_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX8-NEXT: v_not_b32_e32 v16, v16
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX8-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24
-; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17]
-; GFX8-NEXT: v_or_b32_e32 v18, v0, v18
-; GFX8-NEXT: v_or_b32_e32 v19, v1, v19
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX8-NEXT: v_not_b32_e32 v0, v16
+; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
+; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19
+; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18]
+; GFX8-NEXT: v_or_b32_e32 v21, v0, v21
+; GFX8-NEXT: v_or_b32_e32 v22, v1, v22
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23
+; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22
; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX8-NEXT: v_not_b32_e32 v8, v20
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX8-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8
; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11]
; GFX8-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19
+; GFX8-NEXT: v_not_b32_e32 v4, v20
+; GFX8-NEXT: v_or_b32_e32 v0, v18, v0
+; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18
; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX8-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19
-; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18
+; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9]
; GFX8-NEXT: v_or_b32_e32 v10, v4, v10
; GFX8-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20
; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18
@@ -7820,8 +7882,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX8-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v22, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v23, v1
+; GFX8-NEXT: v_or_b32_e32 v3, v21, v3
; GFX8-NEXT: v_or_b32_e32 v4, v16, v4
; GFX8-NEXT: v_or_b32_e32 v5, v17, v5
; GFX8-NEXT: v_or_b32_e32 v6, v10, v6
@@ -7831,68 +7893,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-LABEL: v_fshr_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16
-; GFX9-NEXT: v_not_b32_e32 v16, v16
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX9-NEXT: v_or_b32_e32 v2, v2, v0
-; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17]
-; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3]
-; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24
-; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17]
-; GFX9-NEXT: v_or_b32_e32 v18, v0, v18
-; GFX9-NEXT: v_or_b32_e32 v19, v1, v19
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9]
+; GFX9-NEXT: v_not_b32_e32 v0, v16
+; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18]
+; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3]
+; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19
+; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18]
+; GFX9-NEXT: v_or_b32_e32 v21, v0, v21
+; GFX9-NEXT: v_or_b32_e32 v22, v1, v22
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11]
-; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23
+; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22
; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11]
-; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
-; GFX9-NEXT: v_not_b32_e32 v8, v20
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GFX9-NEXT: v_or_b32_e32 v3, v19, v3
-; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19
+; GFX9-NEXT: v_not_b32_e32 v4, v20
+; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11]
+; GFX9-NEXT: v_or_b32_e32 v0, v18, v0
+; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4
+; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9]
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v2, v18, v2
-; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20
-; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7]
+; GFX9-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9]
; GFX9-NEXT: v_or_b32_e32 v10, v4, v10
; GFX9-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20
; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc
; GFX9-NEXT: v_sub_u32_e32 v6, 64, v18
; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc
@@ -7911,8 +7973,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
-; GFX9-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX9-NEXT: v_or_b32_e32 v1, v22, v1
+; GFX9-NEXT: v_or_b32_e32 v1, v23, v1
+; GFX9-NEXT: v_or_b32_e32 v3, v21, v3
; GFX9-NEXT: v_or_b32_e32 v4, v16, v4
; GFX9-NEXT: v_or_b32_e32 v5, v17, v5
; GFX9-NEXT: v_or_b32_e32 v6, v10, v6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 404e726246f4d2..81abe91b283f96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2787,52 +2787,51 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-LABEL: v_sdiv_v2i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5
+; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5
; CGP-NEXT: v_rcp_f32_e32 v1, v1
-; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0
+; CGP-NEXT: v_rcp_f32_e32 v7, v3
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT: v_rcp_f32_e32 v8, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
-; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0
+; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0
+; CGP-NEXT: v_mov_b32_e32 v0, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6
+; CGP-NEXT: v_mul_lo_u32 v4, v1, v5
+; CGP-NEXT: v_mul_lo_u32 v0, v0, v7
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mov_b32_e32 v5, v1
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v1, v5, v3
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5
; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0
+; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
-; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4
; CGP-NEXT: v_mov_b32_e32 v7, v1
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v4
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_mul_lo_u32 v8, v7, v6
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 3729f1cc2b12d9..183f2edbf9035b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff
; GFX8-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_shl_i32_zext_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_shl_i32_zext_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
%and = and i16 %x, 16383
%ext = zext i16 %and to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 5b94e71ecf52e2..cfac0c2fa56aaf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3286,45 +3286,45 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-LABEL: v_srem_v2i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5
+; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5
; CGP-NEXT: v_rcp_f32_e32 v1, v1
-; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0
+; CGP-NEXT: v_rcp_f32_e32 v7, v3
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT: v_rcp_f32_e32 v8, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
-; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0
+; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0
+; CGP-NEXT: v_mov_b32_e32 v0, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6
+; CGP-NEXT: v_mul_lo_u32 v0, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v4, v1, v5
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
; CGP-NEXT: v_mov_b32_e32 v0, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v1, v6
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index e31d8e95bd6084..1ee521b3dedac1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2147,26 +2147,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
-; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3
-; CGP-NEXT: v_rcp_f32_e32 v4, v2
+; CGP-NEXT: v_rcp_f32_e32 v4, v1
; CGP-NEXT: v_rcp_f32_e32 v5, v3
; CGP-NEXT: v_mul_f32_e32 v4, v0, v4
-; CGP-NEXT: v_mul_f32_e32 v5, v1, v5
+; CGP-NEXT: v_mul_f32_e32 v5, v2, v5
; CGP-NEXT: v_trunc_f32_e32 v4, v4
; CGP-NEXT: v_trunc_f32_e32 v5, v5
-; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0
+; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1
+; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2
+; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3
+; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index f30b278b3e611c..a7e5ce3d216199 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2561,12 +2561,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
-; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
; CGP-NEXT: v_rcp_f32_e32 v8, v5
; CGP-NEXT: v_rcp_f32_e32 v9, v7
@@ -2584,10 +2584,10 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mul_lo_u32 v2, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v1, v4, v1
; CGP-NEXT: v_mul_lo_u32 v3, v5, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1
; CGP-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 9ea9fa91e4f92a..1b35a89ad7f935 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -278,7 +278,6 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) {
;
; GISEL-LABEL: s_csh_64_0:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 63
; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
@@ -310,7 +309,6 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
;
; GISEL-LABEL: s_csh_64_1:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff
; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index a0b549711f339b..93e14a205f05d4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5]
@@ -1837,7 +1837,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
More information about the llvm-commits
mailing list