[llvm] e9caa37 - [DAG] Move lshr narrowing from visitANDLike to SimplifyDemandedBits
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 17 07:50:21 PDT 2023
Author: Simon Pilgrim
Date: 2023-07-17T15:50:09+01:00
New Revision: e9caa37e9c69f6a6e5ab59d33b9d492054819ded
URL: https://github.com/llvm/llvm-project/commit/e9caa37e9c69f6a6e5ab59d33b9d492054819ded
DIFF: https://github.com/llvm/llvm-project/commit/e9caa37e9c69f6a6e5ab59d33b9d492054819ded.diff
LOG: [DAG] Move lshr narrowing from visitANDLike to SimplifyDemandedBits
Inspired by some of the cases from D145468
Let SimplifyDemandedBits handle the narrowing of lshr to half-width when the upper bits aren't demanded, the narrowed shift is profitable, and the zext/trunc are free.
A future patch will propose the equivalent shl narrowing combine.
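For illustration, a minimal sketch of the kind of pattern this enables (hypothetical IR, not taken from the patch's tests), assuming a target where the 32-bit shift is profitable and the trunc/zext are free (e.g. x86-64):

  define i64 @narrow_lshr(i64 %x) {
    %masked = and i64 %x, 4294967295    ; upper 32 bits of %masked are known zero
    %shifted = lshr i64 %masked, 5      ; result fits entirely in the low half
    ret i64 %shifted
  }

SimplifyDemandedBits can now rewrite the i64 lshr as roughly (i64 zero_extend (srl (i32 (trunc i64:%masked)), 5)), so the backend ends up emitting a 32-bit shift (e.g. shrl instead of shrq on x86-64), as seen in the updated tests below.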
Differential Revision: https://reviews.llvm.org/D146121
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AMDGPU/idot4s.ll
llvm/test/CodeGen/AMDGPU/idot4u.ll
llvm/test/CodeGen/AMDGPU/idot8s.ll
llvm/test/CodeGen/AMDGPU/idot8u.ll
llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
llvm/test/CodeGen/AMDGPU/shift-i128.ll
llvm/test/CodeGen/AMDGPU/wave32.ll
llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
llvm/test/CodeGen/X86/3addr-or.ll
llvm/test/CodeGen/X86/and-shift.ll
llvm/test/CodeGen/X86/bswap.ll
llvm/test/CodeGen/X86/combine-bitreverse.ll
llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
llvm/test/CodeGen/X86/extract-bits.ll
llvm/test/CodeGen/X86/h-registers-0.ll
llvm/test/CodeGen/X86/lzcnt-cmp.ll
llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
llvm/test/CodeGen/X86/zext-lshr.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aad4e4a2d3f75a..c88708b935bd13 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6152,55 +6152,6 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
}
}
- // Reduce bit extract of low half of an integer to the narrower type.
- // (and (srl i64:x, K), KMask) ->
- // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
- if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
- if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
- if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- unsigned Size = VT.getSizeInBits();
- const APInt &AndMask = CAnd->getAPIntValue();
- unsigned ShiftBits = CShift->getZExtValue();
-
- // Bail out, this node will probably disappear anyway.
- if (ShiftBits == 0)
- return SDValue();
-
- unsigned MaskBits = AndMask.countr_one();
- EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
-
- if (AndMask.isMask() &&
- // Required bits must not span the two halves of the integer and
- // must fit in the half size type.
- (ShiftBits + MaskBits <= Size / 2) &&
- TLI.isNarrowingProfitable(VT, HalfVT) &&
- TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
- TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
- TLI.isTruncateFree(VT, HalfVT) &&
- TLI.isZExtFree(HalfVT, VT)) {
- // The isNarrowingProfitable is to avoid regressions on PPC and
- // AArch64 which match a few 64-bit bit insert / bit extract patterns
- // on downstream users of this. Those patterns could probably be
- // extended to handle extensions mixed in.
-
- SDValue SL(N0);
- assert(MaskBits <= Size);
-
- // Extracting the highest bit of the low half.
- EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
- N0.getOperand(0));
-
- SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
- SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
- SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
- SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
- return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
- }
- }
- }
- }
-
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0a7a5aa5194a94..8136b751c2779c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1863,6 +1863,27 @@ bool TargetLowering::SimplifyDemandedBits(
if (Op->getFlags().hasExact())
InDemandedMask.setLowBits(ShAmt);
+ // Narrow shift to lower half - similar to ShrinkDemandedOp.
+ // (srl i64:x, K) -> (i64 zero_extend (srl (i32 (trunc i64:x)), K))
+ if ((BitWidth % 2) == 0 && !VT.isVector() &&
+ ((InDemandedMask.countLeadingZeros() >= (BitWidth / 2)) ||
+ TLO.DAG.MaskedValueIsZero(
+ Op0, APInt::getHighBitsSet(BitWidth, BitWidth / 2)))) {
+ EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
+ if (isNarrowingProfitable(VT, HalfVT) &&
+ isTypeDesirableForOp(ISD::SRL, HalfVT) &&
+ isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
+ (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT))) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
+ SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
+ ShAmt, HalfVT, dl, TLO.LegalTypes());
+ SDValue NewShift =
+ TLO.DAG.getNode(ISD::SRL, dl, HalfVT, NewOp, NewShiftAmt);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift));
+ }
+ }
+
// Compute the new bits that are at the top now.
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 6ecde579e8416a..ea22aaee761c8d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -963,21 +963,19 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8
; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8
+; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2
; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8
-; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0
; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_alignbit_b32 v2, 0, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 16
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 03a330d19cbaf7..b7821f8fd6da51 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1850,28 +1850,24 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0
-; GFX7-NEXT: v_alignbit_b32 v2, v4, v2, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GFX7-NEXT: v_alignbit_b32 v0, v7, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v3, 0, v3, 16
-; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0
+; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8
+; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v5, v4, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1
+; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_mad_u32_u24 v0, v7, v8, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index d113564f995856..fcbb3512777071 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2014,48 +2014,48 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4
-; GFX7-NEXT: v_bfe_i32 v7, v2, 4, 4
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX7-NEXT: v_bfe_i32 v14, v0, 4, 4
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
+; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4
; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4
-; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4
-; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2
-; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4
+; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4
+; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2
+; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4
+; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4
+; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4
-; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4
-; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4
-; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0
-; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4
+; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4
+; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4
+; GFX7-NEXT: v_ashrrev_i32_e32 v14, 28, v0
+; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4
+; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4
+; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@@ -2581,12 +2581,10 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24
-; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24
+; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 4a31522f8dce31..3828fa557731e8 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -2444,32 +2444,28 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_and_b32_e32 v9, 15, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_and_b32_e32 v16, 15, v0
; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4
; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4
-; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4
-; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 12, v2
+; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4
+; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4
+; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4
; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4
-; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4
-; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 12, v0
+; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4
+; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4
+; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4
+; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xf000000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xf000000, v0
-; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1
-; GFX7-NEXT: v_alignbit_b32 v2, s10, v2, 24
-; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 24
-; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
+; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 1b23035b82a73b..61017e809c8636 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -478,51 +478,49 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_imad_pat_v3i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v7, 0, v7, 16
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
+; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
+; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v3, 0, v8, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v0, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -738,41 +736,39 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v10, v7, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v9, 0, v9, 16
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v10, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v8, v6, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v6, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v5, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v13
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v9, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v11, v4, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v11, v4, 1
-; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7
; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v0, v4
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v10, v8
+; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6
; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
@@ -783,11 +779,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v8, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10
; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -798,7 +794,7 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX67-SDAG-NEXT: v_alignbit_b32 v5, 0, v0, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
@@ -1395,51 +1391,49 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX67-SDAG-LABEL: clpeak_umad_pat_v3i16:
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v7, 0, v7, 16
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
+; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
+; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
+; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v3, 0, v8, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8
+; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4
; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v0, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -1655,41 +1649,39 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG: ; %bb.0: ; %entry
; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v3
+; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3
+; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2
; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v10, v7, v3
-; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v9, 0, v9, 16
+; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1
+; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v10, v7
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v8, v6, 1
-; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v0
-; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v6, v2
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v5, v1
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v13
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v9, v5
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v11, v4, v0
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v11, v4, 1
-; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13
+; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7
; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v0, v4
+; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5
; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000
-; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v10, v8
+; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8
; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6
; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8
@@ -1700,11 +1692,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v8, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10
; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4
; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -1715,7 +1707,7 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5
; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6
-; GFX67-SDAG-NEXT: v_alignbit_b32 v5, 0, v0, 16
+; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
index 8e54bd51aaa5ca..b46476f34cbe2f 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -155,7 +155,7 @@ define i32 @trunc_srl_i64_25_to_i26(i64 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xa000000, v0
-; GCN-NEXT: v_alignbit_b32 v0, 0, v0, 25
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 25, v0
; GCN-NEXT: v_add_u32_e32 v0, 55, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%value.knownbits2 = and i64 %x, 167772160 ; 0xA000000
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index fb857e484f6ff0..79e96b45a901c4 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1873,38 +1873,38 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
-; NOSDWA-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1]
-; NOSDWA-NEXT: v_and_b32_e32 v6, 0xff, v0
-; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 8, v0
+; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0
+; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; NOSDWA-NEXT: v_and_b32_e32 v5, 0xff, v1
+; NOSDWA-NEXT: v_and_b32_e32 v7, 0xff, v1
; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 24, v1
; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; NOSDWA-NEXT: v_lshlrev_b16_e32 v7, 8, v7
+; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; NOSDWA-NEXT: v_lshlrev_b16_e32 v6, 8, v6
; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0
; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8
; NOSDWA-NEXT: v_lshlrev_b16_e32 v9, 8, v9
; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1
-; NOSDWA-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; NOSDWA-NEXT: v_or_b32_e32 v6, v6, v7
-; NOSDWA-NEXT: v_or_b32_e32 v5, v5, v8
+; NOSDWA-NEXT: v_or_b32_e32 v4, v4, v5
+; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v6
+; NOSDWA-NEXT: v_or_b32_e32 v5, v7, v8
; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v9
-; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4
-; NOSDWA-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v4
; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; NOSDWA-NEXT: v_or_b32_e32 v0, v6, v0
-; NOSDWA-NEXT: v_or_b32_e32 v1, v4, v1
+; NOSDWA-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT: v_or_b32_e32 v0, v4, v0
+; NOSDWA-NEXT: v_or_b32_e32 v1, v5, v1
; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; NOSDWA-NEXT: s_endpgm
;
; GFX89-LABEL: pulled_out_test:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: v_mov_b32_e32 v6, 8
-; GFX89-NEXT: v_mov_b32_e32 v7, 0xff
+; GFX89-NEXT: v_mov_b32_e32 v4, 8
+; GFX89-NEXT: v_mov_b32_e32 v5, 0xff
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s0
; GFX89-NEXT: v_mov_b32_e32 v1, s1
@@ -1912,73 +1912,72 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX89-NEXT: v_mov_b32_e32 v2, s2
; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1]
-; GFX89-NEXT: v_lshrrev_b32_sdwa v8, v6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0
+; GFX89-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: v_lshrrev_b32_e32 v9, 24, v1
-; GFX89-NEXT: v_and_b32_sdwa v5, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX89-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, v9
-; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX89-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX89-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, v7
+; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v9
+; GFX89-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX89-NEXT: s_endpgm
;
; GFX9-LABEL: pulled_out_test:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_movk_i32 s0, 0xff
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
-; GFX9-NEXT: v_lshrrev_b32_sdwa v6, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
+; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT: v_and_b32_sdwa v3, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v6, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7
-; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v5
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: pulled_out_test:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_mov_b32_e32 v5, 8
-; GFX10-NEXT: v_mov_b32_e32 v6, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v7, 24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
+; GFX10-NEXT: v_mov_b32_e32 v4, 24
+; GFX10-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v7, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v6, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
entry:
%idxprom = ashr exact i64 15, 32
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index dd5a59cb1e36e0..4b02d00ddce1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -149,12 +149,11 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-NEXT: s_mov_b64 s[4:5], 0x41
; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: v_mov_b32_e32 v3, 0x41
-; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, 0x41
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -168,11 +167,10 @@ define i128 @v_ashr_i128_kv(i128 %rhs) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 095011efe7a988..2ddde62cdb6fc6 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2489,8 +2489,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: v_alignbit_b32 v0, 0, vcc_lo, 1
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
; GFX1032-NEXT: s_min_u32 s0, s0, s1
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
@@ -2587,9 +2586,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1032-NEXT: v_trunc_f32_e32 v1, v1
; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: v_alignbit_b32 v1, 0, vcc_lo, 1
+; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
; GFX1032-NEXT: s_min_u32 s0, s0, s1
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
diff --git a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
index 4014387677e37d..0d63779227554c 100644
--- a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
+++ b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
@@ -15,10 +15,9 @@ define void @passing2(i64 %str.0, i64 %str.1, i16 signext %s, i32 %j, i8 signex
; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %ah, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: shrq $16, %rsi
-; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: shrq $24, %rax
+; CHECK-NEXT: shrq $16, %rax
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %ah, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
index fa743f26ba2d17..f2a53653ec2ffc 100644
--- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -8,10 +8,10 @@ define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(ptr, i32 %c_nbloc
; CHECK-NEXT: movl %edx, %edx
; CHECK-NEXT: movl (%rdi,%rdx,4), %edx
; CHECK-NEXT: movzbl %dl, %r10d
+; CHECK-NEXT: # kill: def $edx killed $edx def $rdx
+; CHECK-NEXT: shrl $8, %edx
; CHECK-NEXT: addl $4, %r10d
-; CHECK-NEXT: shrq $6, %rdx
-; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC
-; CHECK-NEXT: movl (%rdi,%rdx), %edx
+; CHECK-NEXT: movl (%rdi,%rdx,4), %edx
; CHECK-NEXT: movzbl %dl, %edi
; CHECK-NEXT: shrl $8, %edx
; CHECK-NEXT: addl $5, %esi
diff --git a/llvm/test/CodeGen/X86/3addr-or.ll b/llvm/test/CodeGen/X86/3addr-or.ll
index 72e29b9faf4a6e..65f6d2b4123e8e 100644
--- a/llvm/test/CodeGen/X86/3addr-or.ll
+++ b/llvm/test/CodeGen/X86/3addr-or.ll
@@ -20,12 +20,11 @@ define i32 @test1(i32 %x) nounwind ssp {
define i64 @test2(i8 %A, i8 %B) nounwind {
; CHECK-LABEL: test2:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: shll $4, %edi
; CHECK-NEXT: andl $48, %edi
; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: shrq $4, %rax
-; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: shrl $4, %eax
+; CHECK-NEXT: orl %edi, %eax
; CHECK-NEXT: retq
%C = zext i8 %A to i64
%D = shl i64 %C, 4
diff --git a/llvm/test/CodeGen/X86/and-shift.ll b/llvm/test/CodeGen/X86/and-shift.ll
index e406a28c7f4dbe..42e68cd8aac5fe 100644
--- a/llvm/test/CodeGen/X86/and-shift.ll
+++ b/llvm/test/CodeGen/X86/and-shift.ll
@@ -54,8 +54,8 @@ define i64 @shift30_and2_i64(i64 %x) {
; X64-LABEL: shift30_and2_i64:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $30, %rax
-; X64-NEXT: andl $2, %eax
+; X64-NEXT: shrl $30, %eax
+; X64-NEXT: andl $-2, %eax
; X64-NEXT: retq
%shr = lshr i64 %x, 30
%and = and i64 %shr, 2
diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index e965c621337c54..17fd612b812ebc 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -166,8 +166,8 @@ define i64 @not_bswap() {
; CHECK64-LABEL: not_bswap:
; CHECK64: # %bb.0:
; CHECK64-NEXT: movzwl var16(%rip), %eax
-; CHECK64-NEXT: movq %rax, %rcx
-; CHECK64-NEXT: shrq $8, %rcx
+; CHECK64-NEXT: movl %eax, %ecx
+; CHECK64-NEXT: shrl $8, %ecx
; CHECK64-NEXT: shlq $8, %rax
; CHECK64-NEXT: orq %rcx, %rax
; CHECK64-NEXT: retq
@@ -224,9 +224,12 @@ define i64 @finally_useful_bswap() {
;
; CHECK64-LABEL: finally_useful_bswap:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: movzwl var16(%rip), %eax
-; CHECK64-NEXT: bswapq %rax
-; CHECK64-NEXT: shrq $48, %rax
+; CHECK64-NEXT: movzwl var16(%rip), %ecx
+; CHECK64-NEXT: movzbl %cl, %eax
+; CHECK64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; CHECK64-NEXT: shrl $8, %ecx
+; CHECK64-NEXT: shlq $8, %rax
+; CHECK64-NEXT: orq %rcx, %rax
; CHECK64-NEXT: retq
%init = load i16, ptr @var16
%big = zext i16 %init to i64
diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll
index c2b9cbb0467133..32579bd05605e7 100644
--- a/llvm/test/CodeGen/X86/combine-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll
@@ -369,20 +369,19 @@ define i64 @test_bitreverse_shli_bitreverse_i64(i64 %a) nounwind {
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $235867919, %ecx # imm = 0xE0F0F0F
; X64-NEXT: shlq $4, %rcx
-; X64-NEXT: shrq $4, %rax
+; X64-NEXT: shrl $4, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT: orq %rcx, %rax
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: andl $590558003, %ecx # imm = 0x23333333
-; X64-NEXT: shrq $2, %rax
+; X64-NEXT: shrl $2, %eax
; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
; X64-NEXT: leaq (%rax,%rcx,4), %rax
-; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
-; X64-NEXT: movq %rax, %rdx
-; X64-NEXT: andq %rcx, %rdx
-; X64-NEXT: shrq %rax
-; X64-NEXT: andq %rcx, %rax
-; X64-NEXT: leaq (%rax,%rdx,2), %rax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: leaq (%rax,%rcx,2), %rax
; X64-NEXT: retq
%1 = call i64 @llvm.bitreverse.i64(i64 %a)
%2 = shl i64 %1, 33
diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
index 1fd1387791846c..5c23c155ed85ff 100644
--- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
@@ -1585,7 +1585,7 @@ define i64 @test_i64_2147483647_mask_lshr_1(i64 %a0) {
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: andl $2147483646, %eax # imm = 0x7FFFFFFE
-; X64-NEXT: shrq %rax
+; X64-NEXT: shrl %eax
; X64-NEXT: retq
%t0 = and i64 %a0, 2147483647
%t1 = lshr i64 %t0, 1
@@ -1759,7 +1759,7 @@ define i64 @test_i64_2147483647_mask_ashr_1(i64 %a0) {
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: andl $2147483646, %eax # imm = 0x7FFFFFFE
-; X64-NEXT: shrq %rax
+; X64-NEXT: shrl %eax
; X64-NEXT: retq
%t0 = and i64 %a0, 2147483647
%t1 = ashr i64 %t0, 1
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90b9a38c001ad4..15d13e62b0fadb 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -8130,22 +8130,22 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
;
; X64-NOBMI-LABEL: pr38938:
; X64-NOBMI: # %bb.0:
-; X64-NOBMI-NEXT: movq (%rsi), %rax
-; X64-NOBMI-NEXT: shrq $19, %rax
-; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC
-; X64-NOBMI-NEXT: incl (%rdi,%rax)
+; X64-NOBMI-NEXT: movl (%rsi), %eax
+; X64-NOBMI-NEXT: shrl $21, %eax
+; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF
+; X64-NOBMI-NEXT: incl (%rdi,%rax,4)
; X64-NOBMI-NEXT: retq
;
; X64-BMINOTBM-LABEL: pr38938:
; X64-BMINOTBM: # %bb.0:
; X64-BMINOTBM-NEXT: movl $2581, %eax # imm = 0xA15
-; X64-BMINOTBM-NEXT: bextrq %rax, (%rsi), %rax
+; X64-BMINOTBM-NEXT: bextrl %eax, (%rsi), %eax
; X64-BMINOTBM-NEXT: incl (%rdi,%rax,4)
; X64-BMINOTBM-NEXT: retq
;
; X64-BMITBM-LABEL: pr38938:
; X64-BMITBM: # %bb.0:
-; X64-BMITBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15
+; X64-BMITBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15
; X64-BMITBM-NEXT: incl (%rdi,%rax,4)
; X64-BMITBM-NEXT: retq
%tmp = load i64, ptr %a1, align 8
diff --git a/llvm/test/CodeGen/X86/h-registers-0.ll b/llvm/test/CodeGen/X86/h-registers-0.ll
index 37173095366955..76b0a34643d522 100644
--- a/llvm/test/CodeGen/X86/h-registers-0.ll
+++ b/llvm/test/CodeGen/X86/h-registers-0.ll
@@ -10,21 +10,21 @@
define void @bar64(i64 inreg %x, ptr inreg %p) nounwind {
; X64-LABEL: bar64:
; X64: # %bb.0:
-; X64-NEXT: shrq $8, %rdi
+; X64-NEXT: shrl $8, %edi
; X64-NEXT: incb %dil
; X64-NEXT: movb %dil, (%rsi)
; X64-NEXT: retq
;
; X32-LABEL: bar64:
; X32: # %bb.0:
-; X32-NEXT: shrq $8, %rdi
+; X32-NEXT: shrl $8, %edi
; X32-NEXT: incb %dil
; X32-NEXT: movb %dil, (%esi)
; X32-NEXT: retq
;
; WIN64-LABEL: bar64:
; WIN64: # %bb.0:
-; WIN64-NEXT: shrq $8, %rcx
+; WIN64-NEXT: shrl $8, %ecx
; WIN64-NEXT: incb %cl
; WIN64-NEXT: movb %cl, (%rdx)
; WIN64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index b2c1ffd799e312..c1cce6f5d8ca10 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -68,7 +68,7 @@ define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) nounwind {
; X64-BSR-LABEL: lshr_ctlz_undef_cmpeq_one_i64:
; X64-BSR: # %bb.0:
; X64-BSR-NEXT: bsrq %rdi, %rax
-; X64-BSR-NEXT: shrq $6, %rax
+; X64-BSR-NEXT: shrl $6, %eax
; X64-BSR-NEXT: cmpl $1, %eax
; X64-BSR-NEXT: sete %al
; X64-BSR-NEXT: retq
@@ -76,7 +76,7 @@ define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) nounwind {
; X64-LZCNT-LABEL: lshr_ctlz_undef_cmpeq_one_i64:
; X64-LZCNT: # %bb.0:
; X64-LZCNT-NEXT: lzcntq %rdi, %rax
-; X64-LZCNT-NEXT: shrq $6, %rax
+; X64-LZCNT-NEXT: shrl $6, %eax
; X64-LZCNT-NEXT: cmpl $1, %eax
; X64-LZCNT-NEXT: sete %al
; X64-LZCNT-NEXT: retq
@@ -149,7 +149,7 @@ define i1 @lshr_ctlz_undef_cmpne_zero_i64(i64 %in) nounwind {
; X64-BSR-LABEL: lshr_ctlz_undef_cmpne_zero_i64:
; X64-BSR: # %bb.0:
; X64-BSR-NEXT: bsrq %rdi, %rax
-; X64-BSR-NEXT: testq $-64, %rax
+; X64-BSR-NEXT: testl $-64, %eax
; X64-BSR-NEXT: setne %al
; X64-BSR-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
index da402d81db9fe9..3f64a383abd2c9 100644
--- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
+++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
@@ -89,9 +89,9 @@ define i64 @test4(ptr %data) {
;
; X64-LABEL: test4:
; X64: # %bb.0: # %entry
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: shrq $2, %rax
-; X64-NEXT: andl $60, %eax
+; X64-NEXT: movzbl (%rdi), %eax
+; X64-NEXT: shrl $2, %eax
+; X64-NEXT: andl $-4, %eax
; X64-NEXT: retq
entry:
%bf.load = load i8, ptr %data, align 4
@@ -114,7 +114,7 @@ define i64 @test5(ptr %data) {
; X64-LABEL: test5:
; X64: # %bb.0: # %entry
; X64-NEXT: movzbl (%rdi), %eax
-; X64-NEXT: shrq $2, %rax
+; X64-NEXT: shrl $2, %eax
; X64-NEXT: xorq $60, %rax
; X64-NEXT: retq
entry:
@@ -138,7 +138,7 @@ define i64 @test6(ptr %data) {
; X64-LABEL: test6:
; X64: # %bb.0: # %entry
; X64-NEXT: movzbl (%rdi), %eax
-; X64-NEXT: shrq $2, %rax
+; X64-NEXT: shrl $2, %eax
; X64-NEXT: orq $60, %rax
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/zext-lshr.ll b/llvm/test/CodeGen/X86/zext-lshr.ll
index fbfd7b7e1bfe0c..76f874663ed531 100644
--- a/llvm/test/CodeGen/X86/zext-lshr.ll
+++ b/llvm/test/CodeGen/X86/zext-lshr.ll
@@ -42,7 +42,7 @@ define i64 @i64_zext_shift_i16_zext_i8(i8 %a0) nounwind {
; X64-LABEL: i64_zext_shift_i16_zext_i8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shrq $5, %rax
+; X64-NEXT: shrl $5, %eax
; X64-NEXT: retq
%t0 = zext i8 %a0 to i16
%t1 = lshr i16 %t0, 5
@@ -103,7 +103,7 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
; X64-LABEL: i128_zext_shift_i64_zext_i8:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shrq $4, %rax
+; X64-NEXT: shrl $4, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%t0 = zext i8 %a0 to i64
@@ -127,7 +127,7 @@ define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind {
; X64-LABEL: i128_zext_shift_i64_zext_i16:
; X64: # %bb.0:
; X64-NEXT: movzwl %di, %eax
-; X64-NEXT: shrq $7, %rax
+; X64-NEXT: shrl $7, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
%t0 = zext i16 %a0 to i64