[llvm] 5641804 - [DAG] MatchRotate - Add funnel shift by variable support
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 15 04:51:22 PDT 2020
Author: Simon Pilgrim
Date: 2020-03-15T11:50:45Z
New Revision: 564180429818dd48f2fab970fdb42d172ebd2a5f
URL: https://github.com/llvm/llvm-project/commit/564180429818dd48f2fab970fdb42d172ebd2a5f
DIFF: https://github.com/llvm/llvm-project/commit/564180429818dd48f2fab970fdb42d172ebd2a5f.diff
LOG: [DAG] MatchRotate - Add funnel shift by variable support
Follow-up to D75114: this patch reuses the existing MatchRotate ROTL/ROTR rotation pattern code to also recognize the more general FSHL/FSHR funnel shift patterns when the shift amounts are variable. These patterns are matched with MatchFunnelPosNeg, which acts in an (almost) equivalent manner to MatchRotatePosNeg.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AMDGPU/fshl.ll
llvm/test/CodeGen/AMDGPU/fshr.ll
llvm/test/CodeGen/X86/shift-double-x86_64.ll
llvm/test/CodeGen/X86/shift-double.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 36fc540fe316..7e2aaaa2ed49 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -555,6 +555,10 @@ namespace {
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
+ SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
+ SDValue InnerPos, SDValue InnerNeg,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL);
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
@@ -6319,6 +6323,33 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
return SDValue();
}
+// A subroutine of MatchRotate used once we have found an OR of two opposite
+// shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
+// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
+// former being preferred if supported. InnerPos and InnerNeg are Pos and
+// Neg with outer conversions stripped away.
+// TODO: Merge with MatchRotatePosNeg.
+SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
+ SDValue Neg, SDValue InnerPos,
+ SDValue InnerNeg, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
+ // fold (or (shl x0, (*ext y)),
+ // (srl x1, (*ext (sub 32, y)))) ->
+ // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
+ //
+ // fold (or (shl x0, (*ext (sub 32, y))),
+ // (srl x1, (*ext y))) ->
+ // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
+ EVT VT = N0.getValueType();
+ if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
+ bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
+ return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
+ HasPos ? Pos : Neg);
+ }
+
+ return SDValue();
+}
+
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
@@ -6444,10 +6475,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
return Res;
}
- // TODO: Handle variable funnel shifts.
- if (!IsRotate)
- return SDValue();
-
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
@@ -6468,13 +6495,29 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
RExtOp0 = RHSShiftAmt.getOperand(0);
}
- SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
- LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
+ if (IsRotate && (HasROTL || HasROTR)) {
+ SDValue TryL =
+ MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
+ RExtOp0, ISD::ROTL, ISD::ROTR, DL);
+ if (TryL)
+ return TryL;
+
+ SDValue TryR =
+ MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
+ LExtOp0, ISD::ROTR, ISD::ROTL, DL);
+ if (TryR)
+ return TryR;
+ }
+
+ SDValue TryL =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
if (TryL)
return TryL;
- SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
- RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
+ SDValue TryR =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
if (TryR)
return TryR;
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 0e17df416cd2..f91472967a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -17,11 +17,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 31
-; SI-NEXT: s_sub_i32 s8, 32, s2
-; SI-NEXT: s_lshl_b32 s3, s0, s2
-; SI-NEXT: s_lshr_b32 s1, s1, s8
-; SI-NEXT: s_or_b32 s1, s3, s1
; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: s_sub_i32 s1, 32, s2
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -34,14 +33,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 31
-; VI-NEXT: s_sub_i32 s3, 32, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_lshl_b32 s0, s0, s2
-; VI-NEXT: s_lshr_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: s_sub_i32 s1, 32, s2
+; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -53,14 +51,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 31
-; GFX9-NEXT: s_sub_i32 s3, 32, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_lshl_b32 s0, s0, s2
-; GFX9-NEXT: s_lshr_b32 s1, s1, s3
-; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_sub_i32 s1, 32, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
@@ -68,7 +65,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
;
; R600-LABEL: fshl_i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
@@ -77,9 +74,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T1.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHR T1.W, KC0[2].W, PV.W,
-; R600-NEXT: LSHL * T2.W, KC0[2].Z, T0.W,
-; R600-NEXT: OR_INT * T1.W, PS, PV.W,
+; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[2].Z, KC0[2].W, PV.W,
; R600-NEXT: CNDE_INT T0.X, T0.W, KC0[2].Z, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -153,22 +148,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: s_sub_i32 s11, 32, s1
+; SI-NEXT: s_sub_i32 s10, 32, s1
+; SI-NEXT: v_mov_b32_e32 v1, s10
; SI-NEXT: s_and_b32 s0, s0, 31
-; SI-NEXT: s_lshl_b32 s10, s3, s1
-; SI-NEXT: s_lshr_b32 s9, s9, s11
-; SI-NEXT: s_sub_i32 s3, 32, s0
-; SI-NEXT: s_or_b32 s9, s10, s9
+; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: s_lshl_b32 s1, s2, s0
-; SI-NEXT: s_lshr_b32 s3, s8, s3
-; SI-NEXT: v_mov_b32_e32 v0, s9
-; SI-NEXT: s_or_b32 s1, s1, s3
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_sub_i32 s1, 32, s0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v2, s1
+; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
+; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -181,22 +174,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: s_sub_i32 s9, 32, s1
+; VI-NEXT: s_sub_i32 s7, 32, s1
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_and_b32 s0, s0, 31
-; VI-NEXT: s_lshl_b32 s8, s5, s1
-; VI-NEXT: s_lshr_b32 s7, s7, s9
-; VI-NEXT: s_sub_i32 s5, 32, s0
-; VI-NEXT: s_or_b32 s7, s8, s7
+; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; VI-NEXT: s_lshl_b32 s1, s4, s0
-; VI-NEXT: s_lshr_b32 s5, s6, s5
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_or_b32 s1, s1, s5
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_sub_i32 s1, 32, s0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -211,22 +202,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: s_sub_i32 s9, 32, s1
+; GFX9-NEXT: s_sub_i32 s7, 32, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_and_b32 s0, s0, 31
-; GFX9-NEXT: s_lshl_b32 s8, s5, s1
-; GFX9-NEXT: s_lshr_b32 s7, s7, s9
-; GFX9-NEXT: s_sub_i32 s5, 32, s0
-; GFX9-NEXT: s_or_b32 s7, s8, s7
+; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; GFX9-NEXT: s_lshl_b32 s1, s4, s0
-; GFX9-NEXT: s_lshr_b32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: s_or_b32 s1, s1, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_sub_i32 s1, 32, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -236,29 +225,25 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshl_v2i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
-; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
+; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHR T0.Z, KC0[3].Z, PV.W,
-; R600-NEXT: LSHL T2.W, KC0[3].X, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHR T0.Y, KC0[3].Y, PS,
-; R600-NEXT: LSHL T1.Z, KC0[2].W, T1.W,
-; R600-NEXT: OR_INT T2.W, PV.W, PV.Z,
+; R600-NEXT: SUB_INT T1.W, literal.x, PV.W,
+; R600-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
+; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
+; R600-NEXT: SUB_INT T0.Z, literal.x, PS,
+; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
-; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].X,
-; R600-NEXT: OR_INT T0.W, PV.Z, PV.Y,
-; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
-; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[2].W,
-; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].X,
+; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].W, KC0[3].Y, PV.Z,
+; R600-NEXT: SETE_INT * T1.W, T2.W, 0.0,
+; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[2].W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -341,40 +326,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s11
-; SI-NEXT: v_mov_b32_e32 v4, s8
+; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: s_and_b32 s3, s3, 31
-; SI-NEXT: s_sub_i32 s17, 32, s3
+; SI-NEXT: s_sub_i32 s16, 32, s3
+; SI-NEXT: v_mov_b32_e32 v1, s16
; SI-NEXT: s_and_b32 s2, s2, 31
-; SI-NEXT: s_lshl_b32 s16, s11, s3
-; SI-NEXT: s_lshr_b32 s15, s15, s17
-; SI-NEXT: s_sub_i32 s11, 32, s2
-; SI-NEXT: s_or_b32 s15, s16, s15
+; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; SI-NEXT: s_lshl_b32 s3, s10, s2
-; SI-NEXT: s_lshr_b32 s11, s14, s11
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: s_or_b32 s3, s3, s11
-; SI-NEXT: s_and_b32 s1, s1, 31
+; SI-NEXT: v_mov_b32_e32 v1, s11
+; SI-NEXT: s_sub_i32 s3, 32, s2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: s_sub_i32 s3, 32, s1
+; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: s_and_b32 s1, s1, 31
+; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; SI-NEXT: s_lshl_b32 s2, s9, s1
-; SI-NEXT: s_lshr_b32 s3, s13, s3
; SI-NEXT: v_mov_b32_e32 v1, s10
-; SI-NEXT: s_or_b32 s2, s2, s3
-; SI-NEXT: s_and_b32 s0, s0, 31
+; SI-NEXT: s_sub_i32 s2, 32, s1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_sub_i32 s2, 32, s0
+; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: s_and_b32 s0, s0, 31
+; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: s_lshl_b32 s1, s8, s0
-; SI-NEXT: s_lshr_b32 s2, s12, s2
; SI-NEXT: v_mov_b32_e32 v1, s9
-; SI-NEXT: s_or_b32 s1, s1, s2
+; SI-NEXT: s_sub_i32 s1, 32, s0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: v_mov_b32_e32 v4, s1
+; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
+; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -387,40 +368,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_and_b32 s3, s3, 31
-; VI-NEXT: s_sub_i32 s15, 32, s3
+; VI-NEXT: s_sub_i32 s11, 32, s3
+; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: s_and_b32 s2, s2, 31
-; VI-NEXT: s_lshl_b32 s14, s7, s3
-; VI-NEXT: s_lshr_b32 s11, s11, s15
-; VI-NEXT: s_sub_i32 s7, 32, s2
-; VI-NEXT: s_or_b32 s11, s14, s11
+; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; VI-NEXT: s_lshl_b32 s3, s6, s2
-; VI-NEXT: s_lshr_b32 s7, s10, s7
-; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_or_b32 s3, s3, s7
-; VI-NEXT: s_and_b32 s1, s1, 31
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_sub_i32 s3, 32, s2
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: s_sub_i32 s3, 32, s1
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_and_b32 s1, s1, 31
+; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; VI-NEXT: s_lshl_b32 s2, s5, s1
-; VI-NEXT: s_lshr_b32 s3, s9, s3
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_and_b32 s0, s0, 31
+; VI-NEXT: s_sub_i32 s2, 32, s1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_sub_i32 s2, 32, s0
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_and_b32 s0, s0, 31
+; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; VI-NEXT: s_lshl_b32 s1, s4, s0
-; VI-NEXT: s_lshr_b32 s2, s8, s2
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_or_b32 s1, s1, s2
+; VI-NEXT: s_sub_i32 s1, 32, s0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
@@ -435,40 +412,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_and_b32 s3, s3, 31
-; GFX9-NEXT: s_sub_i32 s15, 32, s3
+; GFX9-NEXT: s_sub_i32 s11, 32, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: s_and_b32 s2, s2, 31
-; GFX9-NEXT: s_lshl_b32 s14, s7, s3
-; GFX9-NEXT: s_lshr_b32 s11, s11, s15
-; GFX9-NEXT: s_sub_i32 s7, 32, s2
-; GFX9-NEXT: s_or_b32 s11, s14, s11
+; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; GFX9-NEXT: s_lshl_b32 s3, s6, s2
-; GFX9-NEXT: s_lshr_b32 s7, s10, s7
-; GFX9-NEXT: v_mov_b32_e32 v0, s11
-; GFX9-NEXT: s_or_b32 s3, s3, s7
-; GFX9-NEXT: s_and_b32 s1, s1, 31
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s3, 32, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s3, 32, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_and_b32 s1, s1, 31
+; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX9-NEXT: s_lshl_b32 s2, s5, s1
-; GFX9-NEXT: s_lshr_b32 s3, s9, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: s_or_b32 s2, s2, s3
-; GFX9-NEXT: s_and_b32 s0, s0, 31
+; GFX9-NEXT: s_sub_i32 s2, 32, s1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s2, 32, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_and_b32 s0, s0, 31
+; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; GFX9-NEXT: s_lshl_b32 s1, s4, s0
-; GFX9-NEXT: s_lshr_b32 s2, s8, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_sub_i32 s1, 32, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
@@ -478,44 +451,37 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshl_v4i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; R600-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
-; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
+; R600-NEXT: AND_INT T0.W, KC0[6].X, literal.x,
+; R600-NEXT: AND_INT * T1.W, KC0[5].W, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHR T0.Z, KC0[4].Y, PV.W,
-; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
-; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
-; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
-; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
-; R600-NEXT: LSHR * T2.W, KC0[5].X, PV.W,
+; R600-NEXT: AND_INT T0.X, KC0[5].Y, literal.x,
+; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[4].X, KC0[5].X, PV.W,
+; R600-NEXT: SETE_INT T0.Z, T0.W, 0.0,
+; R600-NEXT: SUB_INT * T0.W, literal.y, T1.W,
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: LSHL * T4.W, KC0[4].X, T1.W,
-; R600-NEXT: OR_INT T0.X, PV.W, T2.W,
-; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0,
-; R600-NEXT: LSHR T1.Z, KC0[4].W, T1.Z,
-; R600-NEXT: LSHL T1.W, KC0[3].W, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
+; R600-NEXT: AND_INT * T2.W, KC0[5].Z, literal.x,
+; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; R600-NEXT: SUB_INT T1.Y, literal.x, PV.W,
+; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, T0.W,
+; R600-NEXT: SETE_INT * T0.W, T1.W, 0.0,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHR T1.X, KC0[4].Z, PS,
-; R600-NEXT: LSHL T2.Y, KC0[3].Z, T0.Y,
-; R600-NEXT: OR_INT T1.Z, PV.W, PV.Z,
-; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
-; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[4].X,
-; R600-NEXT: LSHL T1.Y, KC0[3].Y, T0.W,
-; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[3].W,
-; R600-NEXT: OR_INT T1.W, T2.Y, T1.X,
-; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
-; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[3].Z,
-; R600-NEXT: OR_INT T1.W, PV.Y, T0.Z,
-; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[3].Y,
+; R600-NEXT: CNDE_INT * T1.W, T0.Z, T0.Y, KC0[4].X,
+; R600-NEXT: CNDE_INT T1.Z, T0.W, T1.Z, KC0[3].W,
+; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Z, KC0[4].Z, T1.Y,
+; R600-NEXT: SETE_INT * T2.W, T2.W, 0.0,
+; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
+; R600-NEXT: SUB_INT * T0.W, literal.x, T0.X,
+; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Y, KC0[4].Y, PV.W,
+; R600-NEXT: SETE_INT * T2.W, T0.X, 0.0,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index f90863b26d5a..f5d7671b9cf3 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -128,24 +128,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: s_sub_i32 s11, 32, s1
-; SI-NEXT: s_lshr_b32 s10, s9, s1
-; SI-NEXT: s_lshl_b32 s3, s3, s11
-; SI-NEXT: s_or_b32 s3, s3, s10
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_and_b32 s0, s0, 31
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: s_sub_i32 s3, 32, s0
+; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: s_lshr_b32 s1, s8, s0
-; SI-NEXT: s_lshl_b32 s2, s2, s3
-; SI-NEXT: s_or_b32 s1, s2, s1
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: v_alignbit_b32 v2, s2, v0, v2
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -156,24 +150,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: s_sub_i32 s9, 32, s1
-; VI-NEXT: s_lshr_b32 s8, s7, s1
-; VI-NEXT: s_lshl_b32 s5, s5, s9
-; VI-NEXT: s_or_b32 s5, s5, s8
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s0, 31
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: s_sub_i32 s5, 32, s0
+; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; VI-NEXT: s_lshr_b32 s1, s6, s0
-; VI-NEXT: s_lshl_b32 s4, s4, s5
-; VI-NEXT: s_or_b32 s1, s4, s1
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_alignbit_b32 v2, s4, v0, v2
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -186,24 +174,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: s_sub_i32 s9, 32, s1
-; GFX9-NEXT: s_lshr_b32 s8, s7, s1
-; GFX9-NEXT: s_lshl_b32 s5, s5, s9
-; GFX9-NEXT: s_or_b32 s5, s5, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_and_b32 s0, s0, 31
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: s_sub_i32 s5, 32, s0
+; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; GFX9-NEXT: s_lshr_b32 s1, s6, s0
-; GFX9-NEXT: s_lshl_b32 s4, s4, s5
-; GFX9-NEXT: s_or_b32 s1, s4, s1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
@@ -211,29 +193,22 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshr_v2i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
+; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
-; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
+; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHL T0.Z, KC0[3].X, PV.W,
-; R600-NEXT: LSHR T2.W, KC0[3].Z, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHL T0.Y, KC0[2].W, PS,
-; R600-NEXT: LSHR T1.Z, KC0[3].Y, T1.W,
-; R600-NEXT: OR_INT T2.W, PV.Z, PV.W,
-; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
-; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
-; R600-NEXT: OR_INT T0.W, PV.Y, PV.Z,
-; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
-; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
-; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
+; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
+; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
+; R600-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
+; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
+; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
+; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -316,42 +291,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s15
+; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: s_and_b32 s3, s3, 31
-; SI-NEXT: s_sub_i32 s17, 32, s3
-; SI-NEXT: s_lshr_b32 s16, s15, s3
-; SI-NEXT: s_lshl_b32 s11, s11, s17
-; SI-NEXT: s_or_b32 s11, s11, s16
-; SI-NEXT: s_and_b32 s2, s2, 31
-; SI-NEXT: v_mov_b32_e32 v0, s11
-; SI-NEXT: s_sub_i32 s11, 32, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; SI-NEXT: s_lshr_b32 s3, s14, s2
-; SI-NEXT: s_lshl_b32 s10, s10, s11
-; SI-NEXT: s_or_b32 s3, s10, s3
-; SI-NEXT: s_and_b32 s1, s1, 31
-; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: s_sub_i32 s3, 32, s1
+; SI-NEXT: s_and_b32 s2, s2, 31
+; SI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: v_mov_b32_e32 v1, s2
+; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; SI-NEXT: s_lshr_b32 s2, s13, s1
-; SI-NEXT: s_lshl_b32 s3, s9, s3
-; SI-NEXT: v_mov_b32_e32 v1, s14
-; SI-NEXT: s_or_b32 s2, s3, s2
+; SI-NEXT: s_and_b32 s1, s1, 31
+; SI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s13
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_and_b32 s0, s0, 31
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_sub_i32 s2, 32, s0
+; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; SI-NEXT: s_lshr_b32 s1, s12, s0
-; SI-NEXT: s_lshl_b32 s2, s8, s2
-; SI-NEXT: v_mov_b32_e32 v1, s13
-; SI-NEXT: s_or_b32 s1, s2, s1
-; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, s12
+; SI-NEXT: v_mov_b32_e32 v4, s0
+; SI-NEXT: v_alignbit_b32 v4, s8, v0, v4
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -362,42 +325,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_and_b32 s3, s3, 31
-; VI-NEXT: s_sub_i32 s15, 32, s3
-; VI-NEXT: s_lshr_b32 s14, s11, s3
-; VI-NEXT: s_lshl_b32 s7, s7, s15
-; VI-NEXT: s_or_b32 s7, s7, s14
-; VI-NEXT: s_and_b32 s2, s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_sub_i32 s7, 32, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; VI-NEXT: s_lshr_b32 s3, s10, s2
-; VI-NEXT: s_lshl_b32 s6, s6, s7
-; VI-NEXT: s_or_b32 s3, s6, s3
-; VI-NEXT: s_and_b32 s1, s1, 31
-; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: s_sub_i32 s3, 32, s1
+; VI-NEXT: s_and_b32 s2, s2, 31
+; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; VI-NEXT: s_lshr_b32 s2, s9, s1
-; VI-NEXT: s_lshl_b32 s3, s5, s3
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_and_b32 s1, s1, 31
+; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s0, 31
-; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_sub_i32 s2, 32, s0
+; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; VI-NEXT: s_lshr_b32 s1, s8, s0
-; VI-NEXT: s_lshl_b32 s2, s4, s2
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: s_or_b32 s1, s2, s1
-; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -410,42 +361,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_and_b32 s3, s3, 31
-; GFX9-NEXT: s_sub_i32 s15, 32, s3
-; GFX9-NEXT: s_lshr_b32 s14, s11, s3
-; GFX9-NEXT: s_lshl_b32 s7, s7, s15
-; GFX9-NEXT: s_or_b32 s7, s7, s14
-; GFX9-NEXT: s_and_b32 s2, s2, 31
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: s_sub_i32 s7, 32, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
-; GFX9-NEXT: s_lshr_b32 s3, s10, s2
-; GFX9-NEXT: s_lshl_b32 s6, s6, s7
-; GFX9-NEXT: s_or_b32 s3, s6, s3
-; GFX9-NEXT: s_and_b32 s1, s1, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s3, 32, s1
+; GFX9-NEXT: s_and_b32 s2, s2, 31
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s10
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
-; GFX9-NEXT: s_lshr_b32 s2, s9, s1
-; GFX9-NEXT: s_lshl_b32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 31
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_and_b32 s0, s0, 31
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s2, 32, s0
+; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
-; GFX9-NEXT: s_lshr_b32 s1, s8, s0
-; GFX9-NEXT: s_lshl_b32 s2, s4, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_or_b32 s1, s2, s1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
@@ -453,44 +392,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshr_v4i32:
; R600: ; %bb.0: ; %entry
-; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
-; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; R600-NEXT: ALU 20, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
-; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
+; R600-NEXT: AND_INT T0.W, KC0[5].Z, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHL T0.Z, KC0[3].Y, PV.W,
-; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
-; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
-; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
-; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
-; R600-NEXT: LSHL * T2.W, KC0[4].X, PV.W,
-; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
-; R600-NEXT: LSHR * T4.W, KC0[5].X, T1.W,
-; R600-NEXT: OR_INT T0.X, T2.W, PV.W,
-; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0, BS:VEC_120/SCL_212
-; R600-NEXT: LSHL T1.Z, KC0[3].W, T1.Z,
-; R600-NEXT: LSHR T1.W, KC0[4].W, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
-; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; R600-NEXT: LSHL T1.X, KC0[3].Z, PS,
-; R600-NEXT: LSHR T2.Y, KC0[4].Z, T0.Y,
-; R600-NEXT: OR_INT T1.Z, PV.Z, PV.W,
-; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
-; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[5].X,
-; R600-NEXT: LSHR T1.Y, KC0[4].Y, T0.W,
-; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[4].W,
-; R600-NEXT: OR_INT T1.W, T1.X, T2.Y,
-; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
-; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[4].Z,
-; R600-NEXT: OR_INT T1.W, T0.Z, PV.Y,
+; R600-NEXT: SETE_INT T0.Z, PS, 0.0,
+; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
+; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
+; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; R600-NEXT: SETE_INT T1.Z, PV.W, 0.0,
+; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
+; R600-NEXT: CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
+; R600-NEXT: CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
+; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[4].Y,
+; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
+; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x,
+; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
+; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
diff --git a/llvm/test/CodeGen/X86/shift-double-x86_64.ll b/llvm/test/CodeGen/X86/shift-double-x86_64.ll
index a410595e03cf..7a086ba36474 100644
--- a/llvm/test/CodeGen/X86/shift-double-x86_64.ll
+++ b/llvm/test/CodeGen/X86/shift-double-x86_64.ll
@@ -8,7 +8,6 @@ define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: andl $63, %ecx
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shldq %cl, %rsi, %rax
; CHECK-NEXT: retq
@@ -25,7 +24,6 @@ define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: andl $63, %ecx
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shrdq %cl, %rdi, %rax
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shift-double.ll b/llvm/test/CodeGen/X86/shift-double.ll
index 9621c9445937..c0872957f2b8 100644
--- a/llvm/test/CodeGen/X86/shift-double.ll
+++ b/llvm/test/CodeGen/X86/shift-double.ll
@@ -290,11 +290,9 @@ define i64 @test10(i64 %val, i32 %bits) nounwind {
define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X86-LABEL: test11:
; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: retl
;
@@ -302,7 +300,6 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %edx, %ecx
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl $31, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shldl %cl, %esi, %eax
; X64-NEXT: retq
@@ -317,11 +314,9 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X86-LABEL: test12:
; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrdl %cl, %edx, %eax
; X86-NEXT: retl
;
@@ -329,7 +324,6 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %edx, %ecx
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $31, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrdl %cl, %edi, %eax
; X64-NEXT: retq
More information about the llvm-commits mailing list