[llvm] r370617 - [DAGCombiner] improve throughput of shift+logic+shift

Sanjay Patel via llvm-commits <llvm-commits at lists.llvm.org>
Sun Sep 1 11:38:16 PDT 2019


Author: spatel
Date: Sun Sep  1 11:38:15 2019
New Revision: 370617

URL: http://llvm.org/viewvc/llvm-project?rev=370617&view=rev
Log:
[DAGCombiner] improve throughput of shift+logic+shift

The motivating case for this is a long way from here:
https://bugs.llvm.org/show_bug.cgi?id=43146
...but I think this is where we have to start.

We need to canonicalize/optimize sequences of shift and logic ops to ease
pattern matching for things like bswap and to improve performance in general.
But without the artificial limit of '!LegalTypes' (early combining, i.e.,
before type legalization), there are a lot of test diffs, and not all of them
are improvements.

In the minimal tests added for this proposal, x86 should have better
throughput in all cases. AArch64 is neutral for scalar tests because
it can fold shifts into bitwise logic ops.
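
For example, the scalar shl_and test below still needs only two AArch64
instructions either way, because 'and' can fold a shifted operand; the change
just rearranges the dependency chain:

  before:
    and w8, w1, w0, lsl #3
    lsl w0, w8, #2
  after:
    lsl w8, w0, #5
    and w0, w8, w1, lsl #2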

There are 3 shift opcodes and 3 logic opcodes for a total of 9 possible patterns:
https://rise4fun.com/Alive/VlI
https://rise4fun.com/Alive/n1m
https://rise4fun.com/Alive/1Vn
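
Taking the shl/and case from the new tests as a concrete example, the rewrite
is roughly the following (shown as IR for readability; the combine itself runs
on SelectionDAG nodes in combineShiftOfShiftedLogic below, and the value names
here are just illustrative):

  ; before: the outer shl must wait for the 'and'
  %sh0 = shl i8 %x, 3
  %a   = and i8 %sh0, %y
  %r   = shl i8 %a, 2

  ; after: shift (logic (shift X, C0), Y), C1 -->
  ;        logic (shift X, C0+C1), (shift Y, C1)
  ; the two shifts are independent, which is the throughput win
  %sh0 = shl i8 %x, 5
  %sh1 = shl i8 %y, 2
  %r   = and i8 %sh0, %sh1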

Differential Revision: https://reviews.llvm.org/D67021

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll
    llvm/trunk/test/CodeGen/AArch64/shift-logic.ll
    llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
    llvm/trunk/test/CodeGen/X86/shift-logic.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=370617&r1=370616&r2=370617&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Sun Sep  1 11:38:15 2019
@@ -7204,6 +7204,72 @@ SDValue DAGCombiner::visitXOR(SDNode *N)
   return SDValue();
 }
 
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This is a
+/// throughput improvement.
+static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
+  // Match a one-use bitwise logic op.
+  SDValue LogicOp = Shift->getOperand(0);
+  if (!LogicOp.hasOneUse())
+    return SDValue();
+
+  unsigned LogicOpcode = LogicOp.getOpcode();
+  if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
+      LogicOpcode != ISD::XOR)
+    return SDValue();
+
+  // Find a matching one-use shift by constant.
+  unsigned ShiftOpcode = Shift->getOpcode();
+  SDValue C1 = Shift->getOperand(1);
+  ConstantSDNode *C1Node = isConstOrConstSplat(C1);
+  assert(C1Node && "Expected a shift with constant operand");
+  const APInt &C1Val = C1Node->getAPIntValue();
+  auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
+                             const APInt *&ShiftAmtVal) {
+    if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
+      return false;
+
+    ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
+    if (!ShiftCNode)
+      return false;
+
+    // Capture the shifted operand and shift amount value.
+    ShiftOp = V.getOperand(0);
+    ShiftAmtVal = &ShiftCNode->getAPIntValue();
+
+    // Shift amount types do not have to match their operand type, so check that
+    // the constants are the same width.
+    if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
+      return false;
+
+    // The fold is not valid if the sum of the shift values exceeds bitwidth.
+    if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
+      return false;
+
+    return true;
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  SDValue X, Y;
+  const APInt *C0Val;
+  if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
+    Y = LogicOp.getOperand(1);
+  else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
+    Y = LogicOp.getOperand(0);
+  else
+    return SDValue();
+
+  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+  SDLoc DL(Shift);
+  EVT VT = Shift->getValueType(0);
+  EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
+  SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
+  SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
+  SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
+  return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
+}
+
 /// Handle transforms common to the three shifts, when the shift amount is a
 /// constant.
 /// We are looking for: (shift being one of shl/sra/srl)
@@ -7222,6 +7288,14 @@ SDValue DAGCombiner::visitShiftByConstan
   if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
     return SDValue();
 
+  // TODO: This is limited to early combining because it may reveal regressions
+  //       otherwise. But since we just checked a target hook to see if this is
+  //       desirable, that should have filtered out cases where this interferes
+  //       with some other pattern matching.
+  if (!LegalTypes)
+    if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
+      return R;
+
   // We want to pull some binops through shifts, so that we have (and (shift))
   // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
   // thing happens with address calculations, so it's important to canonicalize

Modified: llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll?rev=370617&r1=370616&r2=370617&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/bitfield-insert.ll Sun Sep  1 11:38:15 2019
@@ -265,12 +265,12 @@ define void @test_32bit_opnd1_better(i32
 define i32 @test_nouseful_bits(i8 %a, i32 %b) {
 ; CHECK-LABEL: test_nouseful_bits:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    bfi w8, w8, #8, #24
-; CHECK-NEXT:    mov w9, w0
-; CHECK-NEXT:    bfi w9, w8, #8, #24
-; CHECK-NEXT:    bfi w0, w9, #8, #24
-; CHECK-NEXT:    lsl w0, w0, #8
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    lsl w8, w8, #8
+; CHECK-NEXT:    mov w9, w8
+; CHECK-NEXT:    bfxil w9, w0, #0, #8
+; CHECK-NEXT:    bfi w8, w9, #16, #16
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %conv = zext i8 %a to i32     ;   0  0  0  A
   %shl = shl i32 %b, 8          ;   B2 B1 B0 0

Modified: llvm/trunk/test/CodeGen/AArch64/shift-logic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/shift-logic.ll?rev=370617&r1=370616&r2=370617&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/shift-logic.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/shift-logic.ll Sun Sep  1 11:38:15 2019
@@ -4,8 +4,8 @@
 define i8 @shl_and(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: shl_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w0, lsl #3
-; CHECK-NEXT:    lsl w0, w8, #2
+; CHECK-NEXT:    lsl w8, w0, #5
+; CHECK-NEXT:    and w0, w8, w1, lsl #2
 ; CHECK-NEXT:    ret
   %sh0 = shl i8 %x, 3
   %r = and i8 %sh0, %y
@@ -16,8 +16,8 @@ define i8 @shl_and(i8 %x, i8 %y) nounwin
 define i16 @shl_or(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: shl_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w1, w0, lsl #5
-; CHECK-NEXT:    lsl w0, w8, #7
+; CHECK-NEXT:    lsl w8, w0, #12
+; CHECK-NEXT:    orr w0, w8, w1, lsl #7
 ; CHECK-NEXT:    ret
   %sh0 = shl i16 %x, 5
   %r = or i16 %y, %sh0
@@ -28,8 +28,8 @@ define i16 @shl_or(i16 %x, i16 %y) nounw
 define i32 @shl_xor(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: shl_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w1, w0, lsl #5
-; CHECK-NEXT:    lsl w0, w8, #7
+; CHECK-NEXT:    lsl w8, w0, #12
+; CHECK-NEXT:    eor w0, w8, w1, lsl #7
 ; CHECK-NEXT:    ret
   %sh0 = shl i32 %x, 5
   %r = xor i32 %sh0, %y
@@ -40,8 +40,8 @@ define i32 @shl_xor(i32 %x, i32 %y) noun
 define i64 @lshr_and(i64 %x, i64 %y) nounwind {
 ; CHECK-LABEL: lshr_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x1, x0, lsr #5
-; CHECK-NEXT:    lsr x0, x8, #7
+; CHECK-NEXT:    lsr x8, x0, #12
+; CHECK-NEXT:    and x0, x8, x1, lsr #7
 ; CHECK-NEXT:    ret
   %sh0 = lshr i64 %x, 5
   %r = and i64 %y, %sh0
@@ -52,9 +52,9 @@ define i64 @lshr_and(i64 %x, i64 %y) nou
 define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: lshr_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #5
+; CHECK-NEXT:    ushr v1.4s, v1.4s, #7
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #12
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #7
 ; CHECK-NEXT:    ret
   %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %r = or <4 x i32> %sh0, %y
@@ -65,9 +65,9 @@ define <4 x i32> @lshr_or(<4 x i32> %x,
 define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: lshr_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #5
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #7
+; CHECK-NEXT:    ushr v1.8h, v1.8h, #7
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #12
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %sh0 = lshr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   %r = xor <8 x i16> %y, %sh0
@@ -79,9 +79,9 @@ define <8 x i16> @lshr_xor(<8 x i16> %x,
 define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: ashr_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr v0.16b, v0.16b, #3
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    sshr v0.16b, v0.16b, #2
+; CHECK-NEXT:    sshr v1.16b, v1.16b, #2
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #5
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %sh0 = ashr <16 x i8> %x, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %r = and <16 x i8> %y, %sh0
@@ -92,9 +92,9 @@ define <16 x i8> @ashr_and(<16 x i8> %x,
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: ashr_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #5
+; CHECK-NEXT:    sshr v1.2d, v1.2d, #7
+; CHECK-NEXT:    sshr v0.2d, v0.2d, #12
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    sshr v0.2d, v0.2d, #7
 ; CHECK-NEXT:    ret
   %sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
   %r = or <2 x i64> %sh0, %y
@@ -105,8 +105,8 @@ define <2 x i64> @ashr_or(<2 x i64> %x,
 define i32 @ashr_xor(i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: ashr_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w1, w0, asr #5
-; CHECK-NEXT:    asr w0, w8, #7
+; CHECK-NEXT:    asr w8, w0, #12
+; CHECK-NEXT:    eor w0, w8, w1, asr #7
 ; CHECK-NEXT:    ret
   %sh0 = ashr i32 %x, 5
   %r = xor i32 %y, %sh0

Modified: llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll?rev=370617&r1=370616&r2=370617&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll (original)
+++ llvm/trunk/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll Sun Sep  1 11:38:15 2019
@@ -15,10 +15,10 @@ target datalayout = "e-p:32:32:32-i1:8:3
 
 ; Make sure the cmp is not scheduled before the InlineAsm that clobbers cc.
 ; CHECK: bl _f2
-; CHECK: cmp {{r[0-9]+}}, #0
-; CHECK-NEXT: it       eq
-; CHECK-NEXT: addeq    {{r[0-9]+}}, #1
-; CHECK-NEXT: lsls
+; CHECK: clz {{r[0-9]+}}
+; CHECK-DAG: lsrs    {{r[0-9]+}}
+; CHECK-DAG: lsls    {{r[0-9]+}}
+; CHECK-NEXT: orr.w   {{r[0-9]+}}
 ; CHECK-NEXT: InlineAsm Start
 define void @test(%s1* %this, i32 %format, i32 %w, i32 %h, i32 %levels, i32* %s, i8* %data, i32* nocapture %rowbytes, void (i8*, i8*)* %release, i8* %info) nounwind {
 entry:

Modified: llvm/trunk/test/CodeGen/X86/shift-logic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shift-logic.ll?rev=370617&r1=370616&r2=370617&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shift-logic.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shift-logic.ll Sun Sep  1 11:38:15 2019
@@ -4,10 +4,10 @@
 define i8 @shl_and(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: shl_and:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal (,%rdi,8), %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $2, %sil
+; CHECK-NEXT:    shlb $5, %al
 ; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    shlb $2, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i8 %x, 3
@@ -20,9 +20,9 @@ define i16 @shl_or(i16 %x, i16 %y) nounw
 ; CHECK-LABEL: shl_or:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $5, %eax
+; CHECK-NEXT:    shll $7, %esi
+; CHECK-NEXT:    shll $12, %eax
 ; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    shll $7, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i16 %x, 5
@@ -35,9 +35,9 @@ define i32 @shl_xor(i32 %x, i32 %y) noun
 ; CHECK-LABEL: shl_xor:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $5, %eax
+; CHECK-NEXT:    shll $7, %esi
+; CHECK-NEXT:    shll $12, %eax
 ; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    shll $7, %eax
 ; CHECK-NEXT:    retq
   %sh0 = shl i32 %x, 5
   %r = xor i32 %sh0, %y
@@ -49,9 +49,9 @@ define i64 @lshr_and(i64 %x, i64 %y) nou
 ; CHECK-LABEL: lshr_and:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    shrq $5, %rax
+; CHECK-NEXT:    shrq $7, %rsi
+; CHECK-NEXT:    shrq $12, %rax
 ; CHECK-NEXT:    andq %rsi, %rax
-; CHECK-NEXT:    shrq $7, %rax
 ; CHECK-NEXT:    retq
   %sh0 = lshr i64 %x, 5
   %r = and i64 %y, %sh0
@@ -62,9 +62,9 @@ define i64 @lshr_and(i64 %x, i64 %y) nou
 define <4 x i32> @lshr_or(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: lshr_or:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrld $5, %xmm0
+; CHECK-NEXT:    psrld $7, %xmm1
+; CHECK-NEXT:    psrld $12, %xmm0
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    psrld $7, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %r = or <4 x i32> %sh0, %y
@@ -75,9 +75,9 @@ define <4 x i32> @lshr_or(<4 x i32> %x,
 define <8 x i16> @lshr_xor(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-LABEL: lshr_xor:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrlw $5, %xmm0
+; CHECK-NEXT:    psrlw $7, %xmm1
+; CHECK-NEXT:    psrlw $12, %xmm0
 ; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $7, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = lshr <8 x i16> %x, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   %r = xor <8 x i16> %y, %sh0
@@ -89,17 +89,17 @@ define <8 x i16> @lshr_xor(<8 x i16> %x,
 define <16 x i8> @ashr_and(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-LABEL: ashr_and:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    psrlw $3, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; CHECK-NEXT:    pxor %xmm2, %xmm1
+; CHECK-NEXT:    psubb %xmm2, %xmm1
+; CHECK-NEXT:    psrlw $5, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
 ; CHECK-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-NEXT:    psubb %xmm2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    psrlw $2, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    psubb %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %sh0 = ashr <16 x i8> %x, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
   %r = and <16 x i8> %y, %sh0
@@ -110,19 +110,19 @@ define <16 x i8> @ashr_and(<16 x i8> %x,
 define <2 x i64> @ashr_or(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-LABEL: ashr_or:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    psrad $7, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-NEXT:    psrlq $7, %xmm1
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrad $5, %xmm2
+; CHECK-NEXT:    psrad $12, %xmm2
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-NEXT:    psrlq $5, %xmm0
+; CHECK-NEXT:    psrlq $12, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrad $7, %xmm1
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; CHECK-NEXT:    psrlq $7, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %sh0 = ashr <2 x i64> %x, <i64 5, i64 5>
   %r = or <2 x i64> %sh0, %y
@@ -134,9 +134,9 @@ define i32 @ashr_xor(i32 %x, i32 %y) nou
 ; CHECK-LABEL: ashr_xor:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    sarl $5, %eax
+; CHECK-NEXT:    sarl $7, %esi
+; CHECK-NEXT:    sarl $12, %eax
 ; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    sarl $7, %eax
 ; CHECK-NEXT:    retq
   %sh0 = ashr i32 %x, 5
   %r = xor i32 %y, %sh0



