[llvm] 100763a - [DAG] Extend SearchForAndLoads with any_extend handling

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 18 13:03:14 PST 2022


Author: David Green
Date: 2022-01-18T21:03:08Z
New Revision: 100763a88fe97b22cd5e3f69d203669aac3ed48f

URL: https://github.com/llvm/llvm-project/commit/100763a88fe97b22cd5e3f69d203669aac3ed48f
DIFF: https://github.com/llvm/llvm-project/commit/100763a88fe97b22cd5e3f69d203669aac3ed48f.diff

LOG: [DAG] Extend SearchForAndLoads with any_extend handling

This extends the code in SearchForAndLoads to be able to look through
ANY_EXTEND nodes, which can be created from mismatching IR types where
the AND node we begin from only demands the low parts of the register.
That turns zext and sext into any_extends as only the low bits are
demanded. To be able to look through ANY_EXTEND nodes we need to handle
mismatching types in a few places, potentially truncating the mask to
the size of the final load.

Recommitted with a more conservative check for the type of the extend.

Differential Revision: https://reviews.llvm.org/D117457

Added: 
    llvm/test/CodeGen/X86/combine-andintoload.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/combine-andintoload.ll
    llvm/test/CodeGen/X86/pr35763.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7c684bd3aeb97..d1f75b40e79db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5491,6 +5491,8 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
 
     // Some constants may need fixing up later if they are too large.
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (Mask->getValueType(0) != C->getValueType(0))
+        return false;
       if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
           (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
         NodesWithConsts.insert(N);
@@ -5524,9 +5526,9 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
     case ISD::AssertZext: {
       unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
       EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
-      EVT VT = Op.getOpcode() == ISD::AssertZext ?
-        cast<VTSDNode>(Op.getOperand(1))->getVT() :
-        Op.getOperand(0).getValueType();
+      EVT VT = Op.getOpcode() == ISD::AssertZext
+                   ? cast<VTSDNode>(Op.getOperand(1))->getVT()
+                   : Op.getOperand(0).getValueType();
 
       // We can accept extending nodes if the mask is wider or an equal
       // width to the original type.
@@ -5534,6 +5536,15 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
         continue;
       break;
     }
+    case ISD::ANY_EXTEND: {
+      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+      EVT VT = Op.getOperand(0).getValueType();
+      if (ExtVT.bitsGE(VT))
+        break;
+      // Fallthrough to searching for nodes from the operands of the extend.
+      LLVM_FALLTHROUGH;
+    }
     case ISD::OR:
     case ISD::XOR:
     case ISD::AND:
@@ -5593,12 +5604,14 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
     // masking.
     if (FixupNode) {
       LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
-      SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
-                                FixupNode->getValueType(0),
-                                SDValue(FixupNode, 0), MaskOp);
+      SDValue MaskOpT = DAG.getZExtOrTrunc(MaskOp, SDLoc(FixupNode),
+                                           FixupNode->getValueType(0));
+      SDValue And =
+          DAG.getNode(ISD::AND, SDLoc(FixupNode), FixupNode->getValueType(0),
+                      SDValue(FixupNode, 0), MaskOpT);
       DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
       if (And.getOpcode() == ISD ::AND)
-        DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
+        DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOpT);
     }
 
     // Narrow any constants that need it.
@@ -5607,10 +5620,12 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
       SDValue Op1 = LogicN->getOperand(1);
 
       if (isa<ConstantSDNode>(Op0))
-          std::swap(Op0, Op1);
+        std::swap(Op0, Op1);
 
-      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
-                                Op1, MaskOp);
+      SDValue MaskOpT =
+          DAG.getZExtOrTrunc(MaskOp, SDLoc(Op1), Op1.getValueType());
+      SDValue And =
+          DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), Op1, MaskOpT);
 
       DAG.UpdateNodeOperands(LogicN, Op0, And);
     }
@@ -5618,12 +5633,14 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
     // Create narrow loads.
     for (auto *Load : Loads) {
       LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
+      SDValue MaskOpT =
+          DAG.getZExtOrTrunc(MaskOp, SDLoc(Load), Load->getValueType(0));
       SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
-                                SDValue(Load, 0), MaskOp);
+                                SDValue(Load, 0), MaskOpT);
       DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
       if (And.getOpcode() == ISD ::AND)
         And = SDValue(
-            DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
+            DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOpT), 0);
       SDValue NewLoad = reduceLoadWidth(And.getNode());
       assert(NewLoad &&
              "Shouldn't be masking the load if it can't be narrowed");

diff --git a/llvm/test/CodeGen/AArch64/combine-andintoload.ll b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
index 2fcd546d2ade9..d7a69fdd11936 100644
--- a/llvm/test/CodeGen/AArch64/combine-andintoload.ll
+++ b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
@@ -5,16 +5,14 @@
 define i64 @load32_and16_and(i32* %p, i64 %y) {
 ; CHECK-LABEL: load32_and16_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_and:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = zext i32 %x to i64
@@ -26,16 +24,14 @@ define i64 @load32_and16_and(i32* %p, i64 %y) {
 define i64 @load32_and16_andr(i32* %p, i64 %y) {
 ; CHECK-LABEL: load32_and16_andr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_andr:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = zext i32 %x to i64
@@ -47,16 +43,14 @@ define i64 @load32_and16_andr(i32* %p, i64 %y) {
 define i64 @load32_and16_and_sext(i32* %p, i64 %y) {
 ; CHECK-LABEL: load32_and16_and_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_and_sext:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = sext i32 %x to i64
@@ -68,16 +62,16 @@ define i64 @load32_and16_and_sext(i32* %p, i64 %y) {
 define i64 @load32_and16_or(i32* %p, i64 %y) {
 ; CHECK-LABEL: load32_and16_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    orr w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w9, w1, #0xffff
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_or:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    orr w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w9, w1, #0xffff
+; CHECKBE-NEXT:    orr w0, w9, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = zext i32 %x to i64
@@ -170,16 +164,14 @@ define i64 @load16_and16(i16* %p, i64 %y) {
 define i64 @load16_and8(i16* %p, i64 %y) {
 ; CHECK-LABEL: load16_and8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xff
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load16_and8:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldrh w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xff
+; CHECKBE-NEXT:    ldrb w8, [x0, #1]
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i16, i16* %p, align 4
   %xz = zext i16 %x to i64
@@ -232,15 +224,13 @@ define i64 @load8_and16_zext(i8* %p, i8 %y) {
 ; CHECK-LABEL: load8_and16_zext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xff
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load8_and16_zext:
 ; CHECKBE:       // %bb.0:
 ; CHECKBE-NEXT:    ldrb w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xff
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i8, i8* %p, align 4
   %xz = zext i8 %x to i64
@@ -296,16 +286,14 @@ define i64 @load8_and16_or(i8* %p, i64 %y) {
 define i64 @load16_and8_manyext(i16* %p, i32 %y) {
 ; CHECK-LABEL: load16_and8_manyext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    and w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xff
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    and w0, w1, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load16_and8_manyext:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldrh w8, [x0]
-; CHECKBE-NEXT:    and w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xff
+; CHECKBE-NEXT:    ldrb w8, [x0, #1]
+; CHECKBE-NEXT:    and w0, w1, w8
 ; CHECKBE-NEXT:    ret
   %x = load i16, i16* %p, align 4
   %xz = zext i16 %x to i32
@@ -318,18 +306,16 @@ define i64 @load16_and8_manyext(i16* %p, i32 %y) {
 define i64 @multiple_load(i16* %p, i32* %q) {
 ; CHECK-LABEL: multiple_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    and w8, w9, w8
-; CHECK-NEXT:    and x0, x8, #0xff
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: multiple_load:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldrh w8, [x0]
-; CHECKBE-NEXT:    ldr w9, [x1]
-; CHECKBE-NEXT:    and w8, w9, w8
-; CHECKBE-NEXT:    and x0, x8, #0xff
+; CHECKBE-NEXT:    ldrb w8, [x0, #1]
+; CHECKBE-NEXT:    ldrb w9, [x1, #3]
+; CHECKBE-NEXT:    and w0, w9, w8
 ; CHECKBE-NEXT:    ret
   %x = load i16, i16* %p, align 4
   %xz = zext i16 %x to i64
@@ -343,18 +329,16 @@ define i64 @multiple_load(i16* %p, i32* %q) {
 define i64 @multiple_load_or(i16* %p, i32* %q) {
 ; CHECK-LABEL: multiple_load_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    and x0, x8, #0xff
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: multiple_load_or:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldrh w8, [x0]
-; CHECKBE-NEXT:    ldr w9, [x1]
-; CHECKBE-NEXT:    orr w8, w9, w8
-; CHECKBE-NEXT:    and x0, x8, #0xff
+; CHECKBE-NEXT:    ldrb w8, [x0, #1]
+; CHECKBE-NEXT:    ldrb w9, [x1, #3]
+; CHECKBE-NEXT:    orr w0, w9, w8
 ; CHECKBE-NEXT:    ret
   %x = load i16, i16* %p, align 4
   %xz = zext i16 %x to i64
@@ -368,16 +352,16 @@ define i64 @multiple_load_or(i16* %p, i32* %q) {
 define i64 @load32_and16_zexty(i32* %p, i32 %y) {
 ; CHECK-LABEL: load32_and16_zexty:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    orr w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w9, w1, #0xffff
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_zexty:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    orr w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w9, w1, #0xffff
+; CHECKBE-NEXT:    orr w0, w9, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = zext i32 %x to i64
@@ -390,16 +374,16 @@ define i64 @load32_and16_zexty(i32* %p, i32 %y) {
 define i64 @load32_and16_sexty(i32* %p, i32 %y) {
 ; CHECK-LABEL: load32_and16_sexty:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    orr w8, w1, w8
-; CHECK-NEXT:    and x0, x8, #0xffff
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    and w9, w1, #0xffff
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECKBE-LABEL: load32_and16_sexty:
 ; CHECKBE:       // %bb.0:
-; CHECKBE-NEXT:    ldr w8, [x0]
-; CHECKBE-NEXT:    orr w8, w1, w8
-; CHECKBE-NEXT:    and x0, x8, #0xffff
+; CHECKBE-NEXT:    ldrh w8, [x0, #2]
+; CHECKBE-NEXT:    and w9, w1, #0xffff
+; CHECKBE-NEXT:    orr w0, w9, w8
 ; CHECKBE-NEXT:    ret
   %x = load i32, i32* %p, align 4
   %xz = zext i32 %x to i64
@@ -408,3 +392,49 @@ define i64 @load32_and16_sexty(i32* %p, i32 %y) {
   %r = and i64 %a, 65535
   ret i64 %r
 }
+
+define zeroext i1 @bigger(i8* nocapture readonly %c, i8* nocapture readonly %e, i64 %d, i64 %p1) {
+; CHECK-LABEL: bigger:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldrb w8, [x0, x2]
+; CHECK-NEXT:    and w10, w3, #0x7
+; CHECK-NEXT:    ldrb w9, [x1, x2]
+; CHECK-NEXT:    mov w11, #8
+; CHECK-NEXT:    sub w10, w11, w10
+; CHECK-NEXT:    eor w8, w9, w8
+; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    lsr w8, w8, w10
+; CHECK-NEXT:    tst w8, w9
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+;
+; CHECKBE-LABEL: bigger:
+; CHECKBE:       // %bb.0: // %entry
+; CHECKBE-NEXT:    ldrb w8, [x0, x2]
+; CHECKBE-NEXT:    and w10, w3, #0x7
+; CHECKBE-NEXT:    ldrb w9, [x1, x2]
+; CHECKBE-NEXT:    mov w11, #8
+; CHECKBE-NEXT:    sub w10, w11, w10
+; CHECKBE-NEXT:    eor w8, w9, w8
+; CHECKBE-NEXT:    mov w9, #5
+; CHECKBE-NEXT:    lsr w8, w8, w10
+; CHECKBE-NEXT:    tst w8, w9
+; CHECKBE-NEXT:    cset w0, eq
+; CHECKBE-NEXT:    ret
+entry:
+  %0 = trunc i64 %p1 to i16
+  %1 = and i16 %0, 7
+  %sh_prom = sub nuw nsw i16 8, %1
+  %shl = shl nuw nsw i16 5, %sh_prom
+  %arrayidx = getelementptr inbounds i8, i8* %c, i64 %d
+  %2 = load i8, i8* %arrayidx, align 1
+  %3 = and i16 %shl, 255
+  %conv2 = zext i16 %3 to i32
+  %arrayidx3 = getelementptr inbounds i8, i8* %e, i64 %d
+  %4 = load i8, i8* %arrayidx3, align 1
+  %5 = xor i8 %4, %2
+  %6 = zext i8 %5 to i32
+  %7 = and i32 %6, %conv2
+  %cmp.not = icmp eq i32 %7, 0
+  ret i1 %cmp.not
+}

diff --git a/llvm/test/CodeGen/X86/combine-andintoload.ll b/llvm/test/CodeGen/X86/combine-andintoload.ll
new file mode 100644
index 0000000000000..d33776b131f80
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-andintoload.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-none-eabi -o - | FileCheck %s
+
+define zeroext i1 @bigger(i8* nocapture readonly %c, i8* nocapture readonly %e, i64 %d, i64 %p1) {
+; CHECK-LABEL: bigger:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andb $7, %cl
+; CHECK-NEXT:    movb $8, %al
+; CHECK-NEXT:    subb %cl, %al
+; CHECK-NEXT:    movl $5, %r8d
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    shll %cl, %r8d
+; CHECK-NEXT:    movb (%rsi,%rdx), %al
+; CHECK-NEXT:    xorb (%rdi,%rdx), %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    andl %r8d, %eax
+; CHECK-NEXT:    testb $-1, %al
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+entry:
+  %0 = trunc i64 %p1 to i16
+  %1 = and i16 %0, 7
+  %sh_prom = sub nuw nsw i16 8, %1
+  %shl = shl nuw nsw i16 5, %sh_prom
+  %arrayidx = getelementptr inbounds i8, i8* %c, i64 %d
+  %2 = load i8, i8* %arrayidx, align 1
+  %3 = and i16 %shl, 255
+  %conv2 = zext i16 %3 to i32
+  %arrayidx3 = getelementptr inbounds i8, i8* %e, i64 %d
+  %4 = load i8, i8* %arrayidx3, align 1
+  %5 = xor i8 %4, %2
+  %6 = zext i8 %5 to i32
+  %7 = and i32 %6, %conv2
+  %cmp.not = icmp eq i32 %7, 0
+  ret i1 %cmp.not
+}

diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll
index 8b3e91dc577ae..53a0a0284d11d 100644
--- a/llvm/test/CodeGen/X86/pr35763.ll
+++ b/llvm/test/CodeGen/X86/pr35763.ll
@@ -10,10 +10,10 @@
 define dso_local void @PR35763() {
 ; CHECK-LABEL: PR35763:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl z(%rip), %eax
-; CHECK-NEXT:    orl z+2(%rip), %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    movq %rax, tf_3_var_136(%rip)
+; CHECK-NEXT:    movzwl z(%rip), %eax
+; CHECK-NEXT:    movzwl z+2(%rip), %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, tf_3_var_136(%rip)
 ; CHECK-NEXT:    movl z+6(%rip), %eax
 ; CHECK-NEXT:    movzbl z+10(%rip), %ecx
 ; CHECK-NEXT:    shlq $32, %rcx


        


More information about the llvm-commits mailing list