[llvm] r375025 - [SLP] avoid reduction transform on patterns that the backend can load-combine (2nd try)

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 16 11:06:25 PDT 2019


Author: spatel
Date: Wed Oct 16 11:06:24 2019
New Revision: 375025

URL: http://llvm.org/viewvc/llvm-project?rev=375025&view=rev
Log:
[SLP] avoid reduction transform on patterns that the backend can load-combine (2nd try)

The first attempt at this modified the cost model in a way that avoided the vectorization,
but that caused problems for other users of the cost model (the loop vectorizer).

I don't see an ideal solution to these two related, potentially large, performance regressions:
https://bugs.llvm.org/show_bug.cgi?id=42708
https://bugs.llvm.org/show_bug.cgi?id=43146

We decided that load combining was unsuitable for IR because it could obscure other
optimizations in IR, so we removed the LoadCombiner pass and deferred to the backend.
Therefore, preventing SLP from destroying load combine opportunities requires that it
recognize patterns that could be combined later, but not do the optimization itself
(it's not a vector combine anyway, so it's probably out of scope for SLP).

Here, we add a cost-independent bailout with a conservative pattern match for a
multi-instruction sequence that can probably be reduced later.
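As a reduced illustration (a sketch only, not one of the committed tests), the match is
aimed at scalar sequences like the following, where zext'ed loads are optionally shifted
by constants and then or-reduced into a value whose total width is a legal integer type
(here 4 x i8 --> i32):

  ; Sketch, reduced from the 8-byte tests below: an 'or' reduction of zext'ed,
  ; shifted loads whose combined width (4 x i8 = i32) is a legal integer type,
  ; so the backend can fold the whole sequence into one 32-bit load.
  define i32 @load32le(i8* %p) {
    %g1 = getelementptr inbounds i8, i8* %p, i64 1
    %g2 = getelementptr inbounds i8, i8* %p, i64 2
    %g3 = getelementptr inbounds i8, i8* %p, i64 3
    %b0 = load i8, i8* %p, align 1
    %b1 = load i8, i8* %g1, align 1
    %b2 = load i8, i8* %g2, align 1
    %b3 = load i8, i8* %g3, align 1
    %z0 = zext i8 %b0 to i32
    %z1 = zext i8 %b1 to i32
    %z2 = zext i8 %b2 to i32
    %z3 = zext i8 %b3 to i32
    %s1 = shl nuw nsw i32 %z1, 8
    %s2 = shl nuw nsw i32 %z2, 16
    %s3 = shl nuw i32 %z3, 24
    %o1 = or i32 %s1, %z0
    %o2 = or i32 %o1, %s2
    %o3 = or i32 %o2, %s3
    ret i32 %o3
  }

With the bailout, SLP leaves such a sequence scalar so the backend can form a single
(little-endian) load; the 8-byte variants in the updated test file below are the cases
that previously got half-vectorized.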

In the x86 tests shown (and discussed in more detail in the bug reports), SDAG combining
will produce a single instruction such as:

  movbe   rax, qword ptr [rdi]

or:

  mov     rax, qword ptr [rdi]

rather than the (half-)vector monstrosity that we currently produce via SLP:

  vpmovzxbq       ymm0, dword ptr [rdi + 1] # ymm0 = mem[0],zero,zero,..
  vpsllvq ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
  movzx   eax, byte ptr [rdi]
  movzx   ecx, byte ptr [rdi + 5]
  shl     rcx, 40
  movzx   edx, byte ptr [rdi + 6]
  shl     rdx, 48
  or      rdx, rcx
  movzx   ecx, byte ptr [rdi + 7]
  shl     rcx, 56
  or      rcx, rdx
  or      rcx, rax
  vextracti128    xmm1, ymm0, 1
  vpor    xmm0, xmm0, xmm1
  vpshufd xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
  vpor    xmm0, xmm0, xmm1
  vmovq   rax, xmm0
  or      rax, rcx
  vzeroupper
  ret

Differential Revision: https://reviews.llvm.org/D67841

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=375025&r1=375024&r2=375025&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Wed Oct 16 11:06:24 2019
@@ -618,6 +618,15 @@ public:
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;
 
+  /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+  /// can be load combined in the backend. Load combining may not be allowed in
+  /// the IR optimizer, so we do not want to alter the pattern. For example,
+  /// partially transforming a scalar bswap() pattern into vector code is
+  /// effectively impossible for the backend to undo.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  ///       may not be necessary.
+  bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -3284,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTre
   return true;
 }
 
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+  if (RdxOpcode != Instruction::Or)
+    return false;
+
+  unsigned NumElts = VectorizableTree[0]->Scalars.size();
+  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+
+  // Look past the reduction to find a source value. Arbitrarily follow the
+  // path through operand 0 of any 'or'. Also, peek through optional
+  // shift-left-by-constant.
+  Value *ZextLoad = FirstReduced;
+  while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+         match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+    ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+  // Check if the input to the reduction is an extended load.
+  Value *LoadPtr;
+  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+    return false;
+
+  // Require that the total load bit width is a legal integer type.
+  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+  Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+  LLVMContext &Context = FirstReduced->getContext();
+  if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+    return false;
+
+  // Everything matched - assume that we can fold the whole sequence using
+  // load combining.
+  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
+             << *(cast<Instruction>(FirstReduced)) << "\n");
+
+  return true;
+}
+
 bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
   // We can vectorize the tree if its size is greater than or equal to the
   // minimum size specified by the MinTreeSize command line option.
@@ -6374,6 +6420,8 @@ public:
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
+      if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+        break;
 
       V.computeMinimumValueSizes();
 

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll?rev=375025&r1=375024&r2=375025&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll Wed Oct 16 11:06:24 2019
@@ -15,31 +15,37 @@ define i64 @load_bswap(%v8i8* %p) {
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
 ; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
 ; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
 ; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@ define i64 @load_bswap_nop_shift(%v8i8*
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT:    [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
 ;
   %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
   %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@ define i64 @load64le(i8* %arg) {
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
 ; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
 ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
 ; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@ define i64 @load64le_nop_shift(i8* %arg)
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT:    [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
 ;
   %g1 = getelementptr inbounds i8, i8* %arg, i64 1
   %g2 = getelementptr inbounds i8, i8* %arg, i64 2



