[llvm] r373833 - [SLP] avoid reduction transform on patterns that the backend can load-combine
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 5 11:03:59 PDT 2019
Author: spatel
Date: Sat Oct 5 11:03:58 2019
New Revision: 373833
URL: http://llvm.org/viewvc/llvm-project?rev=373833&view=rev
Log:
[SLP] avoid reduction transform on patterns that the backend can load-combine
I don't see an ideal solution to these 2 related, potentially large, perf regressions:
https://bugs.llvm.org/show_bug.cgi?id=42708
https://bugs.llvm.org/show_bug.cgi?id=43146
We decided that load combining was unsuitable for IR because it could obscure other
IR optimizations, so we removed the LoadCombiner pass and deferred to the backend.
Therefore, preventing SLP from destroying load-combine opportunities requires that it
recognize patterns that could be combined later, but not perform the optimization itself
(it's not a vector combine anyway, so it's probably out of scope for SLP).
Here, we add a scalar cost model adjustment with a conservative pattern match and cost
summation for a multi-instruction sequence that can probably be reduced later.
This should prevent SLP from creating a vector reduction unless that sequence is
extremely cheap.
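For reference, here is a minimal IR sketch of the kind of scalar sequence the pattern match
is aimed at (a 4-byte little-endian load assembled from byte loads; the function and value
names are illustrative, not taken from the tests, and the full 8-byte variants are in the
test diff below):

define i32 @load32le_sketch(i8* %p) {
  %g1 = getelementptr inbounds i8, i8* %p, i64 1
  %g2 = getelementptr inbounds i8, i8* %p, i64 2
  %g3 = getelementptr inbounds i8, i8* %p, i64 3
  %b0 = load i8, i8* %p, align 1
  %b1 = load i8, i8* %g1, align 1
  %b2 = load i8, i8* %g2, align 1
  %b3 = load i8, i8* %g3, align 1
  %z0 = zext i8 %b0 to i32
  %z1 = zext i8 %b1 to i32
  %z2 = zext i8 %b2 to i32
  %z3 = zext i8 %b3 to i32
  %s1 = shl nuw nsw i32 %z1, 8
  %s2 = shl nuw nsw i32 %z2, 16
  %s3 = shl nuw i32 %z3, 24
  ; 'or' reduction of the shifted/zexted loads - the backend can turn this
  ; whole function into a single 32-bit load.
  %o1 = or i32 %s1, %z0
  %o2 = or i32 %o1, %s2
  %o3 = or i32 %o2, %s3
  ret i32 %o3
}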
In the x86 tests shown here (and discussed in more detail in the bug reports), SDAG combining
will reduce each of these sequences to a single instruction such as:
movbe rax, qword ptr [rdi]
or:
mov rax, qword ptr [rdi]
rather than the (half-)vector monstrosity that SLP currently produces:
vpmovzxbq ymm0, dword ptr [rdi + 1] # ymm0 = mem[0],zero,zero,..
vpsllvq ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
movzx eax, byte ptr [rdi]
movzx ecx, byte ptr [rdi + 5]
shl rcx, 40
movzx edx, byte ptr [rdi + 6]
shl rdx, 48
or rdx, rcx
movzx ecx, byte ptr [rdi + 7]
shl rcx, 56
or rcx, rdx
or rcx, rax
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpor xmm0, xmm0, xmm1
vmovq rax, xmm0
or rax, rcx
vzeroupper
ret
Differential Revision: https://reviews.llvm.org/D67841
Modified:
llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=373833&r1=373832&r2=373833&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Sat Oct 5 11:03:58 2019
@@ -1129,6 +1129,16 @@ private:
/// Returns -1 if the cost is unknown.
int getInstructionThroughput(const Instruction *I) const;
+ /// Given an input value that is an element of an 'or' reduction, check if the
+ /// reduction is composed of narrower loaded values. Assuming that a
+ /// legal-sized reduction of shifted/zexted loaded values can be load combined
+ /// in the backend, create a relative cost that accounts for the removal of
+ /// the intermediate ops and replacement by a single wide load.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ Optional<int> getLoadCombineCost(unsigned Opcode,
+ ArrayRef<const Value *> Args) const;
+
/// The abstract base class used to type erase specific TTI
/// implementations.
class Concept;
Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=373833&r1=373832&r2=373833&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Sat Oct 5 11:03:58 2019
@@ -571,11 +571,64 @@ TargetTransformInfo::getOperandInfo(Valu
return OpInfo;
}
+Optional<int>
+TargetTransformInfo::getLoadCombineCost(unsigned Opcode,
+ ArrayRef<const Value *> Args) const {
+ if (Opcode != Instruction::Or)
+ return llvm::None;
+ if (Args.empty())
+ return llvm::None;
+
+ // Look past the reduction to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
+ // shift-left-by-constant.
+ const Value *ZextLoad = Args.front();
+ while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input to the reduction is an extended load.
+ Value *LoadPtr;
+ if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return llvm::None;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *WideType = ZextLoad->getType();
+ Type *EltType = LoadPtr->getType()->getPointerElementType();
+ unsigned WideWidth = WideType->getIntegerBitWidth();
+ unsigned EltWidth = EltType->getIntegerBitWidth();
+ if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
+ return llvm::None;
+
+ // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
+ // removed and replaced by a single wide load.
+ // FIXME: This is not accurate for the larger pattern where we replace
+ // multiple narrow load sequences with just 1 wide load. We could
+ // remove the addition of the wide load cost here and expect the caller
+ // to make an adjustment for that.
+ int Cost = 0;
+ Cost -= getMemoryOpCost(Instruction::Load, EltType, 0, 0);
+ Cost -= getCastInstrCost(Instruction::ZExt, WideType, EltType);
+ Cost -= getArithmeticInstrCost(Instruction::Shl, WideType);
+ Cost -= getArithmeticInstrCost(Instruction::Or, WideType);
+ Cost += getMemoryOpCost(Instruction::Load, WideType, 0, 0);
+ return Cost;
+}
+
+
int TargetTransformInfo::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args) const {
+ // Check if we can match this instruction as part of a larger pattern.
+ Optional<int> LoadCombineCost = getLoadCombineCost(Opcode, Args);
+ if (LoadCombineCost)
+ return LoadCombineCost.getValue();
+
+ // Fallback to implementation-specific overrides or base class.
int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args);
assert(Cost >= 0 && "TTI should not produce negative costs!");
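To make the relative cost above concrete, here is a rough sketch assuming, for illustration
only, a throughput cost of 1 for each scalar op: the hook subtracts the costs of the narrow
load, zext, shl, and or that are expected to disappear (-4 with those assumed unit costs) and
adds back the cost of the single wide load (+1), so getArithmeticInstrCost() returns roughly
-3 for the matched 'or'. The SLP reduction cost model then sees a cheap (negative) scalar
reduction cost, the vector reduction no longer looks profitable, and the sequence is left for
SDAG to load-combine.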
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=373833&r1=373832&r2=373833&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Sat Oct 5 11:03:58 2019
@@ -6499,10 +6499,19 @@ private:
int ScalarReduxCost = 0;
switch (ReductionData.getKind()) {
- case RK_Arithmetic:
- ScalarReduxCost =
- TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ case RK_Arithmetic: {
+ // Note: Passing in the reduction operands allows the cost model to match
+ // load combining patterns for this reduction.
+ auto *ReduxInst = cast<Instruction>(ReductionRoot);
+ SmallVector<const Value *, 2> OperandList;
+ for (Value *Operand : ReduxInst->operands())
+ OperandList.push_back(Operand);
+ ScalarReduxCost = TTI->getArithmeticInstrCost(ReductionData.getOpcode(),
+ ScalarTy, TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None, OperandList);
break;
+ }
case RK_Min:
case RK_Max:
case RK_UMin:
Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll?rev=373833&r1=373832&r2=373833&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/bad-reduction.ll Sat Oct 5 11:03:58 2019
@@ -15,31 +15,37 @@ define i64 @load_bswap(%v8i8* %p) {
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@ define i64 @load_bswap_nop_shift(%v8i8*
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@ define i64 @load64le(i8* %arg) {
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT: ret i64 [[OP_EXTRA]]
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@ define i64 @load64le_nop_shift(i8* %arg)
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: ret i64 [[TMP5]]
+; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2