[llvm] r336130 - [SLP] Recognize min/max pattern using instructions producing same values.
Farhana Aleen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 2 10:55:32 PDT 2018
Author: faaleen
Date: Mon Jul 2 10:55:31 2018
New Revision: 336130
URL: http://llvm.org/viewvc/llvm-project?rev=336130&view=rev
Log:
[SLP] Recognize min/max pattern using instructions producing same values.
Summary: It is common to see the following min/max pattern during the intermediate stages of SLP vectorization, because redundant gather sequences (such as the duplicated extractelements below) are only cleaned up once at the end, in optimizeGatherSequence. This patch recognizes such patterns and allows more vectorization.
%1 = extractelement <2 x i32> %a, i32 0
%2 = extractelement <2 x i32> %a, i32 1
%cond = icmp sgt i32 %1, %2
%3 = extractelement <2 x i32> %a, i32 0
%4 = extractelement <2 x i32> %a, i32 1
%select = select i1 %cond, i32 %3, i32 %4
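For illustration, the matching idea can be condensed into the following standalone C++ sketch. This is not the committed HorizontalReduction code: the helper name isMinMaxOverSameValues is hypothetical, and the committed change additionally restricts the duplicated operands to extractelement instructions and maps the compare predicate to a min/max reduction kind (see the diff below).

// A minimal sketch, assuming LLVM's IR and PatternMatch headers; it only
// illustrates the recognition idea, not the exact committed logic.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true if Sel looks like select((cmp Pred A, B), A', B') where the
// compare operands either are the select operands themselves or are distinct
// instructions computing identical values (e.g. duplicated extractelements).
static bool isMinMaxOverSameValues(SelectInst *Sel, CmpInst::Predicate &Pred) {
  Value *TrueV = Sel->getTrueValue();
  Value *FalseV = Sel->getFalseValue();
  Instruction *L = nullptr, *R = nullptr;

  // Exact-operand form: the compare reuses the select operands directly.
  if (match(Sel->getCondition(),
            m_Cmp(Pred, m_Specific(TrueV), m_Specific(FalseV))))
    return true;

  // Same-value form (what this patch adds): the compare operands are other
  // instructions that Instruction::isIdenticalTo reports as equal to the
  // select operands; this also covers the mixed case where only one side is
  // duplicated, since an instruction is identical to itself.
  if (match(Sel->getCondition(),
            m_Cmp(Pred, m_Instruction(L), m_Instruction(R))))
    return isa<Instruction>(TrueV) && isa<Instruction>(FalseV) &&
           L->isIdenticalTo(cast<Instruction>(TrueV)) &&
           R->isIdenticalTo(cast<Instruction>(FalseV));

  return false;
}

On the pattern above, Pred binds to ICMP_SGT and the duplicated extractelements %3/%4 are identical to %1/%2, so the select is recognized as a signed-max (RK_Max) reduction operation.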
Author: FarhanaAleen
Reviewed By: ABataev, RKSimon, spatel
Differential Revision: https://reviews.llvm.org/D47608
Modified:
llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=336130&r1=336129&r2=336130&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Mon Jul 2 10:55:31 2018
@@ -5469,6 +5469,77 @@ class HorizontalReduction {
return OperationData(
Instruction::FCmp, LHS, RHS, RK_Max,
cast<Instruction>(Select->getCondition())->hasNoNaNs());
+ } else {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+ // During the intermediate stages of SLP, it's very common to have
+ // pattern like this (since optimizeGatherSequence is run only once
+ // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ LHS = Select->getTrueValue();
+ RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return OperationData(V);
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return OperationData(V);
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return OperationData(V);
+ }
+ switch (Pred) {
+ default:
+ return OperationData(V);
+
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
+
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
+ cast<Instruction>(Cond)->hasNoNaNs());
+
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
+
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
+ cast<Instruction>(Cond)->hasNoNaNs());
+ }
}
}
return OperationData(V);
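The enabler in the new else-branch above is Instruction::isIdenticalTo, which compares opcode, type, and operands rather than pointer identity, so two separate extractelement instructions reading the same lane of the same vector count as the same value even though a plain m_Specific(...) match on the Value* would fail. A minimal sketch of that property, assuming the in-tree C++ API at the time of this commit (the function and names used here are made up for illustration):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cassert>

using namespace llvm;

static void demoIdenticalExtracts() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  auto *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 2);
  auto *FnTy = FunctionType::get(Type::getInt1Ty(Ctx), {VecTy}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> Builder(BB);

  Value *A = &*F->arg_begin();
  // Two distinct extractelement instructions reading lane 0 of %a.
  auto *E1 = cast<Instruction>(Builder.CreateExtractElement(A, Builder.getInt32(0)));
  auto *E2 = cast<Instruction>(Builder.CreateExtractElement(A, Builder.getInt32(0)));
  Builder.CreateRet(Builder.CreateICmpEQ(E1, E2));

  assert(E1 != E2);              // Different Value*s, so m_Specific(E1) would not match E2.
  assert(E1->isIdenticalTo(E2)); // But identical: same opcode, type, and operands.
}

int main() {
  demoIdenticalExtracts();
  return 0;
}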
Modified: llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll?rev=336130&r1=336129&r2=336130&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll Mon Jul 2 10:55:31 2018
@@ -20,21 +20,19 @@ define i32 @smaxv6() {
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
-; GFX9-NEXT: [[LOAD3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; GFX9-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], i32 [[SELECT1]], i32 [[LOAD3]]
-; GFX9-NEXT: [[LOAD4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; GFX9-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], i32 [[SELECT2]], i32 [[LOAD4]]
-; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; GFX9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], i32 [[SELECT3]], i32 [[LOAD5]]
-; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[SELECT4]], i32 [[LOAD6]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
-; GFX9-NEXT: ret i32 [[SELECT5]]
+; GFX9-NEXT: ret i32 [[OP_EXTRA]]
;
%load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -69,21 +67,19 @@ define i64 @sminv6() {
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
-; GFX9-NEXT: [[LOAD3:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
-; GFX9-NEXT: [[CMP2:%.*]] = icmp slt i64 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], i64 [[SELECT1]], i64 [[LOAD3]]
-; GFX9-NEXT: [[LOAD4:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
-; GFX9-NEXT: [[CMP3:%.*]] = icmp slt i64 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], i64 [[SELECT2]], i64 [[LOAD4]]
-; GFX9-NEXT: [[LOAD5:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
-; GFX9-NEXT: [[CMP4:%.*]] = icmp slt i64 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], i64 [[SELECT3]], i64 [[LOAD5]]
-; GFX9-NEXT: [[LOAD6:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
-; GFX9-NEXT: [[CMP5:%.*]] = icmp slt i64 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i64 [[SELECT4]], i64 [[LOAD6]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
-; GFX9-NEXT: ret i64 [[SELECT5]]
+; GFX9-NEXT: ret i64 [[OP_EXTRA]]
;
%load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
%load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@@ -118,21 +114,19 @@ define float @fmaxv6() {
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
-; GFX9-NEXT: [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
-; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
-; GFX9-NEXT: [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
-; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
-; GFX9-NEXT: [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
-; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
-; GFX9-NEXT: [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
-; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8
-; GFX9-NEXT: ret float [[SELECT5]]
+; GFX9-NEXT: ret float [[OP_EXTRA]]
;
%load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
%load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
@@ -167,21 +161,19 @@ define double @dminv6() {
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
-; GFX9-NEXT: [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
-; GFX9-NEXT: [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
-; GFX9-NEXT: [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
-; GFX9-NEXT: [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
-; GFX9-NEXT: [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
-; GFX9-NEXT: [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
-; GFX9-NEXT: [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
-; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8
-; GFX9-NEXT: ret double [[SELECT5]]
+; GFX9-NEXT: ret double [[OP_EXTRA]]
;
%load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
%load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
@@ -216,21 +208,19 @@ define i32 @smax_wdiff_valuenum(i32, i32
; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
-; GFX9-NEXT: [[LOAD3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
-; GFX9-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[SELECT1]], [[LOAD3]]
-; GFX9-NEXT: [[SELECT2:%.*]] = select i1 [[CMP2]], i32 [[SELECT1]], i32 [[LOAD3]]
-; GFX9-NEXT: [[LOAD4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
-; GFX9-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[SELECT2]], [[LOAD4]]
-; GFX9-NEXT: [[SELECT3:%.*]] = select i1 [[CMP3]], i32 [[SELECT2]], i32 [[LOAD4]]
-; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
-; GFX9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[SELECT3]], [[LOAD5]]
-; GFX9-NEXT: [[SELECT4:%.*]] = select i1 [[CMP4]], i32 [[SELECT3]], i32 [[LOAD5]]
-; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
-; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SELECT4]], [[LOAD6]]
-; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[SELECT4]], i32 [[LOAD6]]
+; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
-; GFX9-NEXT: ret i32 [[SELECT5]]
+; GFX9-NEXT: ret i32 [[OP_EXTRA]]
;
%vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
%elt1 = extractelement <2 x i32> %vload, i32 0