[llvm] r329585 - Support generic expansion of ordered vector reduction (PR36732)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 9 08:44:20 PDT 2018
Author: rksimon
Date: Mon Apr 9 08:44:20 2018
New Revision: 329585
URL: http://llvm.org/viewvc/llvm-project?rev=329585&view=rev
Log:
Support generic expansion of ordered vector reduction (PR36732)
Without the fast math flags, the llvm.experimental.vector.reduce.fadd/fmul intrinsic expansions must be expanded in order.
This patch scalarizes the reduction, applying the accumulator at the start of the sequence: ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[NumElts-1]
Differential Revision: https://reviews.llvm.org/D45366
Modified:
llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
llvm/trunk/lib/CodeGen/ExpandReductions.cpp
llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll
Modified: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h?rev=329585&r1=329584&r2=329585&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h (original)
+++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h Mon Apr 9 08:44:20 2018
@@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I,
LoopSafetyInfo *SafetyInfo,
OptimizationRemarkEmitter *ORE = nullptr);
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RecurrenceDescriptor::MRK_Invalid,
+ ArrayRef<Value *> RedOps = None);
+
/// Generates a vector reduction using shufflevectors to reduce the value.
Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind
Modified: llvm/trunk/lib/CodeGen/ExpandReductions.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ExpandReductions.cpp?rev=329585&r1=329584&r2=329585&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/ExpandReductions.cpp (original)
+++ llvm/trunk/lib/CodeGen/ExpandReductions.cpp Mon Apr 9 08:44:20 2018
@@ -78,13 +78,15 @@ RecurrenceDescriptor::MinMaxRecurrenceKi
bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;
- SmallVector<IntrinsicInst*, 4> Worklist;
+ SmallVector<IntrinsicInst *, 4> Worklist;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (auto II = dyn_cast<IntrinsicInst>(&*I))
Worklist.push_back(II);
for (auto *II : Worklist) {
IRBuilder<> Builder(II);
+ bool IsOrdered = false;
+ Value *Acc = nullptr;
Value *Vec = nullptr;
auto ID = II->getIntrinsicID();
auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -92,11 +94,10 @@ bool expandReductions(Function &F, const
case Intrinsic::experimental_vector_reduce_fadd:
case Intrinsic::experimental_vector_reduce_fmul:
// FMFs must be attached to the call, otherwise it's an ordered reduction
- // and it can't be handled by generating this shuffle sequence.
- // TODO: Implement scalarization of ordered reductions here for targets
- // without native support.
+ // and it can't be handled by generating a shuffle sequence.
if (!II->getFastMathFlags().isFast())
- continue;
+ IsOrdered = true;
+ Acc = II->getArgOperand(0);
Vec = II->getArgOperand(1);
break;
case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const
}
if (!TTI->shouldExpandReduction(II))
continue;
- auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ Value *Rdx =
+ IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+ : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
II->replaceAllUsesWith(Rdx);
II->eraseFromParent();
Changed = true;
Modified: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp?rev=329585&r1=329584&r2=329585&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp Mon Apr 9 08:44:20 2018
@@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V)
return V;
}
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+ unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+
+ // Extract and apply reduction ops in ascending order:
+ // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1]
+ Value *Result = Acc;
+ for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+ Value *Ext =
+ Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+ "bin.rdx");
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+ Ext);
+ }
+
+ if (!RedOps.empty())
+ propagateIRFlags(Result, RedOps);
+ }
+
+ return Result;
+}
+
// Helper to generate a log2 shuffle reduction.
Value *
llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
Modified: llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll?rev=329585&r1=329584&r2=329585&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll (original)
+++ llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll Mon Apr 9 08:44:20 2018
@@ -117,8 +117,15 @@ entry:
define float @fadd_f32_strict(<4 x float> %vec) {
; CHECK-LABEL: @fadd_f32_strict(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT: ret float [[R]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -128,8 +135,15 @@ entry:
define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-LABEL: @fadd_f32_strict_accum(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT: ret float [[R]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT: [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -169,8 +183,15 @@ entry:
define float @fmul_f32_strict(<4 x float> %vec) {
; CHECK-LABEL: @fmul_f32_strict(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT: ret float [[R]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
@@ -180,8 +201,15 @@ entry:
define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
; CHECK-LABEL: @fmul_f32_strict_accum(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT: ret float [[R]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT: [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT: [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT: ret float [[BIN_RDX3]]
;
entry:
%r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
More information about the llvm-commits
mailing list