[llvm] 62e228f - [Matrix] Add info about number of operations to remarks.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 27 17:45:11 PST 2020
Author: Florian Hahn
Date: 2020-01-27T17:43:39-08:00
New Revision: 62e228f8fdb0ca86fc1663ef058f875021ede9a0
URL: https://github.com/llvm/llvm-project/commit/62e228f8fdb0ca86fc1663ef058f875021ede9a0
DIFF: https://github.com/llvm/llvm-project/commit/62e228f8fdb0ca86fc1663ef058f875021ede9a0.diff
LOG: [Matrix] Add info about number of operations to remarks.
This patch updates the remark to also include a summary of the number of
vector operations generated for each matrix expression.
Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke
Reviewed By: anemet
Differential Revision: https://reviews.llvm.org/D72480
Added:
Modified:
llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index daf1707c2d6c..149f87efb705 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -141,11 +141,30 @@ class LowerMatrixIntrinsics {
const TargetTransformInfo &TTI;
OptimizationRemarkEmitter &ORE;
+ /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation.
+ struct OpInfoTy {
+ /// Number of stores emitted to generate this matrix.
+ unsigned NumStores = 0;
+ /// Number of loads emitted to generate this matrix.
+ unsigned NumLoads = 0;
+ /// Number of compute operations emitted to generate this matrix.
+ unsigned NumComputeOps = 0;
+
+ OpInfoTy &operator+=(const OpInfoTy &RHS) {
+ NumStores += RHS.NumStores;
+ NumLoads += RHS.NumLoads;
+ NumComputeOps += RHS.NumComputeOps;
+ return *this;
+ }
+ };
+
/// Wrapper class representing a matrix as a set of column vectors.
/// All column vectors must have the same vector type.
class ColumnMatrixTy {
SmallVector<Value *, 16> Columns;
+ OpInfoTy OpInfo;
+
public:
ColumnMatrixTy() : Columns() {}
ColumnMatrixTy(ArrayRef<Value *> Cols)
@@ -167,6 +186,10 @@ class LowerMatrixIntrinsics {
void addColumn(Value *V) { Columns.push_back(V); }
+ VectorType *getColumnTy() {
+ return cast<VectorType>(Columns[0]->getType());
+ }
+
iterator_range<SmallVector<Value *, 8>::iterator> columns() {
return make_range(Columns.begin(), Columns.end());
}
@@ -177,6 +200,29 @@ class LowerMatrixIntrinsics {
return Columns.size() == 1 ? Columns[0]
: concatenateVectors(Builder, Columns);
}
+
+ ColumnMatrixTy &addNumLoads(unsigned N) {
+ OpInfo.NumLoads += N;
+ return *this;
+ }
+
+ void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
+
+ ColumnMatrixTy &addNumStores(unsigned N) {
+ OpInfo.NumStores += N;
+ return *this;
+ }
+
+ ColumnMatrixTy &addNumComputeOps(unsigned N) {
+ OpInfo.NumComputeOps += N;
+ return *this;
+ }
+
+ unsigned getNumStores() const { return OpInfo.NumStores; }
+ unsigned getNumLoads() const { return OpInfo.NumLoads; }
+ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
+
+ const OpInfoTy &getOpInfo() const { return OpInfo; }
};
struct ShapeInfo {
@@ -224,6 +270,20 @@ class LowerMatrixIntrinsics {
OptimizationRemarkEmitter &ORE)
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
+ unsigned getNumOps(Type *VT) {
+ assert(isa<VectorType>(VT) && "Expected vector type");
+ return getNumOps(VT->getScalarType(),
+ cast<VectorType>(VT)->getNumElements());
+ }
+
+ //
+ /// Return the estimated number of vector ops required for an operation on
+ /// \p N elements of scalar type \p ST.
+ unsigned getNumOps(Type *ST, unsigned N) {
+ return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
+ double(TTI.getRegisterBitWidth(true)));
+ }
+
/// Return the set of column vectors that a matrix value is lowered to.
///
/// If we lowered \p MatrixVal, just return the cache result column matrix.
@@ -582,7 +642,10 @@ class LowerMatrixIntrinsics {
Result.addColumn(Column);
}
- finalizeLowering(Inst, Result, Builder);
+ finalizeLowering(Inst,
+ Result.addNumLoads(getNumOps(Result.getColumnTy()) *
+ Result.getNumColumns()),
+ Builder);
}
/// Lowers llvm.matrix.columnwise.load.
@@ -607,7 +670,8 @@ class LowerMatrixIntrinsics {
Shape.NumRows, VType->getElementType(), Builder);
createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
}
- Inst2ColumnMatrix[Inst] = ColumnMatrixTy();
+ Inst2ColumnMatrix[Inst] = ColumnMatrixTy().addNumStores(
+ getNumOps(LM.getColumnTy()) * LM.getNumColumns());
ToRemove.push_back(Inst);
}
@@ -668,8 +732,9 @@ class LowerMatrixIntrinsics {
}
Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
- IRBuilder<> &Builder, bool AllowContraction) {
-
+ IRBuilder<> &Builder, bool AllowContraction,
+ unsigned &NumComputeOps) {
+ NumComputeOps += getNumOps(A->getType());
if (!Sum)
return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
@@ -681,10 +746,12 @@ class LowerMatrixIntrinsics {
Func.getParent(), Intrinsic::fmuladd, A->getType());
return Builder.CreateCall(FMulAdd, {A, B, Sum});
}
+ NumComputeOps += getNumOps(A->getType());
Value *Mul = Builder.CreateFMul(A, B);
return Builder.CreateFAdd(Sum, Mul);
}
+ NumComputeOps += getNumOps(A->getType());
Value *Mul = Builder.CreateMul(A, B);
return Builder.CreateAdd(Sum, Mul);
}
@@ -738,6 +805,7 @@ class LowerMatrixIntrinsics {
bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
MatMul->hasAllowContract());
+ unsigned NumComputeOps = 0;
// Multiply columns from the first operand with scalars from the second
// operand. Then move along the K axes and accumulate the columns. With
// this the adds can be vectorized without reassociation.
@@ -754,11 +822,12 @@ class LowerMatrixIntrinsics {
Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
- Builder, AllowContract);
+ Builder, AllowContract, NumComputeOps);
}
Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
}
}
+ Result.addNumComputeOps(NumComputeOps);
finalizeLowering(MatMul, Result, Builder);
}
@@ -788,7 +857,13 @@ class LowerMatrixIntrinsics {
Result.addColumn(ResultColumn);
}
- finalizeLowering(Inst, Result, Builder);
+ // TODO: Improve estimate of operations needed for transposes. Currently we
+ // just count the insertelement/extractelement instructions, but do not
+ // account for later simplifications/combines.
+ finalizeLowering(
+ Inst,
+ Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
+ Builder);
}
/// Lower load instructions, if shape information is available.
@@ -850,7 +925,10 @@ class LowerMatrixIntrinsics {
Result.addColumn(
BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C)));
- finalizeLowering(Inst, Result, Builder);
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getColumnTy()) *
+ Result.getNumColumns()),
+ Builder);
return true;
}
@@ -1116,6 +1194,23 @@ class LowerMatrixIntrinsics {
return Leaves;
}
+ /// Sum the load, store and compute op counts for the expression rooted at
+ /// \p Root. Expressions used multiple times are counted once.
+ OpInfoTy sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs) {
+ auto CM = Inst2ColumnMatrix.find(Root);
+ if (CM == Inst2ColumnMatrix.end())
+ return {};
+
+ // Already counted this expression. Stop.
+ if (!ReusedExprs.insert(Root).second)
+ return {};
+
+ OpInfoTy Count = CM->second.getOpInfo();
+ for (Value *Op : cast<Instruction>(Root)->operand_values())
+ Count += sumOpInfos(Op, ReusedExprs);
+ return Count;
+ }
+
void emitRemarks() {
if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
return;
@@ -1125,10 +1220,16 @@ class LowerMatrixIntrinsics {
// Generate remarks for each leaf.
for (auto *L : Leaves) {
+ SmallPtrSet<Value *, 8> ReusedExprs;
+ auto Counts = sumOpInfos(L, ReusedExprs);
OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
cast<Instruction>(L)->getDebugLoc(),
cast<Instruction>(L)->getParent());
- Rem << "Lowered matrix expression ";
+ Rem << "Lowered with ";
+ Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+ << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+ << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
+
Rem << ("\n" + linearize(L, DL));
ORE.emit(Rem);
}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
index 402f83f3ef38..c28a260a2180 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -3,7 +3,7 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "aarch64-apple-ios"
-; CHECK-LABEL: remark: test.h:40:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:40:20: Lowered with 6 stores, 6 loads, 24 compute ops
; CHECK-NEXT: store(
; CHECK-NEXT: transpose.2x6.double(load(addr %A)),
; CHECK-NEXT: addr %B)
@@ -17,7 +17,7 @@ define void @transpose(<12 x double>* %A, <12 x double>* %B) !dbg !23 {
declare <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double>, i32, i32)
-; CHECK-LABEL: remark: test.h:50:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:50:20: Lowered with 2 stores, 12 loads, 22 compute ops
; CHECK-NEXT: store(
; CHECK-NEXT: multiply.2x6.6x2.double(
; CHECK-NEXT: load(addr %A),
@@ -33,7 +33,7 @@ define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !d
declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
-; CHECK-LABEL: remark: test.h:60:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:60:20: Lowered with 6 stores, 6 loads, 0 compute ops
; CHECK-NEXT: store(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
; CHECK-NEXT: addr %B)
@@ -45,7 +45,7 @@ define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 {
declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32)
-; CHECK-LABEL: remark: test.h:70:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:70:20: Lowered with 6 stores, 6 loads, 0 compute ops
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
; CHECK-NEXT: addr %B,
@@ -58,7 +58,7 @@ define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 {
declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32)
-; CHECK-LABEL: remark: test.h:80:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:80:20: Lowered with 6 stores, 6 loads, 12 compute ops
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
@@ -76,7 +76,7 @@ define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
ret void
}
-; CHECK-LABEL: remark: test.h:90:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:90:20: Lowered with 6 stores, 6 loads, 12 compute ops
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
@@ -85,7 +85,7 @@ define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: addr %B,
; CHECK-NEXT: 10)
-; CHECK-NEXT: remark: test.h:90:20: Lowered matrix expression
+; CHECK-NEXT: remark: test.h:90:20: Lowered with 2 stores, 12 loads, 22 compute ops
; CHECK-NEXT: store(
; CHECK-NEXT: multiply.2x6.6x2.double(
; CHECK-NEXT: load(addr %C),
@@ -106,7 +106,7 @@ define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x doub
ret void
}
-; CHECK-LABEL: remark: test.h:100:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:100:20: Lowered with 6 stores, 6 loads, 12 compute ops
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
@@ -124,7 +124,7 @@ define void @stackaddresses(<9 x double>* %A) !dbg !35 {
ret void
}
-; CHECK-LABEL: remark: test.h:30:20: Lowered matrix expression
+; CHECK-LABEL: remark: test.h:30:20: Lowered with 10 stores, 9 loads, 30 compute ops
; CHECK-NEXT: store(
; CHECK-NEXT: transpose.5x3.double(load(addr %A)),
; CHECK-NEXT: stack addr %s1)
More information about the llvm-commits
mailing list