[llvm] 949294f - [Matrix] Add optimization remarks for matrix expression.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 27 16:39:48 PST 2020


Author: Florian Hahn
Date: 2020-01-27T16:39:29-08:00
New Revision: 949294f39627421f4bfaaca7d5d9deacb33efbe0

URL: https://github.com/llvm/llvm-project/commit/949294f39627421f4bfaaca7d5d9deacb33efbe0
DIFF: https://github.com/llvm/llvm-project/commit/949294f39627421f4bfaaca7d5d9deacb33efbe0.diff

LOG: [Matrix] Add optimization remarks for matrix expression.

Generate remarks for matrix operations in a function. To generate remarks
for matrix expressions, the following approach is used:
1. Collect leaves of matrix expressions (done in
   RemarkGenerator::getExpressionLeaves).  Leaves are lowered matrix
   instructions without other matrix users (like stores).

2. For each leaf, create a remark containing a linearized version of the
   matrix expression.

The following improvements will be submitted as follow-ups:
* Summarize number of vector instructions generated for each expression.
* Account for shared sub-expressions.
* Propagate matrix remarks up the inlining chain.

The information provided by the matrix remarks helps users to spot cases
where matrix expressions got split up, e.g. due to inlining not
happening. The remarks allow users to address those issues, ensuring
best performance.

Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D72453

Added: 
    llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll

Modified: 
    llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 1d5a5cd62869..daf1707c2d6c 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -10,7 +10,8 @@
 //
 // TODO:
 //  * Implement multiply & add fusion
-//  * Add remark, summarizing the available matrix optimization opportunities.
+//  * Add remark, summarizing the available matrix optimization opportunities
+//    (WIP).
 //
 //===----------------------------------------------------------------------===//
 
@@ -18,7 +19,9 @@
 #include "llvm/ADT/GraphTraits.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
@@ -136,6 +139,7 @@ class LowerMatrixIntrinsics {
   Function &Func;
   const DataLayout &DL;
   const TargetTransformInfo &TTI;
+  OptimizationRemarkEmitter &ORE;
 
   /// Wrapper class representing a matrix as a set of column vectors.
   /// All column vectors must have the same vector type.
@@ -213,11 +217,12 @@ class LowerMatrixIntrinsics {
   SmallVector<Instruction *, 16> ToRemove;
 
   /// Map from instructions to their produced column matrix.
-  DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
+  MapVector<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
 
 public:
-  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
-      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
+  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+                        OptimizationRemarkEmitter &ORE)
+      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
 
   /// Return the set of column vectors that a matrix value is lowered to.
   ///
@@ -509,6 +514,9 @@ class LowerMatrixIntrinsics {
       }
     }
 
+    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+    RemarkGen.emitRemarks();
+
     for (Instruction *Inst : reverse(ToRemove))
       Inst->eraseFromParent();
 
@@ -599,6 +607,7 @@ class LowerMatrixIntrinsics {
                             Shape.NumRows, VType->getElementType(), Builder);
       createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
     }
+    Inst2ColumnMatrix[Inst] = ColumnMatrixTy();
 
     ToRemove.push_back(Inst);
   }
@@ -844,13 +853,301 @@ class LowerMatrixIntrinsics {
     finalizeLowering(Inst, Result, Builder);
     return true;
   }
+
+  /// Helper to linearize a matrix expression tree into a string. Currently
+  /// matrix expressions are linearized by starting at an expression leaf and
+  /// linearizing bottom up.
+  struct ExprLinearizer {
+    unsigned LengthToBreak = 100;
+    std::string Str;
+    raw_string_ostream Stream;
+    unsigned LineLength = 0;
+    const DataLayout &DL;
+
+    /// Mapping from instructions to column matrixes. It is used to identify
+    /// matrix instructions.
+    const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
+
+    /// Used to keep track of sub-expressions that get reused while linearizing
+    /// the expression. Re-used sub-expressions are marked as (reused).
+    SmallPtrSet<Value *, 8> ReusedExprs;
+
+    ExprLinearizer(const DataLayout &DL,
+                   const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix)
+        : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix) {}
+
+    void indent(unsigned N) {
+      LineLength += N;
+      for (unsigned i = 0; i < N; i++)
+        Stream << " ";
+    }
+
+    void lineBreak() {
+      Stream << "\n";
+      LineLength = 0;
+    }
+
+    void maybeIndent(unsigned Indent) {
+      if (LineLength >= LengthToBreak)
+        lineBreak();
+
+      if (LineLength == 0)
+        indent(Indent);
+    }
+
+    void write(const std::string &S) {
+      LineLength += S.size();
+      Stream << S;
+    }
+
+    Value *getUnderlyingObjectThroughLoads(Value *V) {
+      if (Value *Ptr = getPointerOperand(V))
+        return getUnderlyingObjectThroughLoads(Ptr);
+      else if (V->getType()->isPointerTy())
+        return GetUnderlyingObject(V, DL);
+      return V;
+    }
+
+    /// Returns true if \p V is a matrix value.
+    bool isMatrix(Value *V) const {
+      return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
+    }
+
+    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
+    /// \p SS.
+    void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
+      auto M = Inst2ColumnMatrix.find(V);
+      if (M == Inst2ColumnMatrix.end())
+        SS << "unknown";
+      else {
+        SS << M->second.getNumRows();
+        SS << "x";
+        SS << M->second.getNumColumns();
+      }
+    }
+
+    /// Write the called function name. Handles calls to llvm.matrix.*
+    /// specially: we write the name, followed by the dimensions of the input
+    /// matrixes, followed by the scalar type name.
+    void writeFnName(CallInst *CI) {
+      if (!CI->getCalledFunction())
+        write("<no called fn>");
+      else {
+        StringRef Name = CI->getCalledFunction()->getName();
+        if (!Name.startswith("llvm.matrix")) {
+          write(Name);
+          return;
+        }
+        IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+        write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
+                  .drop_front(StringRef("llvm.matrix.").size()));
+        write(".");
+        std::string Tmp = "";
+        raw_string_ostream SS(Tmp);
+
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::matrix_multiply:
+          prettyPrintMatrixType(II->getOperand(0), SS);
+          SS << ".";
+          prettyPrintMatrixType(II->getOperand(1), SS);
+          SS << "." << *II->getType()->getScalarType();
+          break;
+        case Intrinsic::matrix_transpose:
+          prettyPrintMatrixType(II->getOperand(0), SS);
+          SS << "." << *II->getType()->getScalarType();
+          break;
+        case Intrinsic::matrix_columnwise_load:
+          prettyPrintMatrixType(II, SS);
+          SS << "." << *II->getType()->getScalarType();
+          break;
+        case Intrinsic::matrix_columnwise_store:
+          prettyPrintMatrixType(II->getOperand(0), SS);
+          SS << "." << *II->getOperand(0)->getType()->getScalarType();
+          break;
+        default:
+          llvm_unreachable("Unhandled case");
+        }
+        SS.flush();
+        write(Tmp);
+      }
+    }
+
+    unsigned getNumShapeArgs(CallInst *CI) const {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::matrix_multiply:
+          return 3;
+        case Intrinsic::matrix_transpose:
+        case Intrinsic::matrix_columnwise_load:
+        case Intrinsic::matrix_columnwise_store:
+          return 2;
+        default:
+          return 0;
+        }
+      }
+      return 0;
+    }
+
+    /// Special printing for values: for pointers, we print whether they refer
+    /// to a (function) external address or a stack address; for other values
+    /// we print either the constant or "scalar"/"matrix".
+    void write(Value *V) {
+      V = getUnderlyingObjectThroughLoads(V);
+      if (V->getType()->isPointerTy()) {
+        if (isa<AllocaInst>(V)) {
+          Stream << "stack addr";
+          LineLength += StringRef("stack addr").size();
+        } else {
+          Stream << "addr";
+          LineLength += StringRef("addr").size();
+        }
+        if (!V->getName().empty()) {
+          Stream << " %" << V->getName() << "";
+          LineLength += V->getName().size() + 2;
+        }
+        return;
+      }
+
+      std::string Tmp;
+      raw_string_ostream TmpStream(Tmp);
+
+      if (auto *CI = dyn_cast<ConstantInt>(V))
+        TmpStream << CI->getValue();
+      else if (isa<Constant>(V))
+        TmpStream << "constant";
+      else {
+        if (isMatrix(V))
+          TmpStream << "matrix";
+        else
+          TmpStream << "scalar";
+      }
+      TmpStream.flush();
+      Tmp = StringRef(Tmp).trim();
+      LineLength += Tmp.size();
+      Stream << Tmp;
+    }
+
+    /// Linearize expression \p Expr starting at an indentation of \p Indent.
+    /// Expressions that are re-used multiple times are prefixed with (reused)
+    /// at the re-used root instruction.
+    void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused) {
+      auto *I = cast<Instruction>(Expr);
+      maybeIndent(Indent);
+      SmallVector<Value *, 8> Ops;
+
+      bool Reused = !ReusedExprs.insert(Expr).second;
+      if (Reused && !ParentReused)
+        write("(reused) ");
+
+      if (auto *CI = dyn_cast<CallInst>(I)) {
+        writeFnName(CI);
+
+        Ops.append(CallSite(CI).arg_begin(),
+                   CallSite(CI).arg_end() - getNumShapeArgs(CI));
+      } else if (isa<BitCastInst>(Expr)) {
+        // Special case bitcasts, which are used to materialize matrixes from
+        // non-matrix ops.
+        write("matrix");
+        return;
+      } else {
+        Ops.append(I->value_op_begin(), I->value_op_end());
+        write(std::string(I->getOpcodeName()));
+      }
+
+      write(std::string("("));
+
+      unsigned NumOpsToBreak = 1;
+      if (match(Expr, m_Intrinsic<Intrinsic::matrix_columnwise_load>()))
+        NumOpsToBreak = 2;
+
+      for (Value *Op : Ops) {
+        if (Ops.size() > NumOpsToBreak)
+          lineBreak();
+
+        maybeIndent(Indent + 1);
+        if (isMatrix(Op))
+          linearizeExpr(Op, Indent + 1, Reused);
+        else
+          write(Op);
+        if (Op != Ops.back())
+          write(", ");
+      }
+
+      write(")");
+    }
+
+    const std::string &getResult() {
+      Stream.flush();
+      return Str;
+    }
+  };
+
+  /// Generate remarks for matrix operations in a function. To generate remarks
+  /// for matrix expressions, the following approach is used:
+  /// 1. Collect leaves of matrix expressions (done in
+  ///    RemarkGenerator::getExpressionLeaves).  Leaves are lowered matrix
+  ///    instructions without other matrix users (like stores).
+  ///
+  /// 2. For each leaf, create a remark containing a linearized version of the
+  ///    matrix expression.
+  ///
+  /// TODO:
+  ///  * Summarize number of vector instructions generated for each expression.
+  ///  * Account for shared sub-expressions.
+  ///  * Propagate matrix remarks up the inlining chain.
+  struct RemarkGenerator {
+    const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
+    OptimizationRemarkEmitter &ORE;
+    const DataLayout &DL;
+
+    RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
+                    OptimizationRemarkEmitter &ORE, const DataLayout &DL)
+        : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
+
+    /// Return all leaves of matrix expressions. Those are instructions in
+    /// Inst2ColumnMatrix returning void. Currently that should only include
+    /// stores.
+    SmallVector<Value *, 4> getExpressionLeaves() {
+      SmallVector<Value *, 4> Leaves;
+      for (auto &KV : Inst2ColumnMatrix)
+        if (KV.first->getType()->isVoidTy())
+          Leaves.push_back(KV.first);
+
+      return Leaves;
+    }
+
+    void emitRemarks() {
+      if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
+        return;
+
+      // Find leaves of matrix expressions.
+      auto Leaves = getExpressionLeaves();
+
+      // Generate remarks for each leaf.
+      for (auto *L : Leaves) {
+        OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
+                               cast<Instruction>(L)->getDebugLoc(),
+                               cast<Instruction>(L)->getParent());
+        Rem << "Lowered matrix expression ";
+        Rem << ("\n" + linearize(L, DL));
+        ORE.emit(Rem);
+      }
+    }
+
+    std::string linearize(Value *L, const DataLayout &DL) {
+      ExprLinearizer Lin(DL, Inst2ColumnMatrix);
+      Lin.linearizeExpr(L, 0, false);
+      return Lin.getResult();
+    }
+  };
 };
 } // namespace
 
 PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-  LowerMatrixIntrinsics LMT(F, TTI);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  LowerMatrixIntrinsics LMT(F, TTI, ORE);
   if (LMT.Visit()) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -871,14 +1168,16 @@ class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
   }
 
   bool runOnFunction(Function &F) override {
-    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    LowerMatrixIntrinsics LMT(F, *TTI);
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    LowerMatrixIntrinsics LMT(F, TTI, ORE);
     bool C = LMT.Visit();
     return C;
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.setPreservesCFG();
   }
 };
@@ -888,6 +1187,7 @@ static const char pass_name[] = "Lower the matrix intrinsics";
 char LowerMatrixIntrinsicsLegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
                     false, false)
 

diff  --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
new file mode 100644
index 000000000000..402f83f3ef38
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -0,0 +1,195 @@
+; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+; CHECK-LABEL: remark: test.h:40:20: Lowered matrix expression
+; CHECK-NEXT: store(
+; CHECK-NEXT:  transpose.2x6.double(load(addr %A)),
+; CHECK-NEXT:  addr %B)
+define void @transpose(<12 x double>* %A, <12 x double>* %B) !dbg !23 {
+  %load = load <12 x double>, <12 x double>* %A, !dbg !24
+  %t = call <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double> %load, i32 2, i32 6), !dbg !24
+  store <12 x double> %t, <12 x double>* %B, !dbg !24
+  ret void
+}
+
+declare <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double>, i32, i32)
+
+
+; CHECK-LABEL: remark: test.h:50:20: Lowered matrix expression
+; CHECK-NEXT:  store(
+; CHECK-NEXT:   multiply.2x6.6x2.double(
+; CHECK-NEXT:    load(addr %A),
+; CHECK-NEXT:    load(addr %B)),
+; CHECK-NEXT:   addr %C)
+define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !dbg !25 {
+  %A.matrix = load <12 x double>, <12 x double>* %A, !dbg !26
+  %B.matrix = load <12 x double>, <12 x double>* %B, !dbg !26
+  %t = call <4 x double> @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
+  store <4 x double> %t, <4 x double>* %C, !dbg !26
+  ret void
+}
+
+declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
+
+; CHECK-LABEL: remark: test.h:60:20: Lowered matrix expression
+; CHECK-NEXT:  store(
+; CHECK-NEXT:   columnwise.load.3x3.double(addr %A, 5),
+; CHECK-NEXT:   addr %B)
+define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 {
+  %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !28
+  store <9 x double> %A.matrix, <9 x double>* %B, !dbg !28
+  ret void
+}
+
+declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32)
+
+; CHECK-LABEL: remark: test.h:70:20: Lowered matrix expression
+; CHECK-NEXT:  columnwise.store.3x3.double(
+; CHECK-NEXT:   columnwise.load.3x3.double(addr %A, 5),
+; CHECK-NEXT:   addr %B,
+; CHECK-NEXT:   10)
+define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 {
+  %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !30
+  call void @llvm.matrix.columnwise.store(<9 x double> %A.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !30
+  ret void
+}
+
+declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32)
+
+; CHECK-LABEL: remark: test.h:80:20: Lowered matrix expression
+; CHECK-NEXT:  columnwise.store.3x3.double(
+; CHECK-NEXT:   fmul(
+; CHECK-NEXT:    fadd(
+; CHECK-NEXT:     columnwise.load.3x3.double(addr %A, 5)
+; CHECK-NEXT:     (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:    (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:   addr %B,
+; CHECK-NEXT:   10)
+
+define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
+  %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
+  ret void
+}
+
+; CHECK-LABEL: remark: test.h:90:20: Lowered matrix expression
+; CHECK-NEXT:  columnwise.store.3x3.double(
+; CHECK-NEXT:   fmul(
+; CHECK-NEXT:    fadd(
+; CHECK-NEXT:     columnwise.load.3x3.double(addr %A, 5)
+; CHECK-NEXT:     (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:    (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:   addr %B,
+; CHECK-NEXT:   10)
+; CHECK-NEXT:  remark: test.h:90:20: Lowered matrix expression
+; CHECK-NEXT:  store(
+; CHECK-NEXT:   multiply.2x6.6x2.double(
+; CHECK-NEXT:    load(addr %C),
+; CHECK-NEXT:    load(addr %D)),
+; CHECK-NEXT:   addr %E)
+
+define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
+  %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
+
+  %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
+  %D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34
+  %Mult.matrix = call <4 x double> @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
+  store <4 x double> %Mult.matrix, <4 x double>* %E, !dbg !34
+
+  ret void
+}
+
+; CHECK-LABEL: remark: test.h:100:20: Lowered matrix expression
+; CHECK-NEXT:  columnwise.store.3x3.double(
+; CHECK-NEXT:   fmul(
+; CHECK-NEXT:    fadd(
+; CHECK-NEXT:     columnwise.load.3x3.double(addr %A, 5)
+; CHECK-NEXT:     (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:    (reused) columnwise.load.3x3.double(addr %A, 5)),
+; CHECK-NEXT:   stack addr %B,
+; CHECK-NEXT:   10)
+define void @stackaddresses(<9 x double>* %A) !dbg !35 {
+  %B = alloca <9 x double>
+  %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
+  %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
+  %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+  call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
+  ret void
+}
+
+; CHECK-LABEL: remark: test.h:30:20: Lowered matrix expression
+; CHECK-NEXT:  store(
+; CHECK-NEXT:   transpose.5x3.double(load(addr %A)),
+; CHECK-NEXT:   stack addr %s1)
+%S1 = type {<15 x double>*}
+define void @get_underlying_object(%S1* %A) !dbg !21 {
+entry:
+  %s1 = alloca <15 x double>, !dbg !22
+  %a1 = getelementptr %S1, %S1* %A, i32 0, i32 0, !dbg !22
+  %a2 = load <15 x double>*, <15 x double>** %a1, !dbg !22
+  %av = load <15 x double>, <15 x double>* %a2, !dbg !22
+
+  %s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
+  %s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
+
+  %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
+
+  store <15 x double> %t, <15 x double>* %s3, !dbg !22
+  ret void
+}
+
+declare <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double>, i32, i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.h", directory: "/test")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !11}
+!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+!14 = !DILocation(line: 1, column: 27, scope: !5)
+
+!5 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!19 = !DILocation(line: 10, column: 20, scope: !5)
+!20 = !DILocation(line: 10, column: 10, scope: !5)
+
+!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!22 = !DILocation(line: 30, column: 20, scope: !21)
+
+!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!24 = !DILocation(line: 40, column: 20, scope: !23)
+
+!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!26 = !DILocation(line: 50, column: 20, scope: !25)
+
+!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!28 = !DILocation(line: 60, column: 20, scope: !27)
+
+!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!30 = !DILocation(line: 70, column: 20, scope: !29)
+
+!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!32 = !DILocation(line: 80, column: 20, scope: !31)
+
+!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!34 = !DILocation(line: 90, column: 20, scope: !33)
+
+!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!36 = !DILocation(line: 100, column: 20, scope: !35)


        


More information about the llvm-commits mailing list