[llvm] r364964 - [SLP] Recommit: Look-ahead operand reordering heuristic.

Benjamin Kramer via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 3 08:41:41 PDT 2019


I'm seeing miscompiles with AVX512 after this change (wrong floating point
results). Any ideas? I'll try getting a small test case.

On Tue, Jul 2, 2019 at 10:20 PM Vasileios Porpodas via llvm-commits <
llvm-commits at lists.llvm.org> wrote:

> Author: vporpo
> Date: Tue Jul  2 13:20:28 2019
> New Revision: 364964
>
> URL: http://llvm.org/viewvc/llvm-project?rev=364964&view=rev
> Log:
> [SLP] Recommit: Look-ahead operand reordering heuristic.
>
> Summary: This patch introduces a new heuristic for guiding operand
> reordering. The new "look-ahead" heuristic can look beyond the immediate
> predecessors. This helps break ties when the immediate predecessors have
> identical opcodes (see lit test for an example).
>
> Reviewers: RKSimon, ABataev, dtemirbulatov, Ayal, hfinkel, rnk
>
> Reviewed By: RKSimon, dtemirbulatov
>
> Subscribers: hiraditya, phosek, rnk, rcorcs, llvm-commits
>
> Tags: #llvm
>
> Differential Revision: https://reviews.llvm.org/D60897
>
> Modified:
>     llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>     llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=364964&r1=364963&r2=364964&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Jul  2
> 13:20:28 2019
> @@ -147,6 +147,20 @@ static cl::opt<unsigned> MinTreeSize(
>      "slp-min-tree-size", cl::init(3), cl::Hidden,
>      cl::desc("Only vectorize small trees if they are fully vectorizable"));
>
> +// The maximum depth that the look-ahead score heuristic will explore.
> +// The higher this value, the higher the compilation time overhead.
> +static cl::opt<int> LookAheadMaxDepth(
> +    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
> +    cl::desc("The maximum look-ahead depth for operand reordering scores"));
> +
> +// The Look-ahead heuristic goes through the users of the bundle to calculate
> +// the users cost in getExternalUsesCost(). To avoid compilation time increase
> +// we limit the number of users visited to this value.
> +static cl::opt<unsigned> LookAheadUsersBudget(
> +    "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
> +    cl::desc("The maximum number of users to visit while visiting the "
> +             "predecessors. This prevents compilation time increase."));
> +
>  static cl::opt<bool>
>      ViewSLPTree("view-slp-tree", cl::Hidden,
>                  cl::desc("Display the SLP trees with Graphviz"));
> @@ -708,6 +722,7 @@ public:
>
>      const DataLayout &DL;
>      ScalarEvolution &SE;
> +    const BoUpSLP &R;
>
>      /// \returns the operand data at \p OpIdx and \p Lane.
>      OperandData &getData(unsigned OpIdx, unsigned Lane) {
> @@ -733,6 +748,215 @@ public:
>        std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
>      }
>
> +    // The hard-coded scores listed here are not very important. When computing
> +    // the scores of matching one sub-tree with another, we are basically
> +    // counting the number of values that are matching. So even if all scores
> +    // are set to 1, we would still get a decent matching result.
> +    // However, sometimes we have to break ties. For example we may have to
> +    // choose between matching loads vs matching opcodes. This is what these
> +    // scores are helping us with: they provide the order of preference.
> +    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
> +    static const int ScoreConsecutiveLoads = 3;
> +    /// Constants.
> +    static const int ScoreConstants = 2;
> +    /// Instructions with the same opcode.
> +    static const int ScoreSameOpcode = 2;
> +    /// Instructions with alt opcodes (e.g, add + sub).
> +    static const int ScoreAltOpcodes = 1;
> +    /// Identical instructions (a.k.a. splat or broadcast).
> +    static const int ScoreSplat = 1;
> +    /// Matching with an undef is preferable to failing.
> +    static const int ScoreUndef = 1;
> +    /// Score for failing to find a decent match.
> +    static const int ScoreFail = 0;
> +    /// User external to the vectorized code.
> +    static const int ExternalUseCost = 1;
> +    /// The user is internal but in a different lane.
> +    static const int UserInDiffLaneCost = ExternalUseCost;
> +
> +    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
> +    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
> +                               ScalarEvolution &SE) {
> +      auto *LI1 = dyn_cast<LoadInst>(V1);
> +      auto *LI2 = dyn_cast<LoadInst>(V2);
> +      if (LI1 && LI2)
> +        return isConsecutiveAccess(LI1, LI2, DL, SE)
> +                   ? VLOperands::ScoreConsecutiveLoads
> +                   : VLOperands::ScoreFail;
> +
> +      auto *C1 = dyn_cast<Constant>(V1);
> +      auto *C2 = dyn_cast<Constant>(V2);
> +      if (C1 && C2)
> +        return VLOperands::ScoreConstants;
> +
> +      auto *I1 = dyn_cast<Instruction>(V1);
> +      auto *I2 = dyn_cast<Instruction>(V2);
> +      if (I1 && I2) {
> +        if (I1 == I2)
> +          return VLOperands::ScoreSplat;
> +        InstructionsState S = getSameOpcode({I1, I2});
> +        // Note: Only consider instructions with <= 2 operands to avoid
> +        // complexity explosion.
> +        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
> +          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
> +                                  : VLOperands::ScoreSameOpcode;
> +      }
> +
> +      if (isa<UndefValue>(V2))
> +        return VLOperands::ScoreUndef;
> +
> +      return VLOperands::ScoreFail;
> +    }
> +
> +    /// Holds the values and their lane that are taking part in the look-ahead
> +    /// score calculation. This is used in the external uses cost calculation.
> +    SmallDenseMap<Value *, int> InLookAheadValues;
> +
> +    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
> +    /// either external to the vectorized code, or require shuffling.
> +    int getExternalUsesCost(const std::pair<Value *, int> &LHS,
> +                            const std::pair<Value *, int> &RHS) {
> +      int Cost = 0;
> +      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
> +      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
> +        Value *V = Values[Idx].first;
> +        // Calculate the absolute lane, using the minimum relative lane of LHS
> +        // and RHS as base and Idx as the offset.
> +        int Ln = std::min(LHS.second, RHS.second) + Idx;
> +        assert(Ln >= 0 && "Bad lane calculation");
> +        unsigned UsersBudget = LookAheadUsersBudget;
> +        for (User *U : V->users()) {
> +          if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
> +            // The user is in the VectorizableTree. Check if we need to insert.
> +            auto It = llvm::find(UserTE->Scalars, U);
> +            assert(It != UserTE->Scalars.end() && "U is in UserTE");
> +            int UserLn = std::distance(UserTE->Scalars.begin(), It);
> +            assert(UserLn >= 0 && "Bad lane");
> +            if (UserLn != Ln)
> +              Cost += UserInDiffLaneCost;
> +          } else {
> +            // Check if the user is in the look-ahead code.
> +            auto It2 = InLookAheadValues.find(U);
> +            if (It2 != InLookAheadValues.end()) {
> +              // The user is in the look-ahead code. Check the lane.
> +              if (It2->second != Ln)
> +                Cost += UserInDiffLaneCost;
> +            } else {
> +              // The user is neither in SLP tree nor in the look-ahead code.
> +              Cost += ExternalUseCost;
> +            }
> +          }
> +          // Limit the number of visited uses to cap compilation time.
> +          if (--UsersBudget == 0)
> +            break;
> +        }
> +      }
> +      return Cost;
> +    }
> +
> +    /// Go through the operands of \p LHS and \p RHS recursively until \p
> +    /// MaxLevel, and return the cumulative score. For example:
> +    /// \verbatim
> +    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
> +    ///     \ /         \ /         \ /        \ /
> +    ///      +           +           +          +
> +    ///     G1          G2          G3         G4
> +    /// \endverbatim
> +    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
> +    /// each level recursively, accumulating the score. It starts from matching
> +    /// the additions at level 0, then moves on to the loads (level 1). The
> +    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
> +    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
> +    /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
> +    /// Please note that the order of the operands does not matter, as we
> +    /// evaluate the score of all profitable combinations of operands. In
> +    /// other words the score of G1 and G4 is the same as G1 and G2. This
> +    /// heuristic is based on ideas described in:
> +    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
> +    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
> +    ///   Luís F. W. Góes
> +    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
> +                           const std::pair<Value *, int> &RHS, int CurrLevel,
> +                           int MaxLevel) {
> +
> +      Value *V1 = LHS.first;
> +      Value *V2 = RHS.first;
> +      // Get the shallow score of V1 and V2.
> +      int ShallowScoreAtThisLevel =
> +          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
> +                                       getExternalUsesCost(LHS, RHS));
> +      int Lane1 = LHS.second;
> +      int Lane2 = RHS.second;
> +
> +      // If reached MaxLevel,
> +      //  or if V1 and V2 are not instructions,
> +      //  or if they are SPLAT,
> +      //  or if they are not consecutive, early return the current cost.
> +      auto *I1 = dyn_cast<Instruction>(V1);
> +      auto *I2 = dyn_cast<Instruction>(V2);
> +      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
> +          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
> +          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
> +        return ShallowScoreAtThisLevel;
> +      assert(I1 && I2 && "Should have early exited.");
> +
> +      // Keep track of in-tree values for determining the external-use cost.
> +      InLookAheadValues[V1] = Lane1;
> +      InLookAheadValues[V2] = Lane2;
> +
> +      // Contains the I2 operand indexes that got matched with I1 operands.
> +      SmallSet<unsigned, 4> Op2Used;
> +
> +      // Recursion towards the operands of I1 and I2. We are trying all
> +      // possible operand pairs, and keeping track of the best score.
> +      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
> +           OpIdx1 != NumOperands1; ++OpIdx1) {
> +        // Try to pair op1I with the best operand of I2.
> +        int MaxTmpScore = 0;
> +        unsigned MaxOpIdx2 = 0;
> +        bool FoundBest = false;
> +        // If I2 is commutative try all combinations.
> +        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
> +        unsigned ToIdx = isCommutative(I2)
> +                             ? I2->getNumOperands()
> +                             : std::min(I2->getNumOperands(), OpIdx1 + 1);
> +        assert(FromIdx <= ToIdx && "Bad index");
> +        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
> +          // Skip operands already paired with OpIdx1.
> +          if (Op2Used.count(OpIdx2))
> +            continue;
> +          // Recursively calculate the cost at each level
> +          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
> +                                            {I2->getOperand(OpIdx2), Lane2},
> +                                            CurrLevel + 1, MaxLevel);
> +          // Look for the best score.
> +          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
> +            MaxTmpScore = TmpScore;
> +            MaxOpIdx2 = OpIdx2;
> +            FoundBest = true;
> +          }
> +        }
> +        if (FoundBest) {
> +          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
> +          Op2Used.insert(MaxOpIdx2);
> +          ShallowScoreAtThisLevel += MaxTmpScore;
> +        }
> +      }
> +      return ShallowScoreAtThisLevel;
> +    }
> +
> +    /// \Returns the look-ahead score, which tells us how much the sub-trees
> +    /// rooted at \p LHS and \p RHS match, the more they match the higher the
> +    /// score. This helps break ties in an informed way when we cannot decide on
> +    /// the order of the operands by just considering the immediate
> +    /// predecessors.
> +    int getLookAheadScore(const std::pair<Value *, int> &LHS,
> +                          const std::pair<Value *, int> &RHS) {
> +      InLookAheadValues.clear();
> +      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
> +    }
> +
>      // Search all operands in Ops[*][Lane] for the one that matches best
>      // Ops[OpIdx][LastLane] and return its operand index.
>      // If no good match can be found, return None.
> @@ -750,9 +974,6 @@ public:
>        // The linearized opcode of the operand at OpIdx, Lane.
>        bool OpIdxAPO = getData(OpIdx, Lane).APO;
>
> -      const unsigned BestScore = 2;
> -      const unsigned GoodScore = 1;
> -
>        // The best operand index and its score.
>        // Sometimes we have more than one option (e.g., Opcode and Undefs), so
>        // we are using the score to differentiate between the two.
> @@ -781,41 +1002,19 @@ public:
>          // Look for an operand that matches the current mode.
>          switch (RMode) {
>          case ReorderingMode::Load:
> -          if (isa<LoadInst>(Op)) {
> -            // Figure out which is left and right, so that we can check for
> -            // consecutive loads
> -            bool LeftToRight = Lane > LastLane;
> -            Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
> -            Value *OpRight = (LeftToRight) ? Op : OpLastLane;
> -            if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
> -                                    cast<LoadInst>(OpRight), DL, SE))
> -              BestOp.Idx = Idx;
> -          }
> -          break;
> -        case ReorderingMode::Opcode:
> -          // We accept both Instructions and Undefs, but with different scores.
> -          if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
> -               cast<Instruction>(Op)->getOpcode() ==
> -                   cast<Instruction>(OpLastLane)->getOpcode()) ||
> -              (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
> -              isa<UndefValue>(Op)) {
> -            // An instruction has a higher score than an undef.
> -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
> -            if (Score > BestOp.Score) {
> -              BestOp.Idx = Idx;
> -              BestOp.Score = Score;
> -            }
> -          }
> -          break;
>          case ReorderingMode::Constant:
> -          if (isa<Constant>(Op)) {
> -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
> -            if (Score > BestOp.Score) {
> -              BestOp.Idx = Idx;
> -              BestOp.Score = Score;
> -            }
> +        case ReorderingMode::Opcode: {
> +          bool LeftToRight = Lane > LastLane;
> +          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
> +          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
> +          unsigned Score =
> +              getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
> +          if (Score > BestOp.Score) {
> +            BestOp.Idx = Idx;
> +            BestOp.Score = Score;
>            }
>            break;
> +        }
>          case ReorderingMode::Splat:
>            if (Op == OpLastLane)
>              BestOp.Idx = Idx;
> @@ -946,8 +1145,8 @@ public:
>    public:
>      /// Initialize with all the operands of the instruction vector \p RootVL.
>      VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
> -               ScalarEvolution &SE)
> -        : DL(DL), SE(SE) {
> +               ScalarEvolution &SE, const BoUpSLP &R)
> +        : DL(DL), SE(SE), R(R) {
>        // Append all the operands of RootVL.
>        appendOperandsOfVL(RootVL);
>      }
> @@ -1169,7 +1368,8 @@ private:
>                                               SmallVectorImpl<Value *> &Left,
>                                               SmallVectorImpl<Value *> &Right,
>                                               const DataLayout &DL,
> -                                             ScalarEvolution &SE);
> +                                             ScalarEvolution &SE,
> +                                             const BoUpSLP &R);
>    struct TreeEntry {
>      using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
>      TreeEntry(VecTreeTy &Container) : Container(Container) {}
> @@ -2371,7 +2571,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>          // Commutative predicate - collect + sort operands of the instructions
>          // so that each side is more likely to have the same opcode.
>          assert(P0 == SwapP0 && "Commutative Predicate mismatch");
> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>        } else {
>          // Collect operands - commute if it uses the swapped predicate.
>          for (Value *V : VL) {
> @@ -2416,7 +2616,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        // have the same opcode.
>        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
>          ValueList Left, Right;
> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>          buildTree_rec(Left, Depth + 1, {TE, 0});
>          buildTree_rec(Right, Depth + 1, {TE, 1});
>          return;
> @@ -2585,7 +2785,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>        // Reorder operands if reordering would enable vectorization.
>        if (isa<BinaryOperator>(VL0)) {
>          ValueList Left, Right;
> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>          buildTree_rec(Left, Depth + 1, {TE, 0});
>          buildTree_rec(Right, Depth + 1, {TE, 1});
>          return;
> @@ -3302,13 +3502,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Valu
>
>  // Perform operand reordering on the instructions in VL and return the
>  // reordered operands in Left and Right.
> -void BoUpSLP::reorderInputsAccordingToOpcode(
> -    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
> -    SmallVectorImpl<Value *> &Right, const DataLayout &DL,
> -    ScalarEvolution &SE) {
> +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
> +                                             SmallVectorImpl<Value *> &Left,
> +                                             SmallVectorImpl<Value *> &Right,
> +                                             const DataLayout &DL,
> +                                             ScalarEvolution &SE,
> +                                             const BoUpSLP &R) {
>    if (VL.empty())
>      return;
> -  VLOperands Ops(VL, DL, SE);
> +  VLOperands Ops(VL, DL, SE, R);
>    // Reorder the operands in place.
>    Ops.reorder();
>    Left = Ops.getVL(0);
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll?rev=364964&r1=364963&r2=364964&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll Tue Jul  2
> 13:20:28 2019
> @@ -27,22 +27,19 @@ define void @lookahead_basic(double* %ar
>  ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 5
>  ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 6
>  ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 7
> -; CHECK-NEXT:    [[A_0:%.*]] = load double, double* [[IDX0]], align 8
> -; CHECK-NEXT:    [[A_1:%.*]] = load double, double* [[IDX1]], align 8
> -; CHECK-NEXT:    [[B_0:%.*]] = load double, double* [[IDX2]], align 8
> -; CHECK-NEXT:    [[B_1:%.*]] = load double, double* [[IDX3]], align 8
> -; CHECK-NEXT:    [[C_0:%.*]] = load double, double* [[IDX4]], align 8
> -; CHECK-NEXT:    [[C_1:%.*]] = load double, double* [[IDX5]], align 8
> -; CHECK-NEXT:    [[D_0:%.*]] = load double, double* [[IDX6]], align 8
> -; CHECK-NEXT:    [[D_1:%.*]] = load double, double* [[IDX7]], align 8
> -; CHECK-NEXT:    [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
> -; CHECK-NEXT:    [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
> -; CHECK-NEXT:    [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
> -; CHECK-NEXT:    [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
> -; CHECK-NEXT:    [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]],
> [[SUBCD_0]]
> -; CHECK-NEXT:    [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]],
> [[SUBAB_1]]
> -; CHECK-NEXT:    store double [[ADDABCD_0]], double* [[IDX0]], align 8
> -; CHECK-NEXT:    store double [[ADDCDAB_1]], double* [[IDX1]], align 8
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>*
> [[TMP0]], align 8
> +; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>*
> [[TMP2]], align 8
> +; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>*
> [[TMP4]], align 8
> +; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>*
> [[TMP6]], align 8
> +; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
> +; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
> +; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
> +; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
> +; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]],
> align 8
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -164,22 +161,23 @@ define void @lookahead_alt2(double* %arr
>  ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 5
>  ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 6
>  ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double*
> [[ARRAY]], i64 7
> -; CHECK-NEXT:    [[A_0:%.*]] = load double, double* [[IDX0]], align 8
> -; CHECK-NEXT:    [[A_1:%.*]] = load double, double* [[IDX1]], align 8
> -; CHECK-NEXT:    [[B_0:%.*]] = load double, double* [[IDX2]], align 8
> -; CHECK-NEXT:    [[B_1:%.*]] = load double, double* [[IDX3]], align 8
> -; CHECK-NEXT:    [[C_0:%.*]] = load double, double* [[IDX4]], align 8
> -; CHECK-NEXT:    [[C_1:%.*]] = load double, double* [[IDX5]], align 8
> -; CHECK-NEXT:    [[D_0:%.*]] = load double, double* [[IDX6]], align 8
> -; CHECK-NEXT:    [[D_1:%.*]] = load double, double* [[IDX7]], align 8
> -; CHECK-NEXT:    [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]]
> -; CHECK-NEXT:    [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
> -; CHECK-NEXT:    [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]]
> -; CHECK-NEXT:    [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
> -; CHECK-NEXT:    [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]],
> [[SUBCD_0]]
> -; CHECK-NEXT:    [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]],
> [[SUBAB_1]]
> -; CHECK-NEXT:    store double [[ADDABCD_0]], double* [[IDX0]], align 8
> -; CHECK-NEXT:    store double [[ADDCDAB_1]], double* [[IDX1]], align 8
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>*
> [[TMP0]], align 8
> +; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>*
> [[TMP2]], align 8
> +; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>*
> [[TMP4]], align 8
> +; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>*
> [[TMP6]], align 8
> +; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
> +; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
> +; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2
> x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
> +; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
> +; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
> +; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2
> x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
> +; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]],
> [[TMP10]]
> +; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
> +; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]],
> align 8
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -239,6 +237,97 @@ define void @lookahead_external_uses(dou
>  ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double*
> [[B]], i64 2
>  ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double*
> [[A]], i64 2
>  ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double*
> [[B]], i64 1
> +; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
> +; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
> +; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
> +; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
> +; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
> +; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>*
> [[TMP0]], align 8
> +; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double
> [[C0]], i32 0
> +; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]],
> double [[A1]], i32 1
> +; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double
> [[D0]], i32 0
> +; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]],
> double [[B2]], i32 1
> +; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
> +; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double
> [[A0]], i32 0
> +; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]],
> double [[A2]], i32 1
> +; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
> +; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
> +; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double*
> [[S:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double*
> [[S]], i64 1
> +; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x
> double>*
> +; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]],
> align 8
> +; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
> +; CHECK-NEXT:    ret void
> +;
> +entry:
> +  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
> +  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
> +  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
> +  %IdxD0 = getelementptr inbounds double, double* %D, i64 0
> +
> +  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
> +  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
> +  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
> +  %IdxB1 = getelementptr inbounds double, double* %B, i64 1
> +
> +  %A0 = load double, double *%IdxA0, align 8
> +  %B0 = load double, double *%IdxB0, align 8
> +  %C0 = load double, double *%IdxC0, align 8
> +  %D0 = load double, double *%IdxD0, align 8
> +
> +  %A1 = load double, double *%IdxA1, align 8
> +  %B2 = load double, double *%IdxB2, align 8
> +  %A2 = load double, double *%IdxA2, align 8
> +  %B1 = load double, double *%IdxB1, align 8
> +
> +  %subA0B0 = fsub fast double %A0, %B0
> +  %subC0D0 = fsub fast double %C0, %D0
> +
> +  %subA1B2 = fsub fast double %A1, %B2
> +  %subA2B1 = fsub fast double %A2, %B1
> +
> +  %add0 = fadd fast double %subA0B0, %subC0D0
> +  %add1 = fadd fast double %subA1B2, %subA2B1
> +
> +  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
> +  %IdxS1 = getelementptr inbounds double, double* %S, i64 1
> +
> +  store double %add0, double *%IdxS0, align 8
> +  store double %add1, double *%IdxS1, align 8
> +
> +  ; External use
> +  store double %A1, double *%Ext1, align 8
> +  ret void
> +}
> +
> +; A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
> +;     \  /   \  /       /  \  /   \  / \
> +;       -     -    U1,U2,U3  -     -  U4,U5
> +;        \   /                \   /
> +;          +                    +
> +;          |                    |
> +;         S[0]                 S[1]
> +;
> +;
> +; If we limit the users budget for the look-ahead heuristic to 2, then the
> +; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
> +; over A[1] (with 3 external users).
> +; The result is that the operands of the Add are not reordered and the loads
> +; from A get vectorized instead of the loads from B.
> +;
> +define void @lookahead_limit_users_budget(double* %A, double *%B, double
> *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3,
> double *%Ext4, double *%Ext5) {
> +; CHECK-LABEL: @lookahead_limit_users_budget(
> +; CHECK-NEXT:  entry:
> +; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double*
> [[A:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double*
> [[B:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double*
> [[C:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double*
> [[D:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double*
> [[A]], i64 1
> +; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double*
> [[B]], i64 2
> +; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double*
> [[A]], i64 2
> +; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double*
> [[B]], i64 1
>  ; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
>  ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
>  ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
> @@ -262,6 +351,10 @@ define void @lookahead_external_uses(dou
>  ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]],
> align 8
>  ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]],
> i32 1
>  ; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
> +; CHECK-NEXT:    store double [[TMP12]], double* [[EXT2:%.*]], align 8
> +; CHECK-NEXT:    store double [[TMP12]], double* [[EXT3:%.*]], align 8
> +; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
> +; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
>  ; CHECK-NEXT:    ret void
>  ;
>  entry:
> @@ -300,7 +393,56 @@ entry:
>    store double %add0, double *%IdxS0, align 8
>    store double %add1, double *%IdxS1, align 8
>
> -  ; External use
> +  ; External uses of A1
>    store double %A1, double *%Ext1, align 8
> +  store double %A1, double *%Ext2, align 8
> +  store double %A1, double *%Ext3, align 8
> +
> +  ; External uses of B1
> +  store double %B1, double *%Ext4, align 8
> +  store double %B1, double *%Ext5, align 8
> +
> +  ret void
> +}
> +
> +; This checks that the lookahead code does not crash when instructions with
> +; the same opcodes have different numbers of operands (in this case the
> +; calls).
> +
> +%Class = type { i8 }
> +declare double @_ZN1i2ayEv(%Class*)
> +declare double @_ZN1i2axEv()
> +
> +define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
> +; CHECK-LABEL: @lookahead_crash(
> +; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double*
> [[A:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double*
> [[A]], i64 1
> +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
> +; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>*
> [[TMP1]], align 8
> +; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(%Class*
> [[ARG0:%.*]])
> +; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
> +; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double
> [[C0]], i32 0
> +; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]],
> double [[C1]], i32 1
> +; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
> +; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double*
> [[S:%.*]], i64 0
> +; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double*
> [[S]], i64 1
> +; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
> +; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]],
> align 8
> +; CHECK-NEXT:    ret void
> +;
> +  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
> +  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
> +
> +  %A0 = load double, double *%IdxA0, align 8
> +  %A1 = load double, double *%IdxA1, align 8
> +
> +  %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
> +  %C1 = call double @_ZN1i2axEv()
> +
> +  %add0 = fadd fast double %A0, %C0
> +  %add1 = fadd fast double %A1, %C1
> +
> +  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
> +  %IdxS1 = getelementptr inbounds double, double* %S, i64 1
> +  store double %add0, double *%IdxS0, align 8
> +  store double %add1, double *%IdxS1, align 8
>    ret void
>  }
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
