[llvm] r364084 - [SLP] Look-ahead operand reordering heuristic.

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 24 13:37:17 PDT 2019


CReduce gave me this:

// clang -cc1 -triple x86_64-linux-gnu -O2 -vectorize-slp -emit-llvm site_engagement_service-b5b9c7.reduced.cpp
struct f {
  int c;
};
class g {
public:
  f operator*();
  void operator++();
  friend bool operator!=(g, g);
};
class h {
public:
  g begin();
  g end();
};
class i {
public:
  static double ax();
  double ay();
  void az(double);
};
h d;
void a() {
  for (auto a : d) {
    i b;
    double e = a.c * b.ay();
    int c = e += c * i::ax();
    b.az(e);
  }
}
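
A hypothetical way to replay this on IR with opt alone, for anyone bisecting (an unverified sketch; the exact flags are my guess):

// clang -cc1 -triple x86_64-linux-gnu -O2 -vectorize-slp -disable-llvm-passes -emit-llvm site_engagement_service-b5b9c7.reduced.cpp -o reduced.ll
// opt -O2 -S reduced.ll -o /dev/null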

On Fri, Jun 21, 2019 at 4:11 PM Reid Kleckner <rnk at google.com> wrote:

> I reverted this in r364111 since it caused crashes while building some
> file in Chromium. I'll creduce it and pass it along soon.
>
> On Fri, Jun 21, 2019 at 10:53 AM Simon Pilgrim via llvm-commits <
> llvm-commits at lists.llvm.org> wrote:
>
>> Author: rksimon
>> Date: Fri Jun 21 10:57:01 2019
>> New Revision: 364084
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=364084&view=rev
>> Log:
>> [SLP] Look-ahead operand reordering heuristic.
>>
>> This patch introduces a new heuristic for guiding operand reordering. The
>> new "look-ahead" heuristic can look beyond the immediate predecessors. This
>> helps break ties when the immediate predecessors have identical opcodes
>> (see lit test for an example).
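>>
>> As an illustration of the kind of tie being broken (a sketch, simplified
>> from the lit test):
>>
>>   Lane 0:  add(sub(A[0], B[0]), sub(C[0], D[0]))
>>   Lane 1:  add(sub(C[1], D[1]), sub(A[1], B[1]))
>>
>> Both operands of the lane-1 add are subtractions, so the immediate
>> predecessors alone cannot tell which of them should line up with
>> sub(A[0], B[0]). Looking one level deeper, sub(A[1], B[1]) wins because
>> {A[0], A[1]} and {B[0], B[1]} are consecutive loads.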
>>
>> Committed on behalf of @vporpo (Vasileios Porpodas)
>>
>> Differential Revision: https://reviews.llvm.org/D60897
>>
>> Modified:
>>     llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>>     llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll
>>
>> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=364084&r1=364083&r2=364084&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Jun 21 10:57:01 2019
>> @@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize(
>>      "slp-min-tree-size", cl::init(3), cl::Hidden,
>>      cl::desc("Only vectorize small trees if they are fully
>> vectorizable"));
>>
>> +// The maximum depth that the look-ahead score heuristic will explore.
>> +// The higher this value, the higher the compilation time overhead.
>> +static cl::opt<int> LookAheadMaxDepth(
>> +    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
>> +    cl::desc("The maximum look-ahead depth for operand reordering
>> scores"));
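>> +// The depth can be tuned from the command line like any cl::opt, e.g.
>> +// (illustrative invocation): opt -slp-vectorizer -slp-max-look-ahead-depth=3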
>> +
>>  static cl::opt<bool>
>>      ViewSLPTree("view-slp-tree", cl::Hidden,
>>                  cl::desc("Display the SLP trees with Graphviz"));
>> @@ -708,6 +714,7 @@ public:
>>
>>      const DataLayout &DL;
>>      ScalarEvolution &SE;
>> +    const BoUpSLP &R;
>>
>>      /// \returns the operand data at \p OpIdx and \p Lane.
>>      OperandData &getData(unsigned OpIdx, unsigned Lane) {
>> @@ -733,6 +740,207 @@ public:
>>        std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
>>      }
>>
>> +    // The hard-coded scores listed here are not very important. When
>> +    // computing the scores of matching one sub-tree with another, we are
>> +    // basically counting the number of values that are matching. So even
>> +    // if all scores are set to 1, we would still get a decent matching
>> +    // result. However, sometimes we have to break ties. For example we
>> +    // may have to choose between matching loads vs matching opcodes. This
>> +    // is what these scores are helping us with: they provide the order of
>> +    // preference.
>> +
>> +    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
>> +    static const int ScoreConsecutiveLoads = 3;
>> +    /// Constants.
>> +    static const int ScoreConstants = 2;
>> +    /// Instructions with the same opcode.
>> +    static const int ScoreSameOpcode = 2;
>> +    /// Instructions with alt opcodes (e.g., add + sub).
>> +    static const int ScoreAltOpcodes = 1;
>> +    /// Identical instructions (a.k.a. splat or broadcast).
>> +    static const int ScoreSplat = 1;
>> +    /// Matching with an undef is preferable to failing.
>> +    static const int ScoreUndef = 1;
>> +    /// Score for failing to find a decent match.
>> +    static const int ScoreFail = 0;
>> +    /// User external to the vectorized code.
>> +    static const int ExternalUseCost = 1;
>> +    /// The user is internal but in a different lane.
>> +    static const int UserInDiffLaneCost = ExternalUseCost;
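>> +
>> +    // For example, if two candidate pairs compete, a pair of consecutive
>> +    // loads (score 3) is preferred over a pair that merely shares an
>> +    // opcode (score 2), which in turn beats an alt-opcode pair or a
>> +    // splat (score 1).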
>> +
>> +    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
>> +    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
>> +                               ScalarEvolution &SE) {
>> +      auto *LI1 = dyn_cast<LoadInst>(V1);
>> +      auto *LI2 = dyn_cast<LoadInst>(V2);
>> +      if (LI1 && LI2)
>> +        return isConsecutiveAccess(LI1, LI2, DL, SE)
>> +                   ? VLOperands::ScoreConsecutiveLoads
>> +                   : VLOperands::ScoreFail;
>> +
>> +      auto *C1 = dyn_cast<Constant>(V1);
>> +      auto *C2 = dyn_cast<Constant>(V2);
>> +      if (C1 && C2)
>> +        return VLOperands::ScoreConstants;
>> +
>> +      auto *I1 = dyn_cast<Instruction>(V1);
>> +      auto *I2 = dyn_cast<Instruction>(V2);
>> +      if (I1 && I2) {
>> +        if (I1 == I2)
>> +          return VLOperands::ScoreSplat;
>> +        InstructionsState S = getSameOpcode({I1, I2});
>> +        // Note: Only consider instructions with <= 2 operands to avoid
>> +        // complexity explosion.
>> +        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
>> +          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
>> +                                  : VLOperands::ScoreSameOpcode;
>> +      }
>> +
>> +      if (isa<UndefValue>(V2))
>> +        return VLOperands::ScoreUndef;
>> +
>> +      return VLOperands::ScoreFail;
>> +    }
>> +
>> +    /// Holds the values and their lane that are taking part in the
>> +    /// look-ahead score calculation. This is used in the external uses
>> +    /// cost calculation.
>> +    SmallDenseMap<Value *, int> InLookAheadValues;
>> +
>> +    /// \returns the additional cost due to uses of \p LHS and \p RHS that
>> +    /// are either external to the vectorized code, or require shuffling.
>> +    int getExternalUsesCost(const std::pair<Value *, int> &LHS,
>> +                            const std::pair<Value *, int> &RHS) {
>> +      int Cost = 0;
>> +      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
>> +      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
>> +        Value *V = Values[Idx].first;
>> +        // Calculate the absolute lane, using the minimum relative lane
>> +        // of LHS and RHS as base and Idx as the offset.
>> +        int Ln = std::min(LHS.second, RHS.second) + Idx;
>> +        assert(Ln >= 0 && "Bad lane calculation");
>> +        for (User *U : V->users()) {
>> +          if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
>> +            // The user is in the VectorizableTree. Check if we need to insert.
>> +            auto It = llvm::find(UserTE->Scalars, U);
>> +            assert(It != UserTE->Scalars.end() && "U is in UserTE");
>> +            int UserLn = std::distance(UserTE->Scalars.begin(), It);
>> +            assert(UserLn >= 0 && "Bad lane");
>> +            if (UserLn != Ln)
>> +              Cost += UserInDiffLaneCost;
>> +          } else {
>> +            // Check if the user is in the look-ahead code.
>> +            auto It2 = InLookAheadValues.find(U);
>> +            if (It2 != InLookAheadValues.end()) {
>> +              // The user is in the look-ahead code. Check the lane.
>> +              if (It2->second != Ln)
>> +                Cost += UserInDiffLaneCost;
>> +            } else {
>> +              // The user is neither in the SLP tree nor in the look-ahead code.
>> +              Cost += ExternalUseCost;
>> +            }
>> +          }
>> +        }
>> +      }
>> +      return Cost;
>> +    }
>> +
>> +    /// Go through the operands of \p LHS and \p RHS recursively until
>> +    /// \p MaxLevel, and return the cumulative score. For example:
>> +    /// \verbatim
>> +    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
>> +    ///     \ /         \ /         \ /        \ /
>> +    ///      +           +           +          +
>> +    ///     G1          G2          G3         G4
>> +    /// \endverbatim
>> +    /// The getScoreAtLevelRec(G1, G2) function will try to match the
>> +    /// nodes at each level recursively, accumulating the score. It starts
>> +    /// from matching the additions at level 0, then moves on to the loads
>> +    /// (level 1). The score of G1 and G2 is higher than that of G1 and
>> +    /// G3, because {A[0],A[1]} and {B[0],B[1]} match with
>> +    /// VLOperands::ScoreConsecutiveLoads, while {A[0],C[0]} has a score
>> +    /// of VLOperands::ScoreFail.
>> +    /// Please note that the order of the operands does not matter, as we
>> +    /// evaluate the score of all profitable combinations of operands. In
>> +    /// other words, the score of G1 and G4 is the same as that of G1 and
>> +    /// G2. This heuristic is based on ideas described in:
>> +    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
>> +    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
>> +    ///   Luís F. W. Góes
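>> +    /// Numerically (ignoring external-use costs), with the default depth:
>> +    /// matching G1 with G2 scores ScoreSameOpcode (2) for the additions
>> +    /// plus ScoreConsecutiveLoads (3) for each of {A[0],A[1]} and
>> +    /// {B[0],B[1]}, i.e. 2 + 3 + 3 = 8, whereas G1 with G3 stops at 2.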
>> +    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
>> +                           const std::pair<Value *, int> &RHS,
>> +                           int CurrLevel, int MaxLevel) {
>> +
>> +      Value *V1 = LHS.first;
>> +      Value *V2 = RHS.first;
>> +      // Get the shallow score of V1 and V2.
>> +      int ShallowScoreAtThisLevel =
>> +          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
>> +                                       getExternalUsesCost(LHS, RHS));
>> +      int Lane1 = LHS.second;
>> +      int Lane2 = RHS.second;
>> +
>> +      // If reached MaxLevel,
>> +      //  or if V1 and V2 are not instructions,
>> +      //  or if they are SPLAT,
>> +      //  or if they are not consecutive, early return the current cost.
>> +      auto *I1 = dyn_cast<Instruction>(V1);
>> +      auto *I2 = dyn_cast<Instruction>(V2);
>> +      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
>> +          ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
>> +          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
>> +        return ShallowScoreAtThisLevel;
>> +      assert(I1 && I2 && "Should have early exited.");
>> +
>> +      // Keep track of in-tree values for determining the external-use cost.
>> +      InLookAheadValues[V1] = Lane1;
>> +      InLookAheadValues[V2] = Lane2;
>> +
>> +      // Contains the I2 operand indexes that got matched with I1 operands.
>> +      SmallSet<int, 4> Op2Used;
>> +
>> +      // Recursion towards the operands of I1 and I2. We are trying all
>> +      // possible operand pairs, and keeping track of the best score.
>> +      for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
>> +           OpIdx1 != NumOperands1; ++OpIdx1) {
>> +        // Try to pair the operand of I1 at OpIdx1 with the best operand of I2.
>> +        int MaxTmpScore = 0;
>> +        int MaxOpIdx2 = -1;
>> +        // If I2 is commutative try all combinations.
>> +        int FromIdx = isCommutative(I2) ? 0 : OpIdx1;
>> +        int ToIdx = isCommutative(I2) ? I2->getNumOperands() : OpIdx1 + 1;
>> +        assert(FromIdx < ToIdx && "Bad index");
>> +        for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
>> +          // Skip operands already paired with OpIdx1.
>> +          if (Op2Used.count(OpIdx2))
>> +            continue;
>> +          // Recursively calculate the cost at each level.
>> +          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
>> +                                            {I2->getOperand(OpIdx2), Lane2},
>> +                                            CurrLevel + 1, MaxLevel);
>> +          // Look for the best score.
>> +          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
>> +            MaxTmpScore = TmpScore;
>> +            MaxOpIdx2 = OpIdx2;
>> +          }
>> +        }
>> +        if (MaxOpIdx2 >= 0) {
>> +          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
>> +          Op2Used.insert(MaxOpIdx2);
>> +          ShallowScoreAtThisLevel += MaxTmpScore;
>> +        }
>> +      }
>> +      return ShallowScoreAtThisLevel;
>> +    }
>> +
>> +    /// \returns the look-ahead score, which tells us how much the
>> +    /// sub-trees rooted at \p LHS and \p RHS match; the more they match,
>> +    /// the higher the score. This helps break ties in an informed way
>> +    /// when we cannot decide on the order of the operands by just
>> +    /// considering the immediate predecessors.
>> +    int getLookAheadScore(const std::pair<Value *, int> &LHS,
>> +                          const std::pair<Value *, int> &RHS) {
>> +      InLookAheadValues.clear();
>> +      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
>> +    }
>> +
>>      // Search all operands in Ops[*][Lane] for the one that matches best
>>      // Ops[OpIdx][LastLane] and return its operand index.
>>      // If no good match can be found, return None.
>> @@ -750,9 +958,6 @@ public:
>>        // The linearized opcode of the operand at OpIdx, Lane.
>>        bool OpIdxAPO = getData(OpIdx, Lane).APO;
>>
>> -      const unsigned BestScore = 2;
>> -      const unsigned GoodScore = 1;
>> -
>>        // The best operand index and its score.
>>        // Sometimes we have more than one option (e.g., Opcode and Undefs),
>>        // so we are using the score to differentiate between the two.
>> @@ -781,41 +986,19 @@ public:
>>          // Look for an operand that matches the current mode.
>>          switch (RMode) {
>>          case ReorderingMode::Load:
>> -          if (isa<LoadInst>(Op)) {
>> -            // Figure out which is left and right, so that we can check for
>> -            // consecutive loads
>> -            bool LeftToRight = Lane > LastLane;
>> -            Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
>> -            Value *OpRight = (LeftToRight) ? Op : OpLastLane;
>> -            if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
>> -                                    cast<LoadInst>(OpRight), DL, SE))
>> -              BestOp.Idx = Idx;
>> -          }
>> -          break;
>> -        case ReorderingMode::Opcode:
>> -          // We accept both Instructions and Undefs, but with different scores.
>> -          if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
>> -               cast<Instruction>(Op)->getOpcode() ==
>> -                   cast<Instruction>(OpLastLane)->getOpcode()) ||
>> -              (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
>> -              isa<UndefValue>(Op)) {
>> -            // An instruction has a higher score than an undef.
>> -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
>> -            if (Score > BestOp.Score) {
>> -              BestOp.Idx = Idx;
>> -              BestOp.Score = Score;
>> -            }
>> -          }
>> -          break;
>>          case ReorderingMode::Constant:
>> -          if (isa<Constant>(Op)) {
>> -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
>> -            if (Score > BestOp.Score) {
>> -              BestOp.Idx = Idx;
>> -              BestOp.Score = Score;
>> -            }
>> +        case ReorderingMode::Opcode: {
>> +          bool LeftToRight = Lane > LastLane;
>> +          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
>> +          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
>> +          unsigned Score =
>> +              getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
>> +          if (Score > BestOp.Score) {
>> +            BestOp.Idx = Idx;
>> +            BestOp.Score = Score;
>>            }
>>            break;
>> +        }
>>          case ReorderingMode::Splat:
>>            if (Op == OpLastLane)
>>              BestOp.Idx = Idx;
>> @@ -946,8 +1129,8 @@ public:
>>    public:
>>      /// Initialize with all the operands of the instruction vector \p RootVL.
>>      VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
>> -               ScalarEvolution &SE)
>> -        : DL(DL), SE(SE) {
>> +               ScalarEvolution &SE, const BoUpSLP &R)
>> +        : DL(DL), SE(SE), R(R) {
>>        // Append all the operands of RootVL.
>>        appendOperandsOfVL(RootVL);
>>      }
>> @@ -1169,7 +1352,8 @@ private:
>>                                               SmallVectorImpl<Value *> &Left,
>>                                               SmallVectorImpl<Value *> &Right,
>>                                               const DataLayout &DL,
>> -                                             ScalarEvolution &SE);
>> +                                             ScalarEvolution &SE,
>> +                                             const BoUpSLP &R);
>>    struct TreeEntry {
>>      using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
>>      TreeEntry(VecTreeTy &Container) : Container(Container) {}
>> @@ -2371,7 +2555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>          // Commutative predicate - collect + sort operands of the
>>          // instructions so that each side is more likely to have the same
>>          // opcode.
>>          assert(P0 == SwapP0 && "Commutative Predicate mismatch");
>> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
>> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>>        } else {
>>          // Collect operands - commute if it uses the swapped predicate.
>>          for (Value *V : VL) {
>> @@ -2415,7 +2599,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>        // have the same opcode.
>>        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
>>          ValueList Left, Right;
>> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
>> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>>          buildTree_rec(Left, Depth + 1, {TE, 0});
>>          buildTree_rec(Right, Depth + 1, {TE, 1});
>>          return;
>> @@ -2584,7 +2768,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>        // Reorder operands if reordering would enable vectorization.
>>        if (isa<BinaryOperator>(VL0)) {
>>          ValueList Left, Right;
>> -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
>> +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
>>          buildTree_rec(Left, Depth + 1, {TE, 0});
>>          buildTree_rec(Right, Depth + 1, {TE, 1});
>>          return;
>> @@ -3299,13 +3483,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Valu
>>
>>  // Perform operand reordering on the instructions in VL and return the
>>  // reordered operands in Left and Right.
>> -void BoUpSLP::reorderInputsAccordingToOpcode(
>> -    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
>> -    SmallVectorImpl<Value *> &Right, const DataLayout &DL,
>> -    ScalarEvolution &SE) {
>> +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
>> +                                             SmallVectorImpl<Value *> &Left,
>> +                                             SmallVectorImpl<Value *> &Right,
>> +                                             const DataLayout &DL,
>> +                                             ScalarEvolution &SE,
>> +                                             const BoUpSLP &R) {
>>    if (VL.empty())
>>      return;
>> -  VLOperands Ops(VL, DL, SE);
>> +  VLOperands Ops(VL, DL, SE, R);
>>    // Reorder the operands in place.
>>    Ops.reorder();
>>    Left = Ops.getVL(0);
>>
>> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll?rev=364084&r1=364083&r2=364084&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll (original)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/lookahead.ll Fri Jun 21 10:57:01 2019
>> @@ -27,22 +27,19 @@ define void @lookahead_basic(double* %ar
>>  ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
>>  ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
>>  ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
>> -; CHECK-NEXT:    [[A_0:%.*]] = load double, double* [[IDX0]], align 8
>> -; CHECK-NEXT:    [[A_1:%.*]] = load double, double* [[IDX1]], align 8
>> -; CHECK-NEXT:    [[B_0:%.*]] = load double, double* [[IDX2]], align 8
>> -; CHECK-NEXT:    [[B_1:%.*]] = load double, double* [[IDX3]], align 8
>> -; CHECK-NEXT:    [[C_0:%.*]] = load double, double* [[IDX4]], align 8
>> -; CHECK-NEXT:    [[C_1:%.*]] = load double, double* [[IDX5]], align 8
>> -; CHECK-NEXT:    [[D_0:%.*]] = load double, double* [[IDX6]], align 8
>> -; CHECK-NEXT:    [[D_1:%.*]] = load double, double* [[IDX7]], align 8
>> -; CHECK-NEXT:    [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]]
>> -; CHECK-NEXT:    [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
>> -; CHECK-NEXT:    [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
>> -; CHECK-NEXT:    [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]]
>> -; CHECK-NEXT:    [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]]
>> -; CHECK-NEXT:    [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]]
>> -; CHECK-NEXT:    store double [[ADDABCD_0]], double* [[IDX0]], align 8
>> -; CHECK-NEXT:    store double [[ADDCDAB_1]], double* [[IDX1]], align 8
>> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
>> +; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
>> +; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
>> +; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
>> +; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
>> +; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
>> +; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
>> +; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
>> +; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
>>  ; CHECK-NEXT:    ret void
>>  ;
>>  entry:
>> @@ -164,22 +161,23 @@ define void @lookahead_alt2(double* %arr
>>  ; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
>>  ; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
>>  ; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
>> -; CHECK-NEXT:    [[A_0:%.*]] = load double, double* [[IDX0]], align 8
>> -; CHECK-NEXT:    [[A_1:%.*]] = load double, double* [[IDX1]], align 8
>> -; CHECK-NEXT:    [[B_0:%.*]] = load double, double* [[IDX2]], align 8
>> -; CHECK-NEXT:    [[B_1:%.*]] = load double, double* [[IDX3]], align 8
>> -; CHECK-NEXT:    [[C_0:%.*]] = load double, double* [[IDX4]], align 8
>> -; CHECK-NEXT:    [[C_1:%.*]] = load double, double* [[IDX5]], align 8
>> -; CHECK-NEXT:    [[D_0:%.*]] = load double, double* [[IDX6]], align 8
>> -; CHECK-NEXT:    [[D_1:%.*]] = load double, double* [[IDX7]], align 8
>> -; CHECK-NEXT:    [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]]
>> -; CHECK-NEXT:    [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]]
>> -; CHECK-NEXT:    [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]]
>> -; CHECK-NEXT:    [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]]
>> -; CHECK-NEXT:    [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]]
>> -; CHECK-NEXT:    [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]]
>> -; CHECK-NEXT:    store double [[ADDABCD_0]], double* [[IDX0]], align 8
>> -; CHECK-NEXT:    store double [[ADDCDAB_1]], double* [[IDX1]], align 8
>> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
>> +; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
>> +; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
>> +; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
>> +; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
>> +; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
>> +; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
>> +; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
>> +; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
>> +; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
>> +; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
>> +; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
>> +; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
>>  ; CHECK-NEXT:    ret void
>>  ;
>>  entry:
>> @@ -239,29 +237,28 @@ define void @lookahead_external_uses(dou
>>  ; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
>>  ; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
>>  ; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
>> -; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
>> +; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
>>  ; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
>>  ; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
>> -; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
>> -; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
>> +; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
>>  ; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
>>  ; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
>> -; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
>> -; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
>> -; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
>> -; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
>> -; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
>> -; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
>> -; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
>> -; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
>> -; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
>> -; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
>> +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
>> +; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
>> +; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
>> +; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
>> +; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
>> +; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
>> +; CHECK-NEXT:    [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
>> +; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
>> +; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
>> +; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
>> +; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
>>  ; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
>>  ; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
>>  ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
>>  ; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
>> -; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
>> -; CHECK-NEXT:    store double [[TMP12]], double* [[EXT1:%.*]], align 8
>> +; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
>>  ; CHECK-NEXT:    ret void
>>  ;
>>  entry:
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>>
>