[llvm] r327898 - [DAG, X86] Revert r327197 "Revert r327170, r327171, r327172"
Volkan Keles via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 21 13:09:05 PDT 2018
> On Mar 19, 2018, at 1:19 PM, Nirav Dave via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
> Author: niravd
> Date: Mon Mar 19 13:19:46 2018
> New Revision: 327898
>
> URL: http://llvm.org/viewvc/llvm-project?rev=327898&view=rev
> Log:
> [DAG, X86] Revert r327197 "Revert r327170, r327171, r327172"
>
> Reland ISel cycle checking improvements after simplifying node id
> invariant traversal and correcting typo.
>
> Added:
> llvm/trunk/test/CodeGen/X86/pr36274.ll
> llvm/trunk/test/CodeGen/X86/pr36312.ll
> Modified:
> llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h
> llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
> llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
> llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
> llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
> llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
> llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
> llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
> llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
> llvm/trunk/test/CodeGen/X86/avg.ll
> llvm/trunk/test/CodeGen/X86/avx-vbroadcastf128.ll
> llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll
> llvm/trunk/test/CodeGen/X86/avx2-vbroadcasti128.ll
> llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti128.ll
> llvm/trunk/test/CodeGen/X86/i256-add.ll
> llvm/trunk/test/CodeGen/X86/masked_memop.ll
> llvm/trunk/test/CodeGen/X86/merge-consecutive-stores.ll
> llvm/trunk/test/CodeGen/X86/nontemporal.ll
> llvm/trunk/test/CodeGen/X86/required-vector-width.ll
> llvm/trunk/test/CodeGen/X86/store_op_load_fold2.ll
> llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
> llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll
>
> Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/SelectionDAGISel.h Mon Mar 19 13:19:46 2018
> @@ -110,6 +110,8 @@ public:
> CodeGenOpt::Level OptLevel,
> bool IgnoreChains = false);
>
> + static void EnforceNodeIdInvariant(SDNode *N);
> +
> // Opcodes used by the DAG state machine:
> enum BuiltinOpcodes {
> OPC_Scope,
> @@ -199,23 +201,28 @@ protected:
> /// of the new node T.
> void ReplaceUses(SDValue F, SDValue T) {
> CurDAG->ReplaceAllUsesOfValueWith(F, T);
> + EnforceNodeIdInvariant(T.getNode());
> }
>
> /// ReplaceUses - replace all uses of the old nodes F with the use
> /// of the new nodes T.
> void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
> CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
> + for (unsigned i = 0; i < Num; ++i)
> + EnforceNodeIdInvariant(T[i].getNode());
> }
>
> /// ReplaceUses - replace all uses of the old node F with the use
> /// of the new node T.
> void ReplaceUses(SDNode *F, SDNode *T) {
> CurDAG->ReplaceAllUsesWith(F, T);
> + EnforceNodeIdInvariant(T);
> }
>
> /// Replace all uses of \c F with \c T, then remove \c F from the DAG.
> void ReplaceNode(SDNode *F, SDNode *T) {
> CurDAG->ReplaceAllUsesWith(F, T);
> + EnforceNodeIdInvariant(T);
> CurDAG->RemoveDeadNode(F);
> }
>
>
> Modified: llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h (original)
> +++ llvm/trunk/include/llvm/CodeGen/SelectionDAGNodes.h Mon Mar 19 13:19:46 2018
> @@ -802,16 +802,44 @@ public:
> /// searches to be performed in parallel, caching of results across
> /// queries and incremental addition to Worklist. Stops early if N is
> /// found but will resume. Remember to clear Visited and Worklists
> - /// if DAG changes.
> + /// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
> + /// giving up. The TopologicalPrune flag signals that positive NodeIds are
> + /// topologically ordered (Operands have strictly smaller node id) and search
> + /// can be pruned leveraging this.
> static bool hasPredecessorHelper(const SDNode *N,
> SmallPtrSetImpl<const SDNode *> &Visited,
> SmallVectorImpl<const SDNode *> &Worklist,
> - unsigned int MaxSteps = 0) {
> + unsigned int MaxSteps = 0,
> + bool TopologicalPrune = false) {
> + SmallVector<const SDNode *, 8> DeferredNodes;
> if (Visited.count(N))
> return true;
> +
> + // Node Id's are assigned in three places: As a topological
> + // ordering (> 0), during legalization (results in values set to
> + // 0), new nodes (set to -1). If N has a topological id then we
> + // know that all nodes with ids smaller than it cannot be
> + // successors and we need not check them. Filter out all nodes
> + // that can't be matched. We add them to the worklist before exit
> + // in case of multiple calls. Note that during selection the topological id
> + // may be violated if a node's predecessor is selected before it. We mark
> + // this at selection negating the id of unselected successors and
> + // restricting topological pruning to positive ids.
> +
> + int NId = N->getNodeId();
> + // If we Invalidated the Id, reconstruct original NId.
> + if (NId < -1)
> + NId = -(NId + 1);
> +
> + bool Found = false;
> while (!Worklist.empty()) {
> const SDNode *M = Worklist.pop_back_val();
> - bool Found = false;
> + int MId = M->getNodeId();
> + if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
> + (MId > 0) && (MId < NId)) {
> + DeferredNodes.push_back(M);
> + continue;
> + }
> for (const SDValue &OpV : M->op_values()) {
> SDNode *Op = OpV.getNode();
> if (Visited.insert(Op).second)
> @@ -820,11 +848,16 @@ public:
> Found = true;
> }
> if (Found)
> - return true;
> + break;
> if (MaxSteps != 0 && Visited.size() >= MaxSteps)
> - return true;
> + break;
> }
> - return false;
> + // Push deferred nodes back on worklist.
> + Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
> + // If we bailed early, conservatively return found.
> + if (MaxSteps != 0 && Visited.size() >= MaxSteps)
> + return true;
> + return Found;
> }
>
> /// Return true if all the users of N are contained in Nodes.
>
> Modified: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp (original)
> +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp Mon Mar 19 13:19:46 2018
> @@ -960,6 +960,43 @@ public:
>
> } // end anonymous namespace
>
> +// This function is used to enforce the topological node id
> +// property leveraged during Instruction selection. Before selection all
> +// nodes are given a non-negative id such that all nodes have a larger id than
> +// their operands. As this holds transitively we can prune checks that a node N
> +// is a predecessor of another node M by not recursively checking through M's
> +// operands if N's ID is larger than M's ID. This significantly improves
> +// performance of various legality checks (e.g. IsLegalToFold /
> +// UpdateChains).
> +
> +// However, when we fuse multiple nodes into a single node
> +// during selection we may induce a predecessor relationship between inputs and
> +// outputs of distinct nodes being merged violating the topological property.
> +// Should a fused node have a successor which has yet to be selected, our
> +// legality checks would be incorrect. To avoid this we mark all unselected
> +// successor nodes, i.e. id != -1 as invalid for pruning by bit-negating (x =>
> +// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M.
> +// We use bit-negation to more clearly enforce that node id -1 can only be
> +// achieved by selected nodes. As the conversion is reversible to the original
> +// Id, topological pruning can still be leveraged when looking for unselected
> +// nodes. This method is called internally in all ISel replacement calls.
> +void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
> + SmallVector<SDNode *, 4> Nodes;
> + Nodes.push_back(Node);
> +
> + while (!Nodes.empty()) {
> + SDNode *N = Nodes.pop_back_val();
> + for (auto *U : N->uses()) {
> + auto UId = U->getNodeId();
> + if (UId > 0) {
> + int InvalidatedUId = -UId + 1;
> + U->setNodeId(InvalidatedUId);
> + Nodes.push_back(U);
> + }
> + }
> + }
> +}
> +
> void SelectionDAGISel::DoInstructionSelection() {
> DEBUG(dbgs() << "===== Instruction selection begins: "
> << printMBBReference(*FuncInfo->MBB) << " '"
> @@ -995,6 +1032,33 @@ void SelectionDAGISel::DoInstructionSele
> if (Node->use_empty())
> continue;
>
> +#ifndef NDEBUG
> + SmallVector<SDNode *, 4> Nodes;
> + Nodes.push_back(Node);
> +
> + while (!Nodes.empty()) {
> + auto N = Nodes.pop_back_val();
> + if (N->getOpcode() == ISD::TokenFactor || N->getNodeId() < 0)
> + continue;
> + for (const SDValue &Op : N->op_values()) {
> + if (Op->getOpcode() == ISD::TokenFactor)
> + Nodes.push_back(Op.getNode());
> + else {
> + // We rely on topological ordering of node ids for checking for
> + // cycles when fusing nodes during selection. All unselected nodes
> + // successors of an already selected node should have a negative id.
> + // This assertion will catch such cases. If this assertion triggers
> + // it is likely you are using DAG-level Value/Node replacement functions
> + // (versus equivalent ISEL replacement) in backend-specific
> + // selections. See comment in EnforceNodeIdInvariant for more
> + // details.
> + assert(Op->getNodeId() != -1 &&
> + "Node has already selected predecessor node");
Hi Nirav,
Our out-of-tree targets hit this assertion. Could you help me find out the issue?
I investigated a bit and I found out it is caused by DAGTypeLegalizer. NodeUpdateListener::NodeUpdated sets the NodeID to DAGTypeLegalizer::NewNode, which is -1. Do you think DAGTypeLegalizer needs to be updated too?
Volkan
> + }
> + }
> + }
> +#endif
> +
> // When we are using non-default rounding modes or FP exception behavior
> // FP operations are represented by StrictFP pseudo-operations. They
> // need to be simplified here so that the target-specific instruction
> @@ -2164,54 +2228,44 @@ static SDNode *findGlueUse(SDNode *N) {
> return nullptr;
> }
>
> -/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
> -/// This function iteratively traverses up the operand chain, ignoring
> -/// certain nodes.
> -static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
> - SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
> +/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
> +/// beyond "ImmedUse". We may ignore chains as they are checked separately.
> +static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
> bool IgnoreChains) {
> - // The NodeID's are given uniques ID's where a node ID is guaranteed to be
> - // greater than all of its (recursive) operands. If we scan to a point where
> - // 'use' is smaller than the node we're scanning for, then we know we will
> - // never find it.
> - //
> - // The Use may be -1 (unassigned) if it is a newly allocated node. This can
> - // happen because we scan down to newly selected nodes in the case of glue
> - // uses.
> - std::vector<SDNode *> WorkList;
> - WorkList.push_back(Use);
> -
> - while (!WorkList.empty()) {
> - Use = WorkList.back();
> - WorkList.pop_back();
> - // NodeId topological order of TokenFactors is not guaranteed. Do not skip.
> - if (Use->getOpcode() != ISD::TokenFactor &&
> - Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
> - continue;
> -
> - // Don't revisit nodes if we already scanned it and didn't fail, we know we
> - // won't fail if we scan it again.
> - if (!Visited.insert(Use).second)
> - continue;
> -
> - for (const SDValue &Op : Use->op_values()) {
> - // Ignore chain uses, they are validated by HandleMergeInputChains.
> - if (Op.getValueType() == MVT::Other && IgnoreChains)
> - continue;
> + SmallPtrSet<const SDNode *, 16> Visited;
> + SmallVector<const SDNode *, 16> WorkList;
> + // Only check if we have non-immediate uses of Def.
> + if (ImmedUse->isOnlyUserOf(Def))
> + return false;
>
> + // We don't care about paths to Def that go through ImmedUse so mark it
> + // visited and mark non-def operands as used.
> + Visited.insert(ImmedUse);
> + for (const SDValue &Op : ImmedUse->op_values()) {
> + SDNode *N = Op.getNode();
> + // Ignore chain deps (they are validated by
> + // HandleMergeInputChains) and immediate uses
> + if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
> + continue;
> + if (!Visited.insert(N).second)
> + continue;
> + WorkList.push_back(N);
> + }
> +
> + // Initialize worklist to operands of Root.
> + if (Root != ImmedUse) {
> + for (const SDValue &Op : Root->op_values()) {
> SDNode *N = Op.getNode();
> - if (N == Def) {
> - if (Use == ImmedUse || Use == Root)
> - continue; // We are not looking for immediate use.
> - assert(N != Root);
> - return true;
> - }
> -
> - // Traverse up the operand chain.
> + // Ignore chains (they are validated by HandleMergeInputChains)
> + if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
> + continue;
> + if (!Visited.insert(N).second)
> + continue;
> WorkList.push_back(N);
> }
> }
> - return false;
> +
> + return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true);
> }
>
> /// IsProfitableToFold - Returns true if it's profitable to fold the specific
> @@ -2283,13 +2337,12 @@ bool SelectionDAGISel::IsLegalToFold(SDV
>
> // If our query node has a glue result with a use, we've walked up it. If
> // the user (which has already been selected) has a chain or indirectly uses
> - // the chain, our WalkChainUsers predicate will not consider it. Because of
> + // the chain, HandleMergeInputChains will not consider it. Because of
> // this, we cannot ignore chains in this predicate.
> IgnoreChains = false;
> }
>
> - SmallPtrSet<SDNode*, 16> Visited;
> - return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
> + return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
> }
>
> void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
> @@ -2393,7 +2446,7 @@ void SelectionDAGISel::UpdateChains(
> static_cast<SDNode *>(nullptr));
> });
> if (ChainNode->getOpcode() != ISD::TokenFactor)
> - CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
> + ReplaceUses(ChainVal, InputChain);
>
> // If the node became dead and we haven't already seen it, delete it.
> if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
> @@ -2408,143 +2461,6 @@ void SelectionDAGISel::UpdateChains(
> DEBUG(dbgs() << "ISEL: Match complete!\n");
> }
>
> -enum ChainResult {
> - CR_Simple,
> - CR_InducesCycle,
> - CR_LeadsToInteriorNode
> -};
> -
> -/// WalkChainUsers - Walk down the users of the specified chained node that is
> -/// part of the pattern we're matching, looking at all of the users we find.
> -/// This determines whether something is an interior node, whether we have a
> -/// non-pattern node in between two pattern nodes (which prevent folding because
> -/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
> -/// between pattern nodes (in which case the TF becomes part of the pattern).
> -///
> -/// The walk we do here is guaranteed to be small because we quickly get down to
> -/// already selected nodes "below" us.
> -static ChainResult
> -WalkChainUsers(const SDNode *ChainedNode,
> - SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
> - DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
> - SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
> - ChainResult Result = CR_Simple;
> -
> - for (SDNode::use_iterator UI = ChainedNode->use_begin(),
> - E = ChainedNode->use_end(); UI != E; ++UI) {
> - // Make sure the use is of the chain, not some other value we produce.
> - if (UI.getUse().getValueType() != MVT::Other) continue;
> -
> - SDNode *User = *UI;
> -
> - if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
> - continue;
> -
> - // If we see an already-selected machine node, then we've gone beyond the
> - // pattern that we're selecting down into the already selected chunk of the
> - // DAG.
> - unsigned UserOpcode = User->getOpcode();
> - if (User->isMachineOpcode() ||
> - UserOpcode == ISD::CopyToReg ||
> - UserOpcode == ISD::CopyFromReg ||
> - UserOpcode == ISD::INLINEASM ||
> - UserOpcode == ISD::EH_LABEL ||
> - UserOpcode == ISD::LIFETIME_START ||
> - UserOpcode == ISD::LIFETIME_END) {
> - // If their node ID got reset to -1 then they've already been selected.
> - // Treat them like a MachineOpcode.
> - if (User->getNodeId() == -1)
> - continue;
> - }
> -
> - // If we have a TokenFactor, we handle it specially.
> - if (User->getOpcode() != ISD::TokenFactor) {
> - // If the node isn't a token factor and isn't part of our pattern, then it
> - // must be a random chained node in between two nodes we're selecting.
> - // This happens when we have something like:
> - // x = load ptr
> - // call
> - // y = x+4
> - // store y -> ptr
> - // Because we structurally match the load/store as a read/modify/write,
> - // but the call is chained between them. We cannot fold in this case
> - // because it would induce a cycle in the graph.
> - if (!std::count(ChainedNodesInPattern.begin(),
> - ChainedNodesInPattern.end(), User))
> - return CR_InducesCycle;
> -
> - // Otherwise we found a node that is part of our pattern. For example in:
> - // x = load ptr
> - // y = x+4
> - // store y -> ptr
> - // This would happen when we're scanning down from the load and see the
> - // store as a user. Record that there is a use of ChainedNode that is
> - // part of the pattern and keep scanning uses.
> - Result = CR_LeadsToInteriorNode;
> - InteriorChainedNodes.push_back(User);
> - continue;
> - }
> -
> - // If we found a TokenFactor, there are two cases to consider: first if the
> - // TokenFactor is just hanging "below" the pattern we're matching (i.e. no
> - // uses of the TF are in our pattern) we just want to ignore it. Second,
> - // the TokenFactor can be sandwiched in between two chained nodes, like so:
> - // [Load chain]
> - // ^
> - // |
> - // [Load]
> - // ^ ^
> - // | \ DAG's like cheese
> - // / \ do you?
> - // / |
> - // [TokenFactor] [Op]
> - // ^ ^
> - // | |
> - // \ /
> - // \ /
> - // [Store]
> - //
> - // In this case, the TokenFactor becomes part of our match and we rewrite it
> - // as a new TokenFactor.
> - //
> - // To distinguish these two cases, do a recursive walk down the uses.
> - auto MemoizeResult = TokenFactorResult.find(User);
> - bool Visited = MemoizeResult != TokenFactorResult.end();
> - // Recursively walk chain users only if the result is not memoized.
> - if (!Visited) {
> - auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
> - InteriorChainedNodes);
> - MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
> - }
> - switch (MemoizeResult->second) {
> - case CR_Simple:
> - // If the uses of the TokenFactor are just already-selected nodes, ignore
> - // it, it is "below" our pattern.
> - continue;
> - case CR_InducesCycle:
> - // If the uses of the TokenFactor lead to nodes that are not part of our
> - // pattern that are not selected, folding would turn this into a cycle,
> - // bail out now.
> - return CR_InducesCycle;
> - case CR_LeadsToInteriorNode:
> - break; // Otherwise, keep processing.
> - }
> -
> - // Okay, we know we're in the interesting interior case. The TokenFactor
> - // is now going to be considered part of the pattern so that we rewrite its
> - // uses (it may have uses that are not part of the pattern) with the
> - // ultimate chain result of the generated code. We will also add its chain
> - // inputs as inputs to the ultimate TokenFactor we create.
> - Result = CR_LeadsToInteriorNode;
> - if (!Visited) {
> - ChainedNodesInPattern.push_back(User);
> - InteriorChainedNodes.push_back(User);
> - }
> - }
> -
> - return Result;
> -}
> -
> /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
> /// operation for when the pattern matched at least one node with a chains. The
> /// input vector contains a list of all of the chained nodes that we match. We
> @@ -2554,47 +2470,56 @@ WalkChainUsers(const SDNode *ChainedNode
> static SDValue
> HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
> SelectionDAG *CurDAG) {
> - // Used for memoization. Without it WalkChainUsers could take exponential
> - // time to run.
> - DenseMap<const SDNode *, ChainResult> TokenFactorResult;
> - // Walk all of the chained nodes we've matched, recursively scanning down the
> - // users of the chain result. This adds any TokenFactor nodes that are caught
> - // in between chained nodes to the chained and interior nodes list.
> - SmallVector<SDNode*, 3> InteriorChainedNodes;
> - for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
> - if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
> - TokenFactorResult,
> - InteriorChainedNodes) == CR_InducesCycle)
> - return SDValue(); // Would induce a cycle.
> - }
>
> - // Okay, we have walked all the matched nodes and collected TokenFactor nodes
> - // that we are interested in. Form our input TokenFactor node.
> + SmallPtrSet<const SDNode *, 16> Visited;
> + SmallVector<const SDNode *, 8> Worklist;
> SmallVector<SDValue, 3> InputChains;
> - for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
> - // Add the input chain of this node to the InputChains list (which will be
> - // the operands of the generated TokenFactor) if it's not an interior node.
> - SDNode *N = ChainNodesMatched[i];
> - if (N->getOpcode() != ISD::TokenFactor) {
> - if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
> - continue;
> -
> - // Otherwise, add the input chain.
> - SDValue InChain = ChainNodesMatched[i]->getOperand(0);
> - assert(InChain.getValueType() == MVT::Other && "Not a chain");
> - InputChains.push_back(InChain);
> - continue;
> - }
> + unsigned int Max = 8192;
>
> - // If we have a token factor, we want to add all inputs of the token factor
> - // that are not part of the pattern we're matching.
> - for (const SDValue &Op : N->op_values()) {
> - if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
> - Op.getNode()))
> - InputChains.push_back(Op);
> - }
> - }
> + // Quick exit on trivial merge.
> + if (ChainNodesMatched.size() == 1)
> + return ChainNodesMatched[0]->getOperand(0);
> +
> + // Add chains that aren't already added (internal). Peek through
> + // token factors.
> + std::function<void(const SDValue)> AddChains = [&](const SDValue V) {
> + if (V.getValueType() != MVT::Other)
> + return;
> + if (V->getOpcode() == ISD::EntryToken)
> + return;
> + if (!Visited.insert(V.getNode()).second)
> + return;
> + if (V->getOpcode() == ISD::TokenFactor) {
> + for (const SDValue &Op : V->op_values())
> + AddChains(Op);
> + } else
> + InputChains.push_back(V);
> + };
> +
> + for (auto *N : ChainNodesMatched) {
> + Worklist.push_back(N);
> + Visited.insert(N);
> + }
> +
> + while (!Worklist.empty())
> + AddChains(Worklist.pop_back_val()->getOperand(0));
> +
> + // Skip the search if there are no chain dependencies.
> + if (InputChains.size() == 0)
> + return CurDAG->getEntryNode();
> +
> + // If one of these chains is a successor of input, we must have a
> + // node that is both the predecessor and successor of the
> + // to-be-merged nodes. Fail.
> + Visited.clear();
> + for (SDValue V : InputChains)
> + Worklist.push_back(V.getNode());
> +
> + for (auto *N : ChainNodesMatched)
> + if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true))
> + return SDValue();
>
> + // Return merged chain.
> if (InputChains.size() == 1)
> return InputChains[0];
> return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
> @@ -2639,8 +2564,8 @@ MorphNode(SDNode *Node, unsigned TargetO
> // Move the glue if needed.
> if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
> (unsigned)OldGlueResultNo != ResNumResults-1)
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
> - SDValue(Res, ResNumResults-1));
> + ReplaceUses(SDValue(Node, OldGlueResultNo),
> + SDValue(Res, ResNumResults - 1));
>
> if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
> --ResNumResults;
> @@ -2648,14 +2573,15 @@ MorphNode(SDNode *Node, unsigned TargetO
> // Move the chain reference if needed.
> if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
> (unsigned)OldChainResultNo != ResNumResults-1)
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
> - SDValue(Res, ResNumResults-1));
> + ReplaceUses(SDValue(Node, OldChainResultNo),
> + SDValue(Res, ResNumResults - 1));
>
> // Otherwise, no replacement happened because the node already exists. Replace
> // Uses of the old node with the new one.
> if (Res != Node) {
> - CurDAG->ReplaceAllUsesWith(Node, Res);
> - CurDAG->RemoveDeadNode(Node);
> + ReplaceNode(Node, Res);
> + } else {
> + EnforceNodeIdInvariant(Res);
> }
>
> return Res;
> @@ -2972,8 +2898,7 @@ void SelectionDAGISel::SelectCodeCommon(
> return;
> case ISD::AssertSext:
> case ISD::AssertZext:
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
> - NodeToMatch->getOperand(0));
> + ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
> CurDAG->RemoveDeadNode(NodeToMatch);
> return;
> case ISD::INLINEASM:
> @@ -3731,7 +3656,7 @@ void SelectionDAGISel::SelectCodeCommon(
> NodeToMatch->getValueType(i).getSizeInBits() ==
> Res.getValueSizeInBits()) &&
> "invalid replacement");
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
> + ReplaceUses(SDValue(NodeToMatch, i), Res);
> }
>
> // Update chain uses.
> @@ -3744,8 +3669,8 @@ void SelectionDAGISel::SelectCodeCommon(
> if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
> MVT::Glue &&
> InputGlue.getNode())
> - CurDAG->ReplaceAllUsesOfValueWith(
> - SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
> + ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1),
> + InputGlue);
>
> assert(NodeToMatch->use_empty() &&
> "Didn't replace all uses of the node?");
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Mon Mar 19 13:19:46 2018
> @@ -766,12 +766,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I
>
> if (ProduceCarry) {
> // Replace the carry-use
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
> + ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
> }
>
> // Replace the remaining uses.
> - CurDAG->ReplaceAllUsesWith(N, RegSequence);
> - CurDAG->RemoveDeadNode(N);
> + ReplaceNode(N, RegSequence);
> }
>
> void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
>
> Modified: llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp Mon Mar 19 13:19:46 2018
> @@ -500,7 +500,7 @@ bool ARMDAGToDAGISel::canExtractShiftFro
>
> void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
> CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
> - CurDAG->ReplaceAllUsesWith(N, M);
> + ReplaceUses(N, M);
> }
>
> bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
>
> Modified: llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp Mon Mar 19 13:19:46 2018
> @@ -662,7 +662,7 @@ void HexagonDAGToDAGISel::SelectBitcast(
> return;
> }
>
> - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
> + ReplaceUses(SDValue(N, 0), N->getOperand(0));
> CurDAG->RemoveDeadNode(N);
> }
>
> @@ -726,7 +726,6 @@ void HexagonDAGToDAGISel::SelectTypecast
> SDNode *T = CurDAG->MorphNodeTo(N, N->getOpcode(),
> CurDAG->getVTList(OpTy), {Op});
> ReplaceNode(T, Op.getNode());
> - CurDAG->RemoveDeadNode(T);
> }
>
> void HexagonDAGToDAGISel::SelectP2D(SDNode *N) {
> @@ -2185,4 +2184,3 @@ void HexagonDAGToDAGISel::rebalanceAddre
> RootHeights.clear();
> RootWeights.clear();
> }
> -
>
> Modified: llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp (original)
> +++ llvm/trunk/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp Mon Mar 19 13:19:46 2018
> @@ -1953,7 +1953,6 @@ void HvxSelector::selectShuffle(SDNode *
> // If the mask is all -1's, generate "undef".
> if (!UseLeft && !UseRight) {
> ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
> - DAG.RemoveDeadNode(N);
> return;
> }
>
> @@ -2009,7 +2008,6 @@ void HvxSelector::selectRor(SDNode *N) {
> NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
>
> ISel.ReplaceNode(N, NewN);
> - DAG.RemoveDeadNode(N);
> }
>
> void HvxSelector::selectVAlign(SDNode *N) {
> @@ -2074,8 +2072,7 @@ void HexagonDAGToDAGISel::SelectV65Gathe
> MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
> cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
>
> - ReplaceUses(N, Result);
> - CurDAG->RemoveDeadNode(N);
> + ReplaceNode(N, Result);
> }
>
> void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
> @@ -2117,8 +2114,7 @@ void HexagonDAGToDAGISel::SelectV65Gathe
> MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
> cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
>
> - ReplaceUses(N, Result);
> - CurDAG->RemoveDeadNode(N);
> + ReplaceNode(N, Result);
> }
>
> void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
> @@ -2161,5 +2157,3 @@ void HexagonDAGToDAGISel::SelectHVXDualO
> ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
> CurDAG->RemoveDeadNode(N);
> }
> -
> -
>
> Modified: llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp Mon Mar 19 13:19:46 2018
> @@ -596,7 +596,13 @@ static void insertDAGNode(SelectionDAG *
> if (N.getNode()->getNodeId() == -1 ||
> N.getNode()->getNodeId() > Pos->getNodeId()) {
> DAG->RepositionNode(Pos->getIterator(), N.getNode());
> - N.getNode()->setNodeId(Pos->getNodeId());
> + // Mark Node as invalid for pruning as after this it may be a successor to a
> + // selected node but otherwise be in the same position of Pos.
> + // Conservatively mark it with the same -abs(Id) to assure node id
> + // invariant is preserved.
> + int PId = Pos->getNodeId();
> + int InvalidatedPId = -(PId + 1);
> + N->setNodeId((PId > 0) ? InvalidatedPId : PId);
> }
> }
>
> @@ -1027,8 +1033,7 @@ bool SystemZDAGToDAGISel::tryRISBGZero(S
> };
> SDValue New = convertTo(
> DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
> - ReplaceUses(N, New.getNode());
> - CurDAG->RemoveDeadNode(N);
> + ReplaceNode(N, New.getNode());
> return true;
> }
>
> @@ -1119,8 +1124,7 @@ void SystemZDAGToDAGISel::splitLargeImme
> SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
> SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
>
> - ReplaceUses(Node, Or.getNode());
> - CurDAG->RemoveDeadNode(Node);
> + ReplaceNode(Node, Or.getNode());
>
> SelectCode(Or.getNode());
> }
> @@ -1618,4 +1622,3 @@ void SystemZDAGToDAGISel::PreprocessISel
> if (MadeChange)
> CurDAG->RemoveDeadNodes();
> }
> -
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Mon Mar 19 13:19:46 2018
> @@ -2174,50 +2174,84 @@ static bool isFusableLoadOpStorePattern(
> LoadNode->getOffset() != StoreNode->getOffset())
> return false;
>
> - // Check if the chain is produced by the load or is a TokenFactor with
> - // the load output chain as an operand. Return InputChain by reference.
> + bool FoundLoad = false;
> + SmallVector<SDValue, 4> ChainOps;
> + SmallVector<const SDNode *, 4> LoopWorklist;
> + SmallPtrSet<const SDNode *, 16> Visited;
> + const unsigned int Max = 1024;
> +
> + // Visualization of Load-Op-Store fusion:
> + // -------------------------
> + // Legend:
> + // *-lines = Chain operand dependencies.
> + // |-lines = Normal operand dependencies.
> + // Dependencies flow down and right. n-suffix references multiple nodes.
> + //
> + // C Xn C
> + // * * *
> + // * * *
> + // Xn A-LD Yn TF Yn
> + // * * \ | * |
> + // * * \ | * |
> + // * * \ | => A--LD_OP_ST
> + // * * \| \
> + // TF OP \
> + // * | \ Zn
> + // * | \
> + // A-ST Zn
> + //
> +
> + // This merge induced dependences from: #1: Xn -> LD, OP, Zn
> + // #2: Yn -> LD
> + // #3: ST -> Zn
> +
> + // Ensure the transform is safe by checking for the dual
> + // dependencies to make sure we do not induce a loop.
> +
> + // As LD is a predecessor to both OP and ST we can do this by checking:
> + // a). if LD is a predecessor to a member of Xn or Yn.
> + // b). if a Zn is a predecessor to ST.
> +
> + // However, (b) can only occur through being a chain predecessor to
> + // ST, which is the same as Zn being a member or predecessor of Xn,
> + // which is a subset of LD being a predecessor of Xn. So it's
> + // subsumed by check (a).
> +
> SDValue Chain = StoreNode->getChain();
>
> - bool ChainCheck = false;
> + // Gather X elements in ChainOps.
> if (Chain == Load.getValue(1)) {
> - ChainCheck = true;
> - InputChain = LoadNode->getChain();
> + FoundLoad = true;
> + ChainOps.push_back(Load.getOperand(0));
> } else if (Chain.getOpcode() == ISD::TokenFactor) {
> - SmallVector<SDValue, 4> ChainOps;
> for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
> SDValue Op = Chain.getOperand(i);
> if (Op == Load.getValue(1)) {
> - ChainCheck = true;
> + FoundLoad = true;
> // Drop Load, but keep its chain. No cycle check necessary.
> ChainOps.push_back(Load.getOperand(0));
> continue;
> }
> -
> - // Make sure using Op as part of the chain would not cause a cycle here.
> - // In theory, we could check whether the chain node is a predecessor of
> - // the load. But that can be very expensive. Instead visit the uses and
> - // make sure they all have smaller node id than the load.
> - int LoadId = LoadNode->getNodeId();
> - for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
> - UE = UI->use_end(); UI != UE; ++UI) {
> - if (UI.getUse().getResNo() != 0)
> - continue;
> - if (UI->getNodeId() > LoadId)
> - return false;
> - }
> -
> + LoopWorklist.push_back(Op.getNode());
> ChainOps.push_back(Op);
> }
> -
> - if (ChainCheck)
> - // Make a new TokenFactor with all the other input chains except
> - // for the load.
> - InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
> - MVT::Other, ChainOps);
> }
> - if (!ChainCheck)
> +
> + if (!FoundLoad)
> + return false;
> +
> + // Worklist is currently Xn. Add Yn to worklist.
> + for (SDValue Op : StoredVal->ops())
> + if (Op.getNode() != LoadNode)
> + LoopWorklist.push_back(Op.getNode());
> +
> + // Check (a) if Load is a predecessor to Xn + Yn
> + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
> + true))
> return false;
>
> + InputChain =
> + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
> return true;
> }
>
> @@ -2448,6 +2482,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoM
> MemOp[1] = LoadNode->getMemOperand();
> Result->setMemRefs(MemOp, MemOp + 2);
>
> + // Update Load Chain uses as well.
> + ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
> ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
> ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
> CurDAG->RemoveDeadNode(Node);
> @@ -3159,8 +3195,7 @@ void X86DAGToDAGISel::Select(SDNode *Nod
> // Emit a testl or testw.
> SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
> // Replace CMP with TEST.
> - CurDAG->ReplaceAllUsesWith(Node, NewNode);
> - CurDAG->RemoveDeadNode(Node);
> + ReplaceNode(Node, NewNode);
> return;
> }
> break;
>
> Modified: llvm/trunk/test/CodeGen/X86/avg.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avg.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avg.ll Mon Mar 19 13:19:46 2018
> @@ -90,12 +90,12 @@ define void @avg_v16i8(<16 x i8>* %a, <1
> define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
> ; SSE2-LABEL: avg_v32i8:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa 16(%rdi), %xmm0
> -; SSE2-NEXT: movdqa (%rsi), %xmm1
> -; SSE2-NEXT: pavgb (%rdi), %xmm1
> -; SSE2-NEXT: pavgb 16(%rsi), %xmm0
> -; SSE2-NEXT: movdqu %xmm0, (%rax)
> +; SSE2-NEXT: movdqa (%rsi), %xmm0
> +; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: pavgb (%rdi), %xmm0
> +; SSE2-NEXT: pavgb 16(%rdi), %xmm1
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> +; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> ;
> ; AVX1-LABEL: avg_v32i8:
> @@ -528,18 +528,18 @@ define void @avg_v48i8(<48 x i8>* %a, <4
> define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
> ; SSE2-LABEL: avg_v64i8:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa 32(%rdi), %xmm0
> -; SSE2-NEXT: movdqa (%rsi), %xmm1
> -; SSE2-NEXT: movdqa 16(%rsi), %xmm2
> +; SSE2-NEXT: movdqa (%rsi), %xmm0
> +; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: movdqa 32(%rsi), %xmm2
> ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
> -; SSE2-NEXT: pavgb (%rdi), %xmm1
> -; SSE2-NEXT: pavgb 16(%rdi), %xmm2
> -; SSE2-NEXT: pavgb 32(%rsi), %xmm0
> +; SSE2-NEXT: pavgb (%rdi), %xmm0
> +; SSE2-NEXT: pavgb 16(%rdi), %xmm1
> +; SSE2-NEXT: pavgb 32(%rdi), %xmm2
> ; SSE2-NEXT: pavgb 48(%rdi), %xmm3
> ; SSE2-NEXT: movdqu %xmm3, (%rax)
> -; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: movdqu %xmm2, (%rax)
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> +; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> ;
> ; AVX1-LABEL: avg_v64i8:
> @@ -565,23 +565,23 @@ define void @avg_v64i8(<64 x i8>* %a, <6
> ;
> ; AVX2-LABEL: avg_v64i8:
> ; AVX2: # %bb.0:
> -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
> -; AVX2-NEXT: vmovdqa (%rsi), %ymm1
> -; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1
> -; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
> -; AVX2-NEXT: vmovdqu %ymm0, (%rax)
> +; AVX2-NEXT: vmovdqa (%rsi), %ymm0
> +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
> +; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
> ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
> +; AVX2-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX2-NEXT: vzeroupper
> ; AVX2-NEXT: retq
> ;
> ; AVX512F-LABEL: avg_v64i8:
> ; AVX512F: # %bb.0:
> -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
> -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
> -; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1
> -; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
> -; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
> +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
> +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
> +; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
> ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
> +; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX512F-NEXT: vzeroupper
> ; AVX512F-NEXT: retq
> ;
> @@ -661,12 +661,12 @@ define void @avg_v8i16(<8 x i16>* %a, <8
> define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
> ; SSE2-LABEL: avg_v16i16:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa 16(%rdi), %xmm0
> -; SSE2-NEXT: movdqa (%rsi), %xmm1
> -; SSE2-NEXT: pavgw (%rdi), %xmm1
> -; SSE2-NEXT: pavgw 16(%rsi), %xmm0
> -; SSE2-NEXT: movdqu %xmm0, (%rax)
> +; SSE2-NEXT: movdqa (%rsi), %xmm0
> +; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: pavgw (%rdi), %xmm0
> +; SSE2-NEXT: pavgw 16(%rdi), %xmm1
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> +; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> ;
> ; AVX1-LABEL: avg_v16i16:
> @@ -712,18 +712,18 @@ define void @avg_v16i16(<16 x i16>* %a,
> define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
> ; SSE2-LABEL: avg_v32i16:
> ; SSE2: # %bb.0:
> -; SSE2-NEXT: movdqa 32(%rdi), %xmm0
> -; SSE2-NEXT: movdqa (%rsi), %xmm1
> -; SSE2-NEXT: movdqa 16(%rsi), %xmm2
> +; SSE2-NEXT: movdqa (%rsi), %xmm0
> +; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: movdqa 32(%rsi), %xmm2
> ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
> -; SSE2-NEXT: pavgw (%rdi), %xmm1
> -; SSE2-NEXT: pavgw 16(%rdi), %xmm2
> -; SSE2-NEXT: pavgw 32(%rsi), %xmm0
> +; SSE2-NEXT: pavgw (%rdi), %xmm0
> +; SSE2-NEXT: pavgw 16(%rdi), %xmm1
> +; SSE2-NEXT: pavgw 32(%rdi), %xmm2
> ; SSE2-NEXT: pavgw 48(%rdi), %xmm3
> ; SSE2-NEXT: movdqu %xmm3, (%rax)
> -; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: movdqu %xmm2, (%rax)
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> +; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> ;
> ; AVX1-LABEL: avg_v32i16:
> @@ -749,23 +749,23 @@ define void @avg_v32i16(<32 x i16>* %a,
> ;
> ; AVX2-LABEL: avg_v32i16:
> ; AVX2: # %bb.0:
> -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
> -; AVX2-NEXT: vmovdqa (%rsi), %ymm1
> -; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1
> -; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
> -; AVX2-NEXT: vmovdqu %ymm0, (%rax)
> +; AVX2-NEXT: vmovdqa (%rsi), %ymm0
> +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
> +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
> ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
> +; AVX2-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX2-NEXT: vzeroupper
> ; AVX2-NEXT: retq
> ;
> ; AVX512F-LABEL: avg_v32i16:
> ; AVX512F: # %bb.0:
> -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
> -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
> -; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
> -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
> -; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
> +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
> +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
> +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
> ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
> +; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX512F-NEXT: vzeroupper
> ; AVX512F-NEXT: retq
> ;
> @@ -874,9 +874,9 @@ define void @avg_v32i8_2(<32 x i8>* %a,
> ; SSE2-LABEL: avg_v32i8_2:
> ; SSE2: # %bb.0:
> ; SSE2-NEXT: movdqa (%rdi), %xmm0
> -; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: movdqa 16(%rdi), %xmm1
> ; SSE2-NEXT: pavgb (%rsi), %xmm0
> -; SSE2-NEXT: pavgb 16(%rdi), %xmm1
> +; SSE2-NEXT: pavgb 16(%rsi), %xmm1
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> ; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> @@ -1055,9 +1055,9 @@ define void @avg_v16i16_2(<16 x i16>* %a
> ; SSE2-LABEL: avg_v16i16_2:
> ; SSE2: # %bb.0:
> ; SSE2-NEXT: movdqa (%rdi), %xmm0
> -; SSE2-NEXT: movdqa 16(%rsi), %xmm1
> +; SSE2-NEXT: movdqa 16(%rdi), %xmm1
> ; SSE2-NEXT: pavgw (%rsi), %xmm0
> -; SSE2-NEXT: pavgw 16(%rdi), %xmm1
> +; SSE2-NEXT: pavgw 16(%rsi), %xmm1
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> ; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> @@ -1107,14 +1107,14 @@ define void @avg_v32i16_2(<32 x i16>* %a
> ; SSE2: # %bb.0:
> ; SSE2-NEXT: movdqa (%rdi), %xmm0
> ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
> -; SSE2-NEXT: movdqa 48(%rdi), %xmm2
> -; SSE2-NEXT: movdqa 32(%rsi), %xmm3
> +; SSE2-NEXT: movdqa 32(%rdi), %xmm2
> +; SSE2-NEXT: movdqa 48(%rdi), %xmm3
> ; SSE2-NEXT: pavgw (%rsi), %xmm0
> ; SSE2-NEXT: pavgw 16(%rsi), %xmm1
> -; SSE2-NEXT: pavgw 32(%rdi), %xmm3
> -; SSE2-NEXT: pavgw 48(%rsi), %xmm2
> -; SSE2-NEXT: movdqu %xmm2, (%rax)
> +; SSE2-NEXT: pavgw 32(%rsi), %xmm2
> +; SSE2-NEXT: pavgw 48(%rsi), %xmm3
> ; SSE2-NEXT: movdqu %xmm3, (%rax)
> +; SSE2-NEXT: movdqu %xmm2, (%rax)
> ; SSE2-NEXT: movdqu %xmm1, (%rax)
> ; SSE2-NEXT: movdqu %xmm0, (%rax)
> ; SSE2-NEXT: retq
> @@ -1143,9 +1143,9 @@ define void @avg_v32i16_2(<32 x i16>* %a
> ; AVX2-LABEL: avg_v32i16_2:
> ; AVX2: # %bb.0:
> ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
> -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
> ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
> -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
> +; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
> ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
> ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX2-NEXT: vzeroupper
> @@ -1154,9 +1154,9 @@ define void @avg_v32i16_2(<32 x i16>* %a
> ; AVX512F-LABEL: avg_v32i16_2:
> ; AVX512F: # %bb.0:
> ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
> -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
> +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
> ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
> -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
> +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
> ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
> ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
> ; AVX512F-NEXT: vzeroupper
>
> Modified: llvm/trunk/test/CodeGen/X86/avx-vbroadcastf128.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vbroadcastf128.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx-vbroadcastf128.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx-vbroadcastf128.ll Mon Mar 19 13:19:46 2018
> @@ -235,18 +235,16 @@ define <8 x i32> @PR29088(<4 x i32>* %p0
> ; X32: # %bb.0:
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-NEXT: vmovaps (%ecx), %xmm0
> ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-NEXT: vmovaps %ymm1, (%eax)
> -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-NEXT: retl
> ;
> ; X64-LABEL: PR29088:
> ; X64: # %bb.0:
> -; X64-NEXT: vmovaps (%rdi), %xmm0
> ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-NEXT: vmovaps %ymm1, (%rsi)
> -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-NEXT: retq
> %ld = load <4 x i32>, <4 x i32>* %p0
> store <8 x float> zeroinitializer, <8 x float>* %p1
>
> Modified: llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll Mon Mar 19 13:19:46 2018
> @@ -1065,9 +1065,7 @@ define void @isel_crash_16b(i8* %cV_R.ad
> ; X64: ## %bb.0: ## %eintry
> ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-NEXT: movb (%rdi), %al
> -; X64-NEXT: vmovd %eax, %xmm1
> -; X64-NEXT: vpbroadcastb %xmm1, %xmm1
> +; X64-NEXT: vpbroadcastb (%rdi), %xmm1
> ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> ; X64-NEXT: retq
> @@ -1118,9 +1116,7 @@ define void @isel_crash_32b(i8* %cV_R.ad
> ; X64-NEXT: subq $128, %rsp
> ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> ; X64-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-NEXT: movb (%rdi), %al
> -; X64-NEXT: vmovd %eax, %xmm1
> -; X64-NEXT: vpbroadcastb %xmm1, %ymm1
> +; X64-NEXT: vpbroadcastb (%rdi), %ymm1
> ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> ; X64-NEXT: movq %rbp, %rsp
> @@ -1160,9 +1156,7 @@ define void @isel_crash_8w(i16* %cV_R.ad
> ; X64: ## %bb.0: ## %entry
> ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-NEXT: movzwl (%rdi), %eax
> -; X64-NEXT: vmovd %eax, %xmm1
> -; X64-NEXT: vpbroadcastw %xmm1, %xmm1
> +; X64-NEXT: vpbroadcastw (%rdi), %xmm1
> ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> ; X64-NEXT: retq
> @@ -1213,9 +1207,7 @@ define void @isel_crash_16w(i16* %cV_R.a
> ; X64-NEXT: subq $128, %rsp
> ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> ; X64-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-NEXT: movzwl (%rdi), %eax
> -; X64-NEXT: vmovd %eax, %xmm1
> -; X64-NEXT: vpbroadcastw %xmm1, %ymm1
> +; X64-NEXT: vpbroadcastw (%rdi), %ymm1
> ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> ; X64-NEXT: movq %rbp, %rsp
> @@ -1251,26 +1243,14 @@ define void @isel_crash_4d(i32* %cV_R.ad
> ; X32-NEXT: addl $60, %esp
> ; X32-NEXT: retl
> ;
> -; X64-AVX2-LABEL: isel_crash_4d:
> -; X64-AVX2: ## %bb.0: ## %entry
> -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: movl (%rdi), %eax
> -; X64-AVX2-NEXT: vmovd %eax, %xmm1
> -; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
> -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: retq
> -;
> -; X64-AVX512VL-LABEL: isel_crash_4d:
> -; X64-AVX512VL: ## %bb.0: ## %entry
> -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: movl (%rdi), %eax
> -; X64-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
> -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: retq
> +; X64-LABEL: isel_crash_4d:
> +; X64: ## %bb.0: ## %entry
> +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: vbroadcastss (%rdi), %xmm1
> +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: retq
> entry:
> %__a.addr.i = alloca <2 x i64>, align 16
> %__b.addr.i = alloca <2 x i64>, align 16
> @@ -1307,46 +1287,24 @@ define void @isel_crash_8d(i32* %cV_R.ad
> ; X32-NEXT: vzeroupper
> ; X32-NEXT: retl
> ;
> -; X64-AVX2-LABEL: isel_crash_8d:
> -; X64-AVX2: ## %bb.0: ## %eintry
> -; X64-AVX2-NEXT: pushq %rbp
> -; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
> -; X64-AVX2-NEXT: .cfi_offset %rbp, -16
> -; X64-AVX2-NEXT: movq %rsp, %rbp
> -; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
> -; X64-AVX2-NEXT: andq $-32, %rsp
> -; X64-AVX2-NEXT: subq $128, %rsp
> -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-AVX2-NEXT: movl (%rdi), %eax
> -; X64-AVX2-NEXT: vmovd %eax, %xmm1
> -; X64-AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
> -; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: movq %rbp, %rsp
> -; X64-AVX2-NEXT: popq %rbp
> -; X64-AVX2-NEXT: vzeroupper
> -; X64-AVX2-NEXT: retq
> -;
> -; X64-AVX512VL-LABEL: isel_crash_8d:
> -; X64-AVX512VL: ## %bb.0: ## %eintry
> -; X64-AVX512VL-NEXT: pushq %rbp
> -; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
> -; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
> -; X64-AVX512VL-NEXT: movq %rsp, %rbp
> -; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
> -; X64-AVX512VL-NEXT: andq $-32, %rsp
> -; X64-AVX512VL-NEXT: subq $128, %rsp
> -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-AVX512VL-NEXT: movl (%rdi), %eax
> -; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1
> -; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: movq %rbp, %rsp
> -; X64-AVX512VL-NEXT: popq %rbp
> -; X64-AVX512VL-NEXT: vzeroupper
> -; X64-AVX512VL-NEXT: retq
> +; X64-LABEL: isel_crash_8d:
> +; X64: ## %bb.0: ## %eintry
> +; X64-NEXT: pushq %rbp
> +; X64-NEXT: .cfi_def_cfa_offset 16
> +; X64-NEXT: .cfi_offset %rbp, -16
> +; X64-NEXT: movq %rsp, %rbp
> +; X64-NEXT: .cfi_def_cfa_register %rbp
> +; X64-NEXT: andq $-32, %rsp
> +; X64-NEXT: subq $128, %rsp
> +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; X64-NEXT: vmovaps %ymm0, (%rsp)
> +; X64-NEXT: vbroadcastss (%rdi), %ymm1
> +; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> +; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
> +; X64-NEXT: movq %rbp, %rsp
> +; X64-NEXT: popq %rbp
> +; X64-NEXT: vzeroupper
> +; X64-NEXT: retq
> eintry:
> %__a.addr.i = alloca <4 x i64>, align 16
> %__b.addr.i = alloca <4 x i64>, align 16
> @@ -1370,33 +1328,20 @@ define void @isel_crash_2q(i64* %cV_R.ad
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
> ; X32-NEXT: vmovaps %xmm0, (%esp)
> -; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
> -; X32-NEXT: vpbroadcastq %xmm1, %xmm1
> +; X32-NEXT: vpbroadcastq (%eax), %xmm1
> ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
> ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
> ; X32-NEXT: addl $60, %esp
> ; X32-NEXT: retl
> ;
> -; X64-AVX2-LABEL: isel_crash_2q:
> -; X64-AVX2: ## %bb.0: ## %entry
> -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: movq (%rdi), %rax
> -; X64-AVX2-NEXT: vmovq %rax, %xmm1
> -; X64-AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
> -; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: retq
> -;
> -; X64-AVX512VL-LABEL: isel_crash_2q:
> -; X64-AVX512VL: ## %bb.0: ## %entry
> -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: movq (%rdi), %rax
> -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm1
> -; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: retq
> +; X64-LABEL: isel_crash_2q:
> +; X64: ## %bb.0: ## %entry
> +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: vpbroadcastq (%rdi), %xmm1
> +; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
> +; X64-NEXT: retq
> entry:
> %__a.addr.i = alloca <2 x i64>, align 16
> %__b.addr.i = alloca <2 x i64>, align 16
> @@ -1433,46 +1378,24 @@ define void @isel_crash_4q(i64* %cV_R.ad
> ; X32-NEXT: vzeroupper
> ; X32-NEXT: retl
> ;
> -; X64-AVX2-LABEL: isel_crash_4q:
> -; X64-AVX2: ## %bb.0: ## %eintry
> -; X64-AVX2-NEXT: pushq %rbp
> -; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
> -; X64-AVX2-NEXT: .cfi_offset %rbp, -16
> -; X64-AVX2-NEXT: movq %rsp, %rbp
> -; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
> -; X64-AVX2-NEXT: andq $-32, %rsp
> -; X64-AVX2-NEXT: subq $128, %rsp
> -; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-AVX2-NEXT: movq (%rdi), %rax
> -; X64-AVX2-NEXT: vmovq %rax, %xmm1
> -; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
> -; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> -; X64-AVX2-NEXT: movq %rbp, %rsp
> -; X64-AVX2-NEXT: popq %rbp
> -; X64-AVX2-NEXT: vzeroupper
> -; X64-AVX2-NEXT: retq
> -;
> -; X64-AVX512VL-LABEL: isel_crash_4q:
> -; X64-AVX512VL: ## %bb.0: ## %eintry
> -; X64-AVX512VL-NEXT: pushq %rbp
> -; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
> -; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
> -; X64-AVX512VL-NEXT: movq %rsp, %rbp
> -; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
> -; X64-AVX512VL-NEXT: andq $-32, %rsp
> -; X64-AVX512VL-NEXT: subq $128, %rsp
> -; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
> -; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
> -; X64-AVX512VL-NEXT: movq (%rdi), %rax
> -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1
> -; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
> -; X64-AVX512VL-NEXT: movq %rbp, %rsp
> -; X64-AVX512VL-NEXT: popq %rbp
> -; X64-AVX512VL-NEXT: vzeroupper
> -; X64-AVX512VL-NEXT: retq
> +; X64-LABEL: isel_crash_4q:
> +; X64: ## %bb.0: ## %eintry
> +; X64-NEXT: pushq %rbp
> +; X64-NEXT: .cfi_def_cfa_offset 16
> +; X64-NEXT: .cfi_offset %rbp, -16
> +; X64-NEXT: movq %rsp, %rbp
> +; X64-NEXT: .cfi_def_cfa_register %rbp
> +; X64-NEXT: andq $-32, %rsp
> +; X64-NEXT: subq $128, %rsp
> +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; X64-NEXT: vmovaps %ymm0, (%rsp)
> +; X64-NEXT: vbroadcastsd (%rdi), %ymm1
> +; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
> +; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
> +; X64-NEXT: movq %rbp, %rsp
> +; X64-NEXT: popq %rbp
> +; X64-NEXT: vzeroupper
> +; X64-NEXT: retq
> eintry:
> %__a.addr.i = alloca <4 x i64>, align 16
> %__b.addr.i = alloca <4 x i64>, align 16
>
> Modified: llvm/trunk/test/CodeGen/X86/avx2-vbroadcasti128.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vbroadcasti128.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx2-vbroadcasti128.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx2-vbroadcasti128.ll Mon Mar 19 13:19:46 2018
> @@ -271,18 +271,16 @@ define <8 x i32> @PR29088(<4 x i32>* %p0
> ; X32: # %bb.0:
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-NEXT: vmovaps (%ecx), %xmm0
> ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-NEXT: vmovaps %ymm1, (%eax)
> -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-NEXT: retl
> ;
> ; X64-LABEL: PR29088:
> ; X64: # %bb.0:
> -; X64-NEXT: vmovaps (%rdi), %xmm0
> ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-NEXT: vmovaps %ymm1, (%rsi)
> -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-NEXT: retq
> %ld = load <4 x i32>, <4 x i32>* %p0
> store <8 x float> zeroinitializer, <8 x float>* %p1
>
> Modified: llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti128.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti128.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti128.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti128.ll Mon Mar 19 13:19:46 2018
> @@ -186,26 +186,23 @@ define <64 x i8> @test_broadcast_16i8_64
> define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
> ; X64-AVX512VL-LABEL: PR29088:
> ; X64-AVX512VL: ## %bb.0:
> -; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
> -; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512VL-NEXT: retq
> ;
> ; X64-AVX512BWVL-LABEL: PR29088:
> ; X64-AVX512BWVL: ## %bb.0:
> -; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
> -; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512BWVL-NEXT: retq
> ;
> ; X64-AVX512DQVL-LABEL: PR29088:
> ; X64-AVX512DQVL: ## %bb.0:
> -; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
> -; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512DQVL-NEXT: retq
> %ld = load <4 x i32>, <4 x i32>* %p0
> store <8 x float> zeroinitializer, <8 x float>* %p1
>
> Modified: llvm/trunk/test/CodeGen/X86/i256-add.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/i256-add.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/i256-add.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/i256-add.ll Mon Mar 19 13:19:46 2018
> @@ -9,40 +9,30 @@ define void @add(i256* %p, i256* %q) nou
> ; X32-NEXT: pushl %ebx
> ; X32-NEXT: pushl %edi
> ; X32-NEXT: pushl %esi
> -; X32-NEXT: subl $12, %esp
> +; X32-NEXT: subl $8, %esp
> +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> +; X32-NEXT: movl 28(%eax), %ecx
> +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
> +; X32-NEXT: movl 24(%eax), %ecx
> +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
> +; X32-NEXT: movl 20(%eax), %esi
> +; X32-NEXT: movl 16(%eax), %edi
> +; X32-NEXT: movl 12(%eax), %ebx
> +; X32-NEXT: movl 8(%eax), %ebp
> +; X32-NEXT: movl (%eax), %ecx
> +; X32-NEXT: movl 4(%eax), %edx
> ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-NEXT: movl 8(%ecx), %edi
> -; X32-NEXT: movl (%ecx), %edx
> -; X32-NEXT: movl 4(%ecx), %ebx
> -; X32-NEXT: movl 28(%eax), %esi
> -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
> -; X32-NEXT: movl 24(%eax), %ebp
> -; X32-NEXT: addl (%eax), %edx
> -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
> -; X32-NEXT: adcl 4(%eax), %ebx
> -; X32-NEXT: adcl 8(%eax), %edi
> -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
> -; X32-NEXT: movl 20(%eax), %edi
> -; X32-NEXT: movl 12(%eax), %edx
> -; X32-NEXT: movl 16(%eax), %esi
> -; X32-NEXT: adcl 12(%ecx), %edx
> -; X32-NEXT: adcl 16(%ecx), %esi
> -; X32-NEXT: adcl 20(%ecx), %edi
> -; X32-NEXT: movl %ebp, %eax
> -; X32-NEXT: adcl 24(%ecx), %eax
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
> -; X32-NEXT: adcl %ebp, 28(%ecx)
> -; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
> -; X32-NEXT: movl %ebp, 8(%ecx)
> -; X32-NEXT: movl %ebx, 4(%ecx)
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
> -; X32-NEXT: movl %ebx, (%ecx)
> -; X32-NEXT: movl %edx, 12(%ecx)
> -; X32-NEXT: movl %esi, 16(%ecx)
> -; X32-NEXT: movl %edi, 20(%ecx)
> -; X32-NEXT: movl %eax, 24(%ecx)
> -; X32-NEXT: addl $12, %esp
> +; X32-NEXT: addl %ecx, (%eax)
> +; X32-NEXT: adcl %edx, 4(%eax)
> +; X32-NEXT: adcl %ebp, 8(%eax)
> +; X32-NEXT: adcl %ebx, 12(%eax)
> +; X32-NEXT: adcl %edi, 16(%eax)
> +; X32-NEXT: adcl %esi, 20(%eax)
> +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
> +; X32-NEXT: adcl %ecx, 24(%eax)
> +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
> +; X32-NEXT: adcl %ecx, 28(%eax)
> +; X32-NEXT: addl $8, %esp
> ; X32-NEXT: popl %esi
> ; X32-NEXT: popl %edi
> ; X32-NEXT: popl %ebx
> @@ -51,17 +41,14 @@ define void @add(i256* %p, i256* %q) nou
> ;
> ; X64-LABEL: add:
> ; X64: # %bb.0:
> -; X64-NEXT: movq 16(%rdi), %rax
> -; X64-NEXT: movq (%rdi), %rcx
> -; X64-NEXT: movq 8(%rdi), %rdx
> -; X64-NEXT: movq 24(%rsi), %r8
> -; X64-NEXT: addq (%rsi), %rcx
> -; X64-NEXT: adcq 8(%rsi), %rdx
> -; X64-NEXT: adcq 16(%rsi), %rax
> -; X64-NEXT: adcq %r8, 24(%rdi)
> -; X64-NEXT: movq %rax, 16(%rdi)
> -; X64-NEXT: movq %rdx, 8(%rdi)
> -; X64-NEXT: movq %rcx, (%rdi)
> +; X64-NEXT: movq 24(%rsi), %rax
> +; X64-NEXT: movq 16(%rsi), %rcx
> +; X64-NEXT: movq (%rsi), %rdx
> +; X64-NEXT: movq 8(%rsi), %rsi
> +; X64-NEXT: addq %rdx, (%rdi)
> +; X64-NEXT: adcq %rsi, 8(%rdi)
> +; X64-NEXT: adcq %rcx, 16(%rdi)
> +; X64-NEXT: adcq %rax, 24(%rdi)
> ; X64-NEXT: retq
> %a = load i256, i256* %p
> %b = load i256, i256* %q
> @@ -77,35 +64,28 @@ define void @sub(i256* %p, i256* %q) nou
> ; X32-NEXT: pushl %edi
> ; X32-NEXT: pushl %esi
> ; X32-NEXT: subl $8, %esp
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-NEXT: movl 16(%ecx), %eax
> -; X32-NEXT: movl 12(%ecx), %edx
> -; X32-NEXT: movl 8(%ecx), %edi
> -; X32-NEXT: movl (%ecx), %ebx
> -; X32-NEXT: movl 4(%ecx), %ebp
> -; X32-NEXT: subl (%esi), %ebx
> -; X32-NEXT: sbbl 4(%esi), %ebp
> -; X32-NEXT: sbbl 8(%esi), %edi
> -; X32-NEXT: sbbl 12(%esi), %edx
> -; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
> -; X32-NEXT: sbbl 16(%esi), %eax
> -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
> -; X32-NEXT: movl 20(%ecx), %edx
> -; X32-NEXT: sbbl 20(%esi), %edx
> -; X32-NEXT: movl 24(%ecx), %eax
> -; X32-NEXT: sbbl 24(%esi), %eax
> -; X32-NEXT: movl 28(%esi), %esi
> -; X32-NEXT: sbbl %esi, 28(%ecx)
> -; X32-NEXT: movl %edi, 8(%ecx)
> -; X32-NEXT: movl %ebp, 4(%ecx)
> -; X32-NEXT: movl %ebx, (%ecx)
> -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
> -; X32-NEXT: movl %esi, 12(%ecx)
> -; X32-NEXT: movl (%esp), %esi # 4-byte Reload
> -; X32-NEXT: movl %esi, 16(%ecx)
> -; X32-NEXT: movl %edx, 20(%ecx)
> -; X32-NEXT: movl %eax, 24(%ecx)
> +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> +; X32-NEXT: movl 28(%eax), %ecx
> +; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
> +; X32-NEXT: movl 24(%eax), %ecx
> +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
> +; X32-NEXT: movl 20(%eax), %esi
> +; X32-NEXT: movl 16(%eax), %edi
> +; X32-NEXT: movl 12(%eax), %ebx
> +; X32-NEXT: movl 8(%eax), %ebp
> +; X32-NEXT: movl (%eax), %ecx
> +; X32-NEXT: movl 4(%eax), %edx
> +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
> +; X32-NEXT: subl %ecx, (%eax)
> +; X32-NEXT: sbbl %edx, 4(%eax)
> +; X32-NEXT: sbbl %ebp, 8(%eax)
> +; X32-NEXT: sbbl %ebx, 12(%eax)
> +; X32-NEXT: sbbl %edi, 16(%eax)
> +; X32-NEXT: sbbl %esi, 20(%eax)
> +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
> +; X32-NEXT: sbbl %ecx, 24(%eax)
> +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
> +; X32-NEXT: sbbl %ecx, 28(%eax)
> ; X32-NEXT: addl $8, %esp
> ; X32-NEXT: popl %esi
> ; X32-NEXT: popl %edi
> @@ -115,17 +95,14 @@ define void @sub(i256* %p, i256* %q) nou
> ;
> ; X64-LABEL: sub:
> ; X64: # %bb.0:
> -; X64-NEXT: movq 16(%rdi), %rax
> -; X64-NEXT: movq (%rdi), %rcx
> -; X64-NEXT: movq 8(%rdi), %rdx
> -; X64-NEXT: movq 24(%rsi), %r8
> -; X64-NEXT: subq (%rsi), %rcx
> -; X64-NEXT: sbbq 8(%rsi), %rdx
> -; X64-NEXT: sbbq 16(%rsi), %rax
> -; X64-NEXT: sbbq %r8, 24(%rdi)
> -; X64-NEXT: movq %rax, 16(%rdi)
> -; X64-NEXT: movq %rdx, 8(%rdi)
> -; X64-NEXT: movq %rcx, (%rdi)
> +; X64-NEXT: movq 24(%rsi), %rax
> +; X64-NEXT: movq 16(%rsi), %rcx
> +; X64-NEXT: movq (%rsi), %rdx
> +; X64-NEXT: movq 8(%rsi), %rsi
> +; X64-NEXT: subq %rdx, (%rdi)
> +; X64-NEXT: sbbq %rsi, 8(%rdi)
> +; X64-NEXT: sbbq %rcx, 16(%rdi)
> +; X64-NEXT: sbbq %rax, 24(%rdi)
> ; X64-NEXT: retq
> %a = load i256, i256* %p
> %b = load i256, i256* %q
>
> Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Mon Mar 19 13:19:46 2018
> @@ -1264,8 +1264,7 @@ define <8 x double> @load_one_mask_bit_s
> ; AVX-LABEL: load_one_mask_bit_set5:
> ; AVX: ## %bb.0:
> ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
> -; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
> -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
> +; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
> ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
> ; AVX-NEXT: retq
> ;
>
> Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-stores.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores.ll Mon Mar 19 13:19:46 2018
> @@ -10,12 +10,11 @@ define i32 @foo (i64* %so) nounwind uwta
> ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; CHECK-NEXT: movl $0, 28(%eax)
> ; CHECK-NEXT: movl $0, 24(%eax)
> -; CHECK-NEXT: movl 20(%eax), %ecx
> -; CHECK-NEXT: movl $0, 20(%eax)
> -; CHECK-NEXT: xorl %edx, %edx
> -; CHECK-NEXT: cmpl 16(%eax), %edx
> +; CHECK-NEXT: xorl %ecx, %ecx
> +; CHECK-NEXT: cmpl 16(%eax), %ecx
> ; CHECK-NEXT: movl $0, 16(%eax)
> -; CHECK-NEXT: sbbl %ecx, %edx
> +; CHECK-NEXT: sbbl 20(%eax), %ecx
> +; CHECK-NEXT: movl $0, 20(%eax)
> ; CHECK-NEXT: setl %al
> ; CHECK-NEXT: movzbl %al, %eax
> ; CHECK-NEXT: negl %eax
>
> Modified: llvm/trunk/test/CodeGen/X86/nontemporal.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/nontemporal.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/nontemporal.ll Mon Mar 19 13:19:46 2018
> @@ -13,36 +13,35 @@ define i32 @f(<4 x float> %A, i8* %B, <2
> ; X32-SSE-NEXT: andl $-16, %esp
> ; X32-SSE-NEXT: subl $16, %esp
> ; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
> -; X32-SSE-NEXT: movl 12(%ebp), %eax
> +; X32-SSE-NEXT: movl 12(%ebp), %ecx
> ; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4
> ; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5
> ; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6
> -; X32-SSE-NEXT: movl 8(%ebp), %edx
> -; X32-SSE-NEXT: movl 80(%ebp), %ecx
> -; X32-SSE-NEXT: movl (%ecx), %esi
> +; X32-SSE-NEXT: movl 8(%ebp), %esi
> +; X32-SSE-NEXT: movl 80(%ebp), %edx
> +; X32-SSE-NEXT: movl (%edx), %eax
> ; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
> -; X32-SSE-NEXT: movntps %xmm0, (%edx)
> +; X32-SSE-NEXT: movntps %xmm0, (%esi)
> ; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntdq %xmm2, (%edx)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntdq %xmm2, (%esi)
> ; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntpd %xmm1, (%edx)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntpd %xmm1, (%esi)
> ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntdq %xmm6, (%edx)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntdq %xmm6, (%esi)
> ; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntdq %xmm5, (%edx)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntdq %xmm5, (%esi)
> ; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntdq %xmm4, (%edx)
> -; X32-SSE-NEXT: addl (%ecx), %esi
> -; X32-SSE-NEXT: movntil %eax, (%edx)
> -; X32-SSE-NEXT: movl (%ecx), %eax
> -; X32-SSE-NEXT: addl %esi, %eax
> -; X32-SSE-NEXT: movsd %xmm3, (%edx)
> -; X32-SSE-NEXT: addl (%ecx), %eax
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntdq %xmm4, (%esi)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movntil %ecx, (%esi)
> +; X32-SSE-NEXT: addl (%edx), %eax
> +; X32-SSE-NEXT: movsd %xmm3, (%esi)
> +; X32-SSE-NEXT: addl (%edx), %eax
> ; X32-SSE-NEXT: leal -4(%ebp), %esp
> ; X32-SSE-NEXT: popl %esi
> ; X32-SSE-NEXT: popl %ebp
> @@ -56,36 +55,35 @@ define i32 @f(<4 x float> %A, i8* %B, <2
> ; X32-AVX-NEXT: andl $-16, %esp
> ; X32-AVX-NEXT: subl $16, %esp
> ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
> -; X32-AVX-NEXT: movl 12(%ebp), %eax
> +; X32-AVX-NEXT: movl 12(%ebp), %ecx
> ; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4
> ; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5
> ; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6
> -; X32-AVX-NEXT: movl 8(%ebp), %ecx
> -; X32-AVX-NEXT: movl 80(%ebp), %edx
> -; X32-AVX-NEXT: movl (%edx), %esi
> +; X32-AVX-NEXT: movl 8(%ebp), %edx
> +; X32-AVX-NEXT: movl 80(%ebp), %esi
> +; X32-AVX-NEXT: movl (%esi), %eax
> ; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
> -; X32-AVX-NEXT: vmovntps %xmm0, (%ecx)
> +; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
> ; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
> ; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
> ; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
> ; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
> ; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
> -; X32-AVX-NEXT: addl (%edx), %esi
> -; X32-AVX-NEXT: movntil %eax, (%ecx)
> -; X32-AVX-NEXT: movl (%edx), %eax
> -; X32-AVX-NEXT: addl %esi, %eax
> -; X32-AVX-NEXT: vmovsd %xmm3, (%ecx)
> -; X32-AVX-NEXT: addl (%edx), %eax
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: movntil %ecx, (%edx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> +; X32-AVX-NEXT: vmovsd %xmm3, (%edx)
> +; X32-AVX-NEXT: addl (%esi), %eax
> ; X32-AVX-NEXT: leal -4(%ebp), %esp
> ; X32-AVX-NEXT: popl %esi
> ; X32-AVX-NEXT: popl %ebp
>
> Added: llvm/trunk/test/CodeGen/X86/pr36274.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr36274.ll?rev=327898&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/pr36274.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/pr36274.ll Mon Mar 19 13:19:46 2018
> @@ -0,0 +1,33 @@
> +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
> +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
> +
> +; This test is checking for a case where the x86 load-op-store fusion
> +; misses a dependence between the fused load and a non-fused operand
> +; to the load causing a cycle. Here the dependence in question comes
> +; from the carry in input of the adcl.
> +
> + at vx = external local_unnamed_addr global <2 x i32>, align 8
> +
> +define void @pr36274(i32* %somewhere) {
> +; CHECK-LABEL: pr36274:
> +; CHECK: # %bb.0:
> +; CHECK-NEXT: movl vx+4, %eax
> +; CHECK-NEXT: addl $1, vx
> +; CHECK-NEXT: adcl $0, %eax
> +; CHECK-NEXT: movl %eax, vx+4
> +; CHECK-NEXT: retl
> + %a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0
> + %a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1
> + %x1 = load volatile i32, i32* %a1, align 4
> + %x0 = load volatile i32, i32* %a0, align 8
> + %vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0
> + %vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1
> + %x = bitcast <2 x i32> %vx1 to i64
> + %add = add i64 %x, 1
> + %vadd = bitcast i64 %add to <2 x i32>
> + %vx1_0 = extractelement <2 x i32> %vadd, i32 0
> + %vx1_1 = extractelement <2 x i32> %vadd, i32 1
> + store i32 %vx1_0, i32* %a0, align 8
> + store i32 %vx1_1, i32* %a1, align 4
> + ret void
> +}
>
> Added: llvm/trunk/test/CodeGen/X86/pr36312.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr36312.ll?rev=327898&view=auto
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/pr36312.ll (added)
> +++ llvm/trunk/test/CodeGen/X86/pr36312.ll Mon Mar 19 13:19:46 2018
> @@ -0,0 +1,35 @@
> +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
> +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
> +
> +%struct.anon = type { i32, i32 }
> +
> + at c = common global %struct.anon zeroinitializer, align 4
> + at d = local_unnamed_addr global %struct.anon* @c, align 8
> + at a = common local_unnamed_addr global i32 0, align 4
> + at b = common local_unnamed_addr global i32 0, align 4
> +
> +; Function Attrs: norecurse nounwind uwtable
> +define void @g() local_unnamed_addr #0 {
> +; CHECK-LABEL: g:
> +; CHECK: # %bb.0: # %entry
> +; CHECK-NEXT: movq {{.*}}(%rip), %rax
> +; CHECK-NEXT: movl 4(%rax), %eax
> +; CHECK-NEXT: xorl %ecx, %ecx
> +; CHECK-NEXT: incl {{.*}}(%rip)
> +; CHECK-NEXT: setne %cl
> +; CHECK-NEXT: addl %eax, %ecx
> +; CHECK-NEXT: movl %ecx, {{.*}}(%rip)
> +; CHECK-NEXT: retq
> +entry:
> + %0 = load %struct.anon*, %struct.anon** @d, align 8
> + %y = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
> + %1 = load i32, i32* %y, align 4
> + %2 = load i32, i32* @b, align 4
> + %inc = add nsw i32 %2, 1
> + store i32 %inc, i32* @b, align 4
> + %tobool = icmp ne i32 %inc, 0
> + %land.ext = zext i1 %tobool to i32
> + %add = add nsw i32 %1, %land.ext
> + store i32 %add, i32* @a, align 4
> + ret void
> +}
>
> Modified: llvm/trunk/test/CodeGen/X86/required-vector-width.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/required-vector-width.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/required-vector-width.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/required-vector-width.ll Mon Mar 19 13:19:46 2018
> @@ -39,12 +39,12 @@ define void @add512(<16 x i32>* %a, <16
> define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
> ; CHECK-LABEL: avg_v64i8_256:
> ; CHECK: # %bb.0:
> -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
> -; CHECK-NEXT: vmovdqa (%rsi), %ymm1
> -; CHECK-NEXT: vpavgb (%rdi), %ymm1, %ymm1
> -; CHECK-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
> -; CHECK-NEXT: vmovdqu %ymm0, (%rax)
> +; CHECK-NEXT: vmovdqa (%rsi), %ymm0
> +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
> +; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0
> +; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
> ; CHECK-NEXT: vmovdqu %ymm1, (%rax)
> +; CHECK-NEXT: vmovdqu %ymm0, (%rax)
> ; CHECK-NEXT: vzeroupper
> ; CHECK-NEXT: retq
> %1 = load <64 x i8>, <64 x i8>* %a
>
> Modified: llvm/trunk/test/CodeGen/X86/store_op_load_fold2.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/store_op_load_fold2.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/store_op_load_fold2.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/store_op_load_fold2.ll Mon Mar 19 13:19:46 2018
> @@ -17,14 +17,14 @@ cond_true2732.preheader:
> store i64 %tmp2676.us.us, i64* %tmp2666
> ret i32 0
>
> -; INTEL: and {{e..}}, dword ptr [360]
> -; INTEL: and dword ptr [356], {{e..}}
> -; FIXME: mov dword ptr [360], {{e..}}
> +; INTEL: and {{e..}}, dword ptr [356]
> +; INTEL: and dword ptr [360], {{e..}}
> +; FIXME: mov dword ptr [356], {{e..}}
> ; The above line comes out as 'mov 360, eax', but when the register is ecx it works?
>
> -; ATT: andl 360, %{{e..}}
> -; ATT: andl %{{e..}}, 356
> -; ATT: movl %{{e..}}, 360
> +; ATT: andl 356, %{{e..}}
> +; ATT: andl %{{e..}}, 360
> +; ATT: movl %{{e..}}, 356
>
> }
>
>
> Modified: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll Mon Mar 19 13:19:46 2018
> @@ -751,72 +751,64 @@ define <8 x i32> @test_broadcast_4i32_8i
> ; X32-AVX: # %bb.0:
> ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
> ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
> -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-AVX-NEXT: retl
> ;
> ; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
> ; X32-AVX512F: # %bb.0:
> ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
> ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
> -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-AVX512F-NEXT: retl
> ;
> ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
> ; X32-AVX512BW: # %bb.0:
> ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
> ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
> -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-AVX512BW-NEXT: retl
> ;
> ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
> ; X32-AVX512DQ: # %bb.0:
> ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
> ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
> -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-AVX512DQ-NEXT: retl
> ;
> ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
> ; X64-AVX: # %bb.0:
> -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
> -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX-NEXT: retq
> ;
> ; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
> ; X64-AVX512F: # %bb.0:
> -; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
> -; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512F-NEXT: retq
> ;
> ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
> ; X64-AVX512BW: # %bb.0:
> -; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
> -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512BW-NEXT: retq
> ;
> ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
> ; X64-AVX512DQ: # %bb.0:
> -; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
> -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX512DQ-NEXT: retq
> %1 = load <4 x i32>, <4 x i32>* %p0
> store <4 x float> zeroinitializer, <4 x float>* %p1
> @@ -829,10 +821,9 @@ define <16 x i32> @test_broadcast_4i32_1
> ; X32-AVX: # %bb.0:
> ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
> ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
> -; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
> ; X32-AVX-NEXT: retl
> ;
> @@ -840,63 +831,56 @@ define <16 x i32> @test_broadcast_4i32_1
> ; X32-AVX512F: # %bb.0:
> ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
> ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
> -; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512F-NEXT: retl
> ;
> ; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
> ; X32-AVX512BW: # %bb.0:
> ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
> ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
> -; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512BW-NEXT: retl
> ;
> ; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
> ; X32-AVX512DQ: # %bb.0:
> ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
> ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
> -; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
> ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
> -; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X32-AVX512DQ-NEXT: retl
> ;
> ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
> ; X64-AVX: # %bb.0:
> -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
> ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
> ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
> -; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
> ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
> ; X64-AVX-NEXT: retq
> ;
> ; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
> ; X64-AVX512F: # %bb.0:
> -; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
> ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
> -; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512F-NEXT: retq
> ;
> ; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
> ; X64-AVX512BW: # %bb.0:
> -; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
> ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
> +; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
> -; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512BW-NEXT: retq
> ;
> ; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
> ; X64-AVX512DQ: # %bb.0:
> -; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
> ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
> +; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
> -; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
> ; X64-AVX512DQ-NEXT: retq
> %1 = load <4 x i32>, <4 x i32>* %p0
> store <4 x float> zeroinitializer, <4 x float>* %p1
>
> Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll?rev=327898&r1=327897&r2=327898&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-256.ll Mon Mar 19 13:19:46 2018
> @@ -47,8 +47,7 @@ define <4 x double> @var_shuffle_v4f64_v
> ; ALL-NEXT: andl $3, %edx
> ; ALL-NEXT: andl $3, %esi
> ; ALL-NEXT: vmovaps %ymm0, (%rsp)
> -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
> -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
> +; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
> ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
> ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
> ; ALL-NEXT: movq %rbp, %rsp
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list