[llvm] 8f48ddd - [X86][AMX] Lower tile copy instruction.

via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 22 15:50:11 PST 2021


Author: Luo, Yuanke
Date: 2021-02-23T07:49:42+08:00
New Revision: 8f48ddd1935831979e1d7f37e47db532534b37c4

URL: https://github.com/llvm/llvm-project/commit/8f48ddd1935831979e1d7f37e47db532534b37c4
DIFF: https://github.com/llvm/llvm-project/commit/8f48ddd1935831979e1d7f37e47db532534b37c4.diff

LOG: [X86][AMX] Lower tile copy instruction.

Since there is no tile copy instruction, we need to store the tile
register to the stack and load it from the stack into another tile
register. We need an extra GR to hold the stride, and a stack slot
to hold the tile data. We run this pass after machine copy
propagation so that we do not miss any copy optimization, and
before prolog/epilog insertion so that we can still allocate the
stack slots.
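
For illustration, a sketch of the expansion (the <tile slot> and
<stride slot> offsets are placeholders for frame indices resolved
later by prolog/epilog insertion; the pass currently always picks
%rax for the stride):

  %tmm1 = COPY %tmm0
    =>
  movq %rax, <stride slot>(%rsp)            # save the GR picked for the stride
  movq $64, %rax                            # materialize the stride
  tilestored %tmm0, <tile slot>(%rsp,%rax)  # spill the source tile
  tileloadd <tile slot>(%rsp,%rax), %tmm1   # reload into the destination tile
  movq <stride slot>(%rsp), %rax            # restore the saved GR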

Differential Revision: https://reviews.llvm.org/D97112

Added: 
    llvm/lib/Target/X86/X86LowerTileCopy.cpp
    llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll

Modified: 
    llvm/lib/Target/X86/CMakeLists.txt
    llvm/lib/Target/X86/X86.h
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/lib/Target/X86/X86TargetMachine.cpp
    llvm/test/CodeGen/X86/O0-pipeline.ll
    llvm/test/CodeGen/X86/opt-pipeline.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 5a5002670296..c2796aaec2cc 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -32,6 +32,7 @@ set(sources
   X86CmovConversion.cpp
   X86DomainReassignment.cpp
   X86DiscriminateMemOps.cpp
+  X86LowerTileCopy.cpp
   X86LowerAMXType.cpp
   X86TileConfig.cpp
   X86PreTileConfig.cpp

diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index e17b9ba5500b..3f38e2510d6e 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -76,10 +76,15 @@ FunctionPass *createX86FlagsCopyLoweringPass();
 /// Return a pass that expands WinAlloca pseudo-instructions.
 FunctionPass *createX86WinAllocaExpander();
 
+/// Return a pass that configures the tile registers.
 FunctionPass *createX86TileConfigPass();
 
+/// Return a pass that inserts the pseudo tile config instruction.
 FunctionPass *createX86PreTileConfigPass();
 
+/// Return a pass that lowers the tile copy instruction.
+FunctionPass *createX86LowerTileCopyPass();
+
 /// Return a pass that inserts int3 at the end of the function if it ends with a
 /// CALL instruction. The pass does the same for each funclet as well. This
 /// ensures that the open interval of function start and end PCs contains all
@@ -169,6 +174,7 @@ void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
 void initializeX86PreTileConfigPass(PassRegistry &);
 void initializeX86TileConfigPass(PassRegistry &);
 void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &);
+void initializeX86LowerTileCopyPass(PassRegistry &);
 
 namespace X86AS {
 enum : unsigned {

diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
new file mode 100644
index 000000000000..03692d195768
--- /dev/null
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -0,0 +1,132 @@
+//===-- X86LowerTileCopy.cpp - Expand Tile Copy Instructions --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that lowers AMX tile copy instructions. Since
+// there is no tile copy instruction, we need to store the tile register to
+// the stack and load it from the stack into another tile register. We need
+// an extra GR to hold the stride, and a stack slot to hold the tile data.
+// We run this pass after machine copy propagation so that we do not miss
+// any copy optimization, and before prolog/epilog insertion so that we can
+// still allocate the stack slots.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-lower-tile-copy"
+
+namespace {
+
+class X86LowerTileCopy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86LowerTileCopy() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "X86 Lower Tile Copy"; }
+};
+
+} // namespace
+
+char X86LowerTileCopy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+                      false, false)
+INITIALIZE_PASS_END(X86LowerTileCopy, "lowertilecopy", "Tile Copy Lowering",
+                    false, false)
+
+void X86LowerTileCopy::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createX86LowerTileCopyPass() {
+  return new X86LowerTileCopy();
+}
+
+bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  const X86InstrInfo *TII = ST.getInstrInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII++;
+      if (!MI.isCopy())
+        continue;
+      MachineOperand &DstMO = MI.getOperand(0);
+      MachineOperand &SrcMO = MI.getOperand(1);
+      Register SrcReg = SrcMO.getReg();
+      Register DstReg = DstMO.getReg();
+      if (!X86::TILERegClass.contains(DstReg, SrcReg))
+        continue;
+
+      const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+      // Allocate a stack slot for the tile register.
+      unsigned Size = TRI->getSpillSize(X86::TILERegClass);
+      Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
+      int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+      // Allocate a stack slot for saving the stride register.
+      Size = TRI->getSpillSize(X86::GR64RegClass);
+      Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+      int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+      // TODO: Pick a killed register to avoid the save/reload below. It is
+      // difficult to compute live intervals at this stage.
+      Register GR64Cand = X86::RAX;
+
+      const DebugLoc &DL = MI.getDebugLoc();
+      // Save the GR picked for the stride: movq %rax, (%rsp)
+      BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
+      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
+          .addReg(GR64Cand);
+      // Materialize the stride: movq $64, %rax
+      BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+      // Spill the source tile: tilestored %tmm, (%rsp,%idx)
+      unsigned Opc = X86::TILESTORED;
+      MachineInstr *NewMI =
+          addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS)
+              .addReg(SrcReg, getKillRegState(SrcMO.isKill()));
+      MachineOperand &MO = NewMI->getOperand(2);
+      MO.setReg(GR64Cand);
+      MO.setIsKill(true);
+      // Reload into the destination tile: tileloadd (%rsp,%idx), %tmm
+      Opc = X86::TILELOADD;
+      NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
+                                TileSS);
+      // Restore the saved GR:
+      // movq (%rsp), %rax
+      addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
+                        StrideSS);
+      MI.eraseFromParent();
+      Changed = true;
+    }
+  }
+  return Changed;
+}

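As a usage sketch (assuming the pass's command-line name is the
"lowertilecopy" string registered by INITIALIZE_PASS above, and
input.ll is a placeholder for an AMX test such as the one added
below), the pass output can be inspected in isolation with:

  llc -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 \
      -stop-after=lowertilecopy input.ll -o -
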
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index a20e8153f314..261293c785a1 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -875,6 +875,12 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
   default:
     llvm_unreachable("Unexpected machine instruction on tile register!");
     break;
+  case X86::COPY: {
+    Register SrcReg = MI->getOperand(1).getReg();
+    ShapeT Shape = getTileShape(SrcReg, VRM, MRI);
+    VRM->assignVirt2Shape(VirtReg, Shape);
+    return Shape;
+  }
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:

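For context, a MIR-style sketch of the shape propagation through a
COPY added above (the virtual register numbers and operand spelling
here are hypothetical):

  %1:tile = PTILELOADDV %row, %col, ...   ; defines the tile shape
  %2:tile = COPY %1                       ; %2 inherits the shape of %1
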
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5b9747444072..acd3ae905972 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -73,6 +73,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeX86CallFrameOptimizationPass(PR);
   initializeX86CmovConverterPassPass(PR);
   initializeX86TileConfigPass(PR);
+  initializeX86LowerTileCopyPass(PR);
   initializeX86ExpandPseudoPass(PR);
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
@@ -508,6 +509,7 @@ void X86PassConfig::addMachineSSAOptimization() {
 }
 
 void X86PassConfig::addPostRegAlloc() {
+  addPass(createX86LowerTileCopyPass());
   addPass(createX86FloatingPointStackifierPass());
   // When -O0 is enabled, the Load Value Injection Hardening pass will fall back
   // to using the Speculative Execution Side Effect Suppression pass for

diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
new file mode 100644
index 000000000000..5fcfa2f295e7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+
+define dso_local void @test1(i8 *%buf) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $64, %eax
+; CHECK-NEXT:    movw $8, %r14w
+; CHECK-NEXT:    tileloadd (%rdi,%rax), %tmm3
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %loop.header.preheader
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl $32, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %loop.header
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
+; CHECK-NEXT:    # implicit-def: $rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpw $100, %bp
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %exit
+; CHECK-NEXT:    addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 64)
+  br i1 undef, label %loop.header, label %exit
+
+loop.header:
+  %ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
+  call void @foo()
+  br label %loop.body
+
+loop.body:
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
+  br label %loop.latch
+
+loop.latch:
+  %iv = add i16 %ivphi, 1
+  %c = icmp slt i16 %iv, 100
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+define dso_local void @test2(i8 *%buf) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %r14w
+; CHECK-NEXT:    tilezero %tmm3
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB1_3
+; CHECK-NEXT:  # %bb.1: # %loop.header.preheader
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl $32, %r15d
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_2: # %loop.header
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 2048(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tileloadd 2048(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm0
+; CHECK-NEXT:    tileloadd (%rbx,%r15), %tmm1
+; CHECK-NEXT:    # implicit-def: $rax
+; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movabsq $64, %rax
+; CHECK-NEXT:    tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rbx,%r15)
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpw $100, %bp
+; CHECK-NEXT:    jl .LBB1_2
+; CHECK-NEXT:  .LBB1_3: # %exit
+; CHECK-NEXT:    addq $4056, %rsp # imm = 0xFD8
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  br i1 undef, label %loop.header, label %exit
+
+loop.header:
+  %ivphi = phi i16 [0, %entry], [%iv, %loop.latch]
+  call void @foo()
+  br label %loop.body
+
+loop.body:
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %buf, i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %buf, i64 32, x86_amx %t4)
+  br label %loop.latch
+
+loop.latch:
+  %iv = add i16 %ivphi, 1
+  %c = icmp slt i16 %iv, 100
+  br i1 %c, label %loop.header, label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @foo()
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 40349b869b2c..7d2ee5f27142 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -45,6 +45,7 @@
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Fast Register Allocator
+; CHECK-NEXT:       X86 Lower Tile Copy
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       X86 FP Stackifier
 ; CHECK-NEXT:       Fixup Statepoint Caller Saved

diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 8c4112f5cd77..8ca32ff306ee 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -145,6 +145,7 @@
 ; CHECK-NEXT:       Stack Slot Coloring
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       Machine Loop Invariant Code Motion
+; CHECK-NEXT:       X86 Lower Tile Copy
 ; CHECK-NEXT:       Bundle Machine CFG Edges
 ; CHECK-NEXT:       X86 FP Stackifier
 ; CHECK-NEXT:       MachineDominator Tree Construction


        

