PATCH: R600/SI: Bug fixes for mi-sched

Wed Nov 19 17:27:07 PST 2014

On 11/19/2014 12:05 PM, Tom Stellard wrote:
> Even though they weren't being clustered, ds_read2 was still matching, 
> because the load instructions happened to be next to each other by luck.
It would be nice to come up with a test where this happens now

>>> > >0003-R600-SI-Add-SIFoldOperands-pass.patch
>>> > >
>>> > >
>>> > > From 41409a4ba20a0a74acc58ae07a5130144bff8c39 Mon Sep 17 00:00:00 2001
>>> > >From: Tom Stellard<thomas.stellard at amd.com>
>>> > >Date: Tue, 18 Nov 2014 11:45:28 -0500
>>> > >Subject: [PATCH 3/6] R600/SI: Add SIFoldOperands pass
>>> > >
>>> > >This pass attempts to fold the source operands of mov and copy
>>> > >instructions into their uses.
>> >
>> >Is this supposed to replace what's left of
>> >SITargetLowering::legalizeOperands? This doesn't delete it so this
>> >probably isn't going to hit many cases. I'm also slightly worried
>> >this might break my f64 inline immediate patches
> I was planning to remove legalizeOperands in a separate patch.  It does
> seem to hit a fair amount of cases even with legalizeOperands still there.
>
>>> > >---
>>> > >  lib/Target/R600/AMDGPU.h                |   4 +
>>> > >  lib/Target/R600/AMDGPUTargetMachine.cpp |   2 +
>>> > >  lib/Target/R600/CMakeLists.txt          |   1 +
>>> > >  lib/Target/R600/SIFoldOperands.cpp      | 200 ++++++++++++++++++++++++++++++++
>>> > >  test/CodeGen/R600/extload.ll            |  21 ++--
>>> > >  test/CodeGen/R600/local-atomics.ll      |  24 ++--
>>> > >  test/CodeGen/R600/operand-folding.ll    |  40 +++++++
>>> > >  7 files changed, 264 insertions(+), 28 deletions(-)
>>> > >  create mode 100644 lib/Target/R600/SIFoldOperands.cpp
>>> > >  create mode 100644 test/CodeGen/R600/operand-folding.ll
>>> > >
>>> > >diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
>>> > >index 261075e..13379e7 100644
>>> > >--- a/lib/Target/R600/AMDGPU.h
>>> > >+++ b/lib/Target/R600/AMDGPU.h
>>> > >@@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
>>> > >  // SI Passes
>>> > >  FunctionPass *createSITypeRewriter();
>>> > >  FunctionPass *createSIAnnotateControlFlowPass();
>>> > >+FunctionPass *createSIFoldOperandsPass();
>>> > >  FunctionPass *createSILowerI1CopiesPass();
>>> > >  FunctionPass *createSIShrinkInstructionsPass();
>>> > >  FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
>>> > >@@ -47,6 +48,9 @@ FunctionPass *createSIFixSGPRLiveRangesPass();
>>> > >  FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
>>> > >  FunctionPass *createSIInsertWaits(TargetMachine &tm);
>>> > >+void initializeSIFoldOperandsPass(PassRegistry &);
>>> > >+extern char &SIFoldOperandsID;
>>> > >+
>>> > >  void initializeSILowerI1CopiesPass(PassRegistry &);
>>> > >  extern char &SILowerI1CopiesID;
>>> > >diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >index b2cd988..80142f0 100644
>>> > >--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >@@ -159,6 +159,8 @@ bool AMDGPUPassConfig::addInstSelector() {
>>> > >      addPass(createSIFixSGPRCopiesPass(*TM));
>>> > >    }
>>> > >+  addPass(createSILowerI1CopiesPass());
>>> > >+  addPass(createSIFoldOperandsPass());
>>> > >    return false;
>>> > >  }
>>> > >diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
>>> > >index ed0a216..3b703e7 100644
>>> > >--- a/lib/Target/R600/CMakeLists.txt
>>> > >+++ b/lib/Target/R600/CMakeLists.txt
>>> > >@@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
>>> > >    SIAnnotateControlFlow.cpp
>>> > >    SIFixSGPRCopies.cpp
>>> > >    SIFixSGPRLiveRanges.cpp
>>> > >+  SIFoldOperands.cpp
>>> > >    SIInsertWaits.cpp
>>> > >    SIInstrInfo.cpp
>>> > >    SIISelLowering.cpp
>>> > >diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
>>> > >new file mode 100644
>>> > >index 0000000..1b0ba24
>>> > >--- /dev/null
>>> > >+++ b/lib/Target/R600/SIFoldOperands.cpp
>>> > >@@ -0,0 +1,200 @@
>>> > >+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
>>> > >+//
>>> > >+//                     The LLVM Compiler Infrastructure
>>> > >+//
>>> > >+// This file is distributed under the University of Illinois Open Source
>>> > >+// License. See LICENSE.TXT for details.
>>> > >+//
>>> > >+/// \file
>>> > >+//===----------------------------------------------------------------------===//
>>> > >+//
>>> > >+
>>> > >+#include "AMDGPU.h"
>>> > >+#include "AMDGPUSubtarget.h"
>>> > >+#include "SIInstrInfo.h"
>>> > >+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
>>> > >+#include "llvm/CodeGen/MachineDominators.h"
>>> > >+#include "llvm/CodeGen/MachineFunctionPass.h"
>>> > >+#include "llvm/CodeGen/MachineInstrBuilder.h"
>>> > >+#include "llvm/CodeGen/MachineRegisterInfo.h"
>>> > >+#include "llvm/IR/LLVMContext.h"
>>> > >+#include "llvm/IR/Function.h"
>>> > >+#include "llvm/Support/Debug.h"
>>> > >+#include "llvm/Target/TargetMachine.h"
>>> > >+
>>> > >+#define DEBUG_TYPE "si-fold-operands"
>>> > >+using namespace llvm;
>>> > >+
>>> > >+namespace {
>>> > >+
>>> > >+class SIFoldOperands : public MachineFunctionPass {
>>> > >+public:
>>> > >+  static char ID;
>>> > >+
>>> > >+public:
>>> > >+  SIFoldOperands() : MachineFunctionPass(ID) {
>>> > >+    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
>>> > >+  }
>>> > >+
>>> > >+  bool runOnMachineFunction(MachineFunction &MF) override;
>>> > >+
>>> > >+  const char *getPassName() const override {
>>> > >+    return "SI Fold Operands";
>>> > >+  }
>>> > >+
>>> > >+  void getAnalysisUsage(AnalysisUsage &AU) const override {
>>> > >+    AU.addRequired<MachineDominatorTree>();
>>> > >+    AU.setPreservesCFG();
>>> > >+    MachineFunctionPass::getAnalysisUsage(AU);
>>> > >+  }
>>> > >+};
>>> > >+
>>> > >+} // End anonymous namespace.
>>> > >+
>>> > >+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
>>> > >+                      "SI Fold Operands", false, false)
>>> > >+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
>>> > >+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
>>> > >+                    "SI Fold Operands", false, false)
>>> > >+
>>> > >+char SIFoldOperands::ID = 0;
>>> > >+
>>> > >+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
>>> > >+
>>> > >+FunctionPass *llvm::createSIFoldOperandsPass() {
>>> > >+  return new SIFoldOperands();
>>> > >+}
>>> > >+
>>> > >+static bool isSafeToFold(unsigned Opcode) {
>>> > >+  switch(Opcode) {
>>> > >+  case AMDGPU::V_MOV_B32_e32:
>>> > >+  case AMDGPU::V_MOV_B32_e64:
>>> > >+  case AMDGPU::S_MOV_B32:
>>> > >+  case AMDGPU::S_MOV_B64:
>>> > >+  case AMDGPU::COPY:
>>> > >+    return true;
>>> > >+  default:
>>> > >+    return false;
>>> > >+  }
>>> > >+}
>>> > >+
>>> > >+static bool updateOperand(MachineInstr *MI, unsigned OpNo,
>>> > >+                          const MachineOperand &New,
>>> > >+                          const TargetRegisterInfo &TRI) {
>>> > >+  MachineOperand &Old = MI->getOperand(OpNo);
>>> > >+  assert(Old.isReg());
>>> > >+
>>> > >+  if (New.isImm()) {
>>> > >+    Old.ChangeToImmediate(New.getImm());
>>> > >+    return true;
>>> > >+  }
>>> > >+
>>> > >+  if (New.isFPImm()) {
>>> > >+    Old.ChangeToFPImmediate(New.getFPImm());
>>> > >+    return true;
>>> > >+  }
>> >I've been considering replacing all fp immediates with the bitcasted
>> >integers, since handling fp immediates everywhere is a pain and
>> >there isn't much point to distinguishing them. Where do you think
>> >the best place to do this would be?
>> >
> Would it work to convert them in AMDGPUDAGToDAGISel::Select() ?
I think so

>
>>> > >+
>>> > >+  if (New.isReg())  {
>>> > >+    if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
>>> > >+        TargetRegisterInfo::isVirtualRegister(New.getReg())) {
>>> > >+      Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
>>> > >+      return true;
>>> > >+    }
>>> > >+  }
>>> > >+
>>> > >+  // FIXME: Handle physical registers.
>> >
>> >What cases that can be folded will ever have physical registers at
>> >this point?
> If this pass was run after register allocation it would have to deal with
> physical registers.
I don't expect a need to run this again after register allocation, but maybe

>
>> >
>>> > >+
>>> > >+  return false;
>>> > >+}
>>> > >+
>>> > >+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>>> > >+  MachineRegisterInfo &MRI = MF.getRegInfo();
>>> > >+  const SIInstrInfo *TII =
>>> > >+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
>>> > >+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
>>> > >+
>>> > >+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
>>> > >+                                                  BI != BE; ++BI) {
>>> > >+
>>> > >+    MachineBasicBlock &MBB = *BI;
>>> > >+    MachineBasicBlock::iterator I, Next;
>>> > >+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
>>> > >+      Next = std::next(I);
>>> > >+      MachineInstr &MI = *I;
>>> > >+
>>> > >+      if (!isSafeToFold(MI.getOpcode()))
>>> > >+        continue;
>>> > >+
>>> > >+      const MachineOperand &OpToFold = MI.getOperand(1);
>>> > >+
>>> > >+      // FIXME: Fold operands with subregs.
>>> > >+      if (OpToFold.isReg() &&
>>> > >+          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
>>> > >+           OpToFold.getSubReg()))
>>> > >+        continue;
>>> > >+
>>> > >+      std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
>>> > >+      for (MachineRegisterInfo::use_iterator
>>> > >+           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
>>> > >+           Use != E; ++Use) {
>> >I think you can use a range for here with MRI.use_instructions(Op.getReg())
> I'm using the iterator, because it has a reference to the operand number.
> I can't get that using a range.
>
> I've attached updated patches.  Let me know what you think.
>
> -Tom
>
> 0001-R600-SI-Add-SIFoldOperands-pass.patch
>
>
>  From 70f32795a67223e2618fbd37f959138d85c949ca Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 18 Nov 2014 11:45:28 -0500
> Subject: [PATCH 1/4] R600/SI: Add SIFoldOperands pass
>
> This pass attempts to fold the source operands of mov and copy
> instructions into their uses.
> ---
>   lib/Target/R600/AMDGPU.h                |   4 +
>   lib/Target/R600/AMDGPUTargetMachine.cpp |   2 +
>   lib/Target/R600/CMakeLists.txt          |   1 +
>   lib/Target/R600/SIFoldOperands.cpp      | 202 ++++++++++++++++++++++++++++++++
>   test/CodeGen/R600/extload.ll            |  21 ++--
>   test/CodeGen/R600/local-atomics.ll      |  24 ++--
>   test/CodeGen/R600/operand-folding.ll    |  40 +++++++
>   7 files changed, 266 insertions(+), 28 deletions(-)
>   create mode 100644 lib/Target/R600/SIFoldOperands.cpp
>   create mode 100644 test/CodeGen/R600/operand-folding.ll


LGTM
> 0002-R600-SI-Mark-s_mov_b32-and-s_mov_b64-as-rematerializ.patch
>
>
>  From fa2551ed3093c2dffab3685ee7e1d2fde7974f30 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 14 Nov 2014 18:10:26 -0500
> Subject: [PATCH 2/4] R600/SI: Mark s_mov_b32 and s_mov_b64 as rematerializable
>
> ---
>   lib/Target/R600/SIInstructions.td | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 90da7a9..bd91577 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -96,8 +96,10 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
>   //===----------------------------------------------------------------------===//
>   
>   let isMoveImm = 1 in {
> +let isReMaterializable = 1 in {
>   def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
>   def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
> +} // let isRematerializeable = 1
>   def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
>   def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
>   } // End isMoveImm = 1
> -- 1.8.5.5
LGTM

>
> 0003-R600-SI-Emit-s_mov_b32-m0-1-before-every-DS-instruct.patch
>
>
>  From a7b6ce3cb14627e644d0dad4ca5e2d3c6f8cfe78 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:10:22 -0400
> Subject: [PATCH 3/4] R600/SI: Emit s_mov_b32 m0, -1 before every DS
>   instruction
>
> This s_mov_b32 will write to a virtual register from the M0Reg
> class and all the ds instructions now take an extra M0Reg explicit
> argument.
>
> This change is necessary to prevent issues with the scheduler
> mixing together instructions that expect different values in the m0
> registers.
> ---
>   lib/Target/R600/SIISelLowering.cpp       |  2 +-
>   lib/Target/R600/SIInstrFormats.td        |  1 +
>   lib/Target/R600/SIInstrInfo.td           | 17 +++++++++--------
>   lib/Target/R600/SIInstructions.td        | 15 ++++++++-------
>   lib/Target/R600/SILoadStoreOptimizer.cpp | 10 +++++++++-
>   lib/Target/R600/SILowerControlFlow.cpp   | 23 -----------------------
>   test/CodeGen/R600/shl_add_ptr.ll         |  3 ++-
>   7 files changed, 30 insertions(+), 41 deletions(-)
LGTM

>
> 0004-R600-SI-Add-an-s_mov_b32-to-patterns-which-use-the-M.patch
>
>
>  From 1782fd5e32dcfefbd5c4949cfdbca1d6f5581e41 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:15:24 -0400
> Subject: [PATCH 4/4] R600/SI: Add an s_mov_b32 to patterns which use the
>   M0RegClass
>
> We need to use a s_mov_b32 rather than a copy, so that CSE will
> eliminate redundant moves to the m0 register.
> ---
>   lib/Target/R600/SIInstrInfo.cpp   | 20 --------------------
>   lib/Target/R600/SIInstructions.td | 12 ++++++++----
>   2 files changed, 8 insertions(+), 24 deletions(-)

LGTM


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141119/ee5ebcc3/attachment.html>