PATCH: R600/SI: Bug fixes for mi-sched
Matt Arsenault
Matthew.Arsenault at amd.com
Wed Nov 19 17:27:07 PST 2014
On 11/19/2014 12:05 PM, Tom Stellard wrote:
> Even though they weren't being clustered, ds_read2 was still matching,
> because the load instructions happened to be next to each other by luck.
It would be nice to come up with a test where this happens now
>>> > >0003-R600-SI-Add-SIFoldOperands-pass.patch
>>> > >
>>> > >
>>> > > From 41409a4ba20a0a74acc58ae07a5130144bff8c39 Mon Sep 17 00:00:00 2001
>>> > >From: Tom Stellard<thomas.stellard at amd.com>
>>> > >Date: Tue, 18 Nov 2014 11:45:28 -0500
>>> > >Subject: [PATCH 3/6] R600/SI: Add SIFoldOperands pass
>>> > >
>>> > >This pass attempts to fold the source operands of mov and copy
>>> > >instructions into their uses.
>> >
>> >Is this supposed to replace what's left of
>> >SITargetLowering::legalizeOperands? This doesn't delete it so this
>> >probably isn't going to hit many cases. I'm also slightly worried
>> >this might break my f64 inline immediate patches
> I was planning to remove legalizeOperands in a separate patch. It does
> seem to hit a fair amount of cases even with legalizeOperands still there.
>
>>> > >---
>>> > > lib/Target/R600/AMDGPU.h | 4 +
>>> > > lib/Target/R600/AMDGPUTargetMachine.cpp | 2 +
>>> > > lib/Target/R600/CMakeLists.txt | 1 +
>>> > > lib/Target/R600/SIFoldOperands.cpp | 200 ++++++++++++++++++++++++++++++++
>>> > > test/CodeGen/R600/extload.ll | 21 ++--
>>> > > test/CodeGen/R600/local-atomics.ll | 24 ++--
>>> > > test/CodeGen/R600/operand-folding.ll | 40 +++++++
>>> > > 7 files changed, 264 insertions(+), 28 deletions(-)
>>> > > create mode 100644 lib/Target/R600/SIFoldOperands.cpp
>>> > > create mode 100644 test/CodeGen/R600/operand-folding.ll
>>> > >
>>> > >diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
>>> > >index 261075e..13379e7 100644
>>> > >--- a/lib/Target/R600/AMDGPU.h
>>> > >+++ b/lib/Target/R600/AMDGPU.h
>>> > >@@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
>>> > > // SI Passes
>>> > > FunctionPass *createSITypeRewriter();
>>> > > FunctionPass *createSIAnnotateControlFlowPass();
>>> > >+FunctionPass *createSIFoldOperandsPass();
>>> > > FunctionPass *createSILowerI1CopiesPass();
>>> > > FunctionPass *createSIShrinkInstructionsPass();
>>> > > FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
>>> > >@@ -47,6 +48,9 @@ FunctionPass *createSIFixSGPRLiveRangesPass();
>>> > > FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
>>> > > FunctionPass *createSIInsertWaits(TargetMachine &tm);
>>> > >+void initializeSIFoldOperandsPass(PassRegistry &);
>>> > >+extern char &SIFoldOperandsID;
>>> > >+
>>> > > void initializeSILowerI1CopiesPass(PassRegistry &);
>>> > > extern char &SILowerI1CopiesID;
>>> > >diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >index b2cd988..80142f0 100644
>>> > >--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
>>> > >@@ -159,6 +159,8 @@ bool AMDGPUPassConfig::addInstSelector() {
>>> > > addPass(createSIFixSGPRCopiesPass(*TM));
>>> > > }
>>> > >+ addPass(createSILowerI1CopiesPass());
>>> > >+ addPass(createSIFoldOperandsPass());
>>> > > return false;
>>> > > }
>>> > >diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
>>> > >index ed0a216..3b703e7 100644
>>> > >--- a/lib/Target/R600/CMakeLists.txt
>>> > >+++ b/lib/Target/R600/CMakeLists.txt
>>> > >@@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
>>> > > SIAnnotateControlFlow.cpp
>>> > > SIFixSGPRCopies.cpp
>>> > > SIFixSGPRLiveRanges.cpp
>>> > >+ SIFoldOperands.cpp
>>> > > SIInsertWaits.cpp
>>> > > SIInstrInfo.cpp
>>> > > SIISelLowering.cpp
>>> > >diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
>>> > >new file mode 100644
>>> > >index 0000000..1b0ba24
>>> > >--- /dev/null
>>> > >+++ b/lib/Target/R600/SIFoldOperands.cpp
>>> > >@@ -0,0 +1,200 @@
>>> > >+//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
>>> > >+//
>>> > >+// The LLVM Compiler Infrastructure
>>> > >+//
>>> > >+// This file is distributed under the University of Illinois Open Source
>>> > >+// License. See LICENSE.TXT for details.
>>> > >+//
>>> > >+/// \file
>>> > >+//===----------------------------------------------------------------------===//
>>> > >+//
>>> > >+
>>> > >+#include "AMDGPU.h"
>>> > >+#include "AMDGPUSubtarget.h"
>>> > >+#include "SIInstrInfo.h"
>>> > >+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
>>> > >+#include "llvm/CodeGen/MachineDominators.h"
>>> > >+#include "llvm/CodeGen/MachineFunctionPass.h"
>>> > >+#include "llvm/CodeGen/MachineInstrBuilder.h"
>>> > >+#include "llvm/CodeGen/MachineRegisterInfo.h"
>>> > >+#include "llvm/IR/LLVMContext.h"
>>> > >+#include "llvm/IR/Function.h"
>>> > >+#include "llvm/Support/Debug.h"
>>> > >+#include "llvm/Target/TargetMachine.h"
>>> > >+
>>> > >+#define DEBUG_TYPE "si-fold-operands"
>>> > >+using namespace llvm;
>>> > >+
>>> > >+namespace {
>>> > >+
>>> > >+class SIFoldOperands : public MachineFunctionPass {
>>> > >+public:
>>> > >+ static char ID;
>>> > >+
>>> > >+public:
>>> > >+ SIFoldOperands() : MachineFunctionPass(ID) {
>>> > >+ initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
>>> > >+ }
>>> > >+
>>> > >+ bool runOnMachineFunction(MachineFunction &MF) override;
>>> > >+
>>> > >+ const char *getPassName() const override {
>>> > >+ return "SI Fold Operands";
>>> > >+ }
>>> > >+
>>> > >+ void getAnalysisUsage(AnalysisUsage &AU) const override {
>>> > >+ AU.addRequired<MachineDominatorTree>();
>>> > >+ AU.setPreservesCFG();
>>> > >+ MachineFunctionPass::getAnalysisUsage(AU);
>>> > >+ }
>>> > >+};
>>> > >+
>>> > >+} // End anonymous namespace.
>>> > >+
>>> > >+INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
>>> > >+ "SI Fold Operands", false, false)
>>> > >+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
>>> > >+INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
>>> > >+ "SI Fold Operands", false, false)
>>> > >+
>>> > >+char SIFoldOperands::ID = 0;
>>> > >+
>>> > >+char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
>>> > >+
>>> > >+FunctionPass *llvm::createSIFoldOperandsPass() {
>>> > >+ return new SIFoldOperands();
>>> > >+}
>>> > >+
>>> > >+static bool isSafeToFold(unsigned Opcode) {
>>> > >+ switch(Opcode) {
>>> > >+ case AMDGPU::V_MOV_B32_e32:
>>> > >+ case AMDGPU::V_MOV_B32_e64:
>>> > >+ case AMDGPU::S_MOV_B32:
>>> > >+ case AMDGPU::S_MOV_B64:
>>> > >+ case AMDGPU::COPY:
>>> > >+ return true;
>>> > >+ default:
>>> > >+ return false;
>>> > >+ }
>>> > >+}
>>> > >+
>>> > >+static bool updateOperand(MachineInstr *MI, unsigned OpNo,
>>> > >+ const MachineOperand &New,
>>> > >+ const TargetRegisterInfo &TRI) {
>>> > >+ MachineOperand &Old = MI->getOperand(OpNo);
>>> > >+ assert(Old.isReg());
>>> > >+
>>> > >+ if (New.isImm()) {
>>> > >+ Old.ChangeToImmediate(New.getImm());
>>> > >+ return true;
>>> > >+ }
>>> > >+
>>> > >+ if (New.isFPImm()) {
>>> > >+ Old.ChangeToFPImmediate(New.getFPImm());
>>> > >+ return true;
>>> > >+ }
>> >I've been considering replacing all fp immediates with the bitcasted
>> >integers, since handling fp immediates everywhere is a pain and
>> >there isn't much point to distinguishing them. Where do you think
>> >the best place to do this would be?
>> >
> Would it work to convert them in AMDGPUDAGToDAGISel::Select() ?
I think so
>
>>> > >+
>>> > >+ if (New.isReg()) {
>>> > >+ if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
>>> > >+ TargetRegisterInfo::isVirtualRegister(New.getReg())) {
>>> > >+ Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
>>> > >+ return true;
>>> > >+ }
>>> > >+ }
>>> > >+
>>> > >+ // FIXME: Handle physical registers.
>> >
>> >What cases that can be folded will ever have physical registers at
>> >this point?
> If this pass was run after register allocation it would have to deal with
> physical registers.
I don't expect a need to run this again after register allocation, but maybe
>
>> >
>>> > >+
>>> > >+ return false;
>>> > >+}
>>> > >+
>>> > >+bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>>> > >+ MachineRegisterInfo &MRI = MF.getRegInfo();
>>> > >+ const SIInstrInfo *TII =
>>> > >+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
>>> > >+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
>>> > >+
>>> > >+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
>>> > >+ BI != BE; ++BI) {
>>> > >+
>>> > >+ MachineBasicBlock &MBB = *BI;
>>> > >+ MachineBasicBlock::iterator I, Next;
>>> > >+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
>>> > >+ Next = std::next(I);
>>> > >+ MachineInstr &MI = *I;
>>> > >+
>>> > >+ if (!isSafeToFold(MI.getOpcode()))
>>> > >+ continue;
>>> > >+
>>> > >+ const MachineOperand &OpToFold = MI.getOperand(1);
>>> > >+
>>> > >+ // FIXME: Fold operands with subregs.
>>> > >+ if (OpToFold.isReg() &&
>>> > >+ (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
>>> > >+ OpToFold.getSubReg()))
>>> > >+ continue;
>>> > >+
>>> > >+ std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
>>> > >+ for (MachineRegisterInfo::use_iterator
>>> > >+ Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
>>> > >+ Use != E; ++Use) {
>> >I think you can use a range for here with MRI.use_instructions(Op.getReg())
> I'm using the iterator, because it has a reference to the operand number.
> I can't get that using a range.
>
> I've attached updated patches. Let me know what you think.
>
> -Tom
>
> 0001-R600-SI-Add-SIFoldOperands-pass.patch
>
>
> From 70f32795a67223e2618fbd37f959138d85c949ca Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 18 Nov 2014 11:45:28 -0500
> Subject: [PATCH 1/4] R600/SI: Add SIFoldOperands pass
>
> This pass attempts to fold the source operands of mov and copy
> instructions into their uses.
> ---
> lib/Target/R600/AMDGPU.h | 4 +
> lib/Target/R600/AMDGPUTargetMachine.cpp | 2 +
> lib/Target/R600/CMakeLists.txt | 1 +
> lib/Target/R600/SIFoldOperands.cpp | 202 ++++++++++++++++++++++++++++++++
> test/CodeGen/R600/extload.ll | 21 ++--
> test/CodeGen/R600/local-atomics.ll | 24 ++--
> test/CodeGen/R600/operand-folding.ll | 40 +++++++
> 7 files changed, 266 insertions(+), 28 deletions(-)
> create mode 100644 lib/Target/R600/SIFoldOperands.cpp
> create mode 100644 test/CodeGen/R600/operand-folding.ll
LGTM
> 0002-R600-SI-Mark-s_mov_b32-and-s_mov_b64-as-rematerializ.patch
>
>
> From fa2551ed3093c2dffab3685ee7e1d2fde7974f30 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 14 Nov 2014 18:10:26 -0500
> Subject: [PATCH 2/4] R600/SI: Mark s_mov_b32 and s_mov_b64 as rematerializable
>
> ---
> lib/Target/R600/SIInstructions.td | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 90da7a9..bd91577 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -96,8 +96,10 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
> //===----------------------------------------------------------------------===//
>
> let isMoveImm = 1 in {
> +let isReMaterializable = 1 in {
> def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
> def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
> +} // let isRematerializeable = 1
> def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
> def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
> } // End isMoveImm = 1
> -- 1.8.5.5
LGTM
>
> 0003-R600-SI-Emit-s_mov_b32-m0-1-before-every-DS-instruct.patch
>
>
> From a7b6ce3cb14627e644d0dad4ca5e2d3c6f8cfe78 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:10:22 -0400
> Subject: [PATCH 3/4] R600/SI: Emit s_mov_b32 m0, -1 before every DS
> instruction
>
> This s_mov_b32 will write to a virtual register from the M0Reg
> class and all the ds instructions now take an extra M0Reg explicit
> argument.
>
> This change is necessary to prevent issues with the scheduler
> mixing together instructions that expect different values in the m0
> registers.
> ---
> lib/Target/R600/SIISelLowering.cpp | 2 +-
> lib/Target/R600/SIInstrFormats.td | 1 +
> lib/Target/R600/SIInstrInfo.td | 17 +++++++++--------
> lib/Target/R600/SIInstructions.td | 15 ++++++++-------
> lib/Target/R600/SILoadStoreOptimizer.cpp | 10 +++++++++-
> lib/Target/R600/SILowerControlFlow.cpp | 23 -----------------------
> test/CodeGen/R600/shl_add_ptr.ll | 3 ++-
> 7 files changed, 30 insertions(+), 41 deletions(-)
LGTM
>
> 0004-R600-SI-Add-an-s_mov_b32-to-patterns-which-use-the-M.patch
>
>
> From 1782fd5e32dcfefbd5c4949cfdbca1d6f5581e41 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:15:24 -0400
> Subject: [PATCH 4/4] R600/SI: Add an s_mov_b32 to patterns which use the
> M0RegClass
>
> We need to use a s_mov_b32 rather than a copy, so that CSE will
> eliminate redundant moves to the m0 register.
> ---
> lib/Target/R600/SIInstrInfo.cpp | 20 --------------------
> lib/Target/R600/SIInstructions.td | 12 ++++++++----
> 2 files changed, 8 insertions(+), 24 deletions(-)
LGTM
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141119/ee5ebcc3/attachment.html>
More information about the llvm-commits
mailing list