PATCH: R600/SI: Bug fixes for mi-sched

Matt Arsenault Matthew.Arsenault at amd.com
Tue Nov 18 17:49:04 PST 2014


On 11/18/2014 05:10 PM, Tom Stellard wrote:
> Hi Matt,
>
> Here is the latest set of patches for fixing bugs uncovered by enabling the
> MI Scheduler.
>
> -Tom
>
> 0001-R600-SI-Make-SIInstrInfo-isOperandLegal-more-strict.patch
>
>
>  From 14b5c10f1d0af13f7155765deb3a687e3a133cf1 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 18 Nov 2014 11:42:13 -0500
> Subject: [PATCH 1/6] R600/SI: Make SIInstrInfo::isOperandLegal() more strict
>
> A register operand that has a common sub-class with its instruction's
> defined register class is not always legal.  For example,
> SReg_32 and M0Reg both have a common sub-class, but we can't
> use an SReg_32 in instructions that expect a M0Reg.
>
> This prevents the llvm.SI.sendmsg.ll test from failing when the fold
> operand pass is added.
> ---

LGTM. Might want to change the example to use s0 instead of sgpr0 to
match the asm output.
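
For instance (just a sketch of the reworded comment, substituting the
asm spelling of the register):

  // v_mov_b32 s0 ; Operand defined as vsrc_32
  //              ; RI.getCommonSubClass(sgpr, vsrc_32) = sgpr ; LEGAL
  //
  // s_sendmsg 0, s0 ; Operand defined as m0reg
  //                 ; RI.getCommonSubClass(sgpr, m0) = M0 ; NOT LEGAL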

>   lib/Target/R600/SIInstrInfo.cpp | 11 ++++++++++-
>   test/CodeGen/R600/udivrem.ll    | 28 ++++++++++++++--------------
>   test/CodeGen/R600/usubo.ll      |  2 +-
>   3 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 694b63c..68d5872 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -1454,7 +1454,16 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
>     if (MO->isReg()) {
>       assert(DefinedRC);
>       const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg());
> -    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass));
> +
> +    // In order to be legal, the common sub-class must be equal to the
> +    // class of the current operand.  For example:
> +    //
> +    // v_mov_b32 sgpr0 ; Operand defined as vsrc_32
> +    //                 ; RI.getCommonSubClass(sgpr,vsrc_32) = sgpr ; LEGAL
> +    //
> +    // s_sendmsg 0, sgpr0 ; Operand defined as m0reg
> +    //                    ; RI.getCommonSubClass(sgpr,m0) = M0 ; NOT LEGAL
> +    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
>     }
>   
>   
> diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
> index 9112d03..f20705b 100644
> --- a/test/CodeGen/R600/udivrem.ll
> +++ b/test/CodeGen/R600/udivrem.ll
> @@ -32,8 +32,8 @@
>   ; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
>   ; SI: v_cndmask_b32_e64
>   ; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
> -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
> +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[RCP]], [[E]]
> +; SI-DAG: v_sub_i32_e32 [[RCP_S_E:v[0-9]+]], [[RCP]], [[E]]
>   ; SI: v_cndmask_b32_e64
>   ; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
>   ; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
> @@ -112,8 +112,8 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
>   ; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
> -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
> +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
> +; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
> @@ -135,8 +135,8 @@ define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
>   ; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
> -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
> +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
> +; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
> @@ -262,8 +262,8 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
>   ; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
> -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
> +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
> +; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
> @@ -285,8 +285,8 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
>   ; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
> -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
> +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
> +; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
> @@ -308,8 +308,8 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
>   ; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
> -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
> +; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
> +; SI-DAG: v_sub_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
> @@ -331,8 +331,8 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
>   ; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
> -; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
> -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
> +; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
> +; SI-DAG: v_sub_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
>   ; SI-DAG: v_cndmask_b32_e64
>   ; SI-DAG: v_mul_hi_u32 [[FOURTH_Quotient:v[0-9]+]]
>   ; SI-DAG: v_mul_lo_i32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
> diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
> index 4d40600..abc5bd2 100644
> --- a/test/CodeGen/R600/usubo.ll
> +++ b/test/CodeGen/R600/usubo.ll
> @@ -27,7 +27,7 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
>   }
>   
>   ; FUNC-LABEL: {{^}}v_usubo_i32:
> -; SI: v_subrev_i32_e32
> +; SI: v_sub_i32_e32
>   define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
>     %a = load i32 addrspace(1)* %aptr, align 4
>     %b = load i32 addrspace(1)* %bptr, align 4
> -- 1.8.5.5
>
> 0002-R600-SI-Make-sure-the-LoadStore-optimizer-emits-inst.patch
>
>
>  From 43a26609e9291d3c2b927e1b93bc7bffe4a4f6c7 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 18 Nov 2014 13:37:03 -0500
> Subject: [PATCH 2/6] R600/SI: Make sure the LoadStore optimizer emits
>   instructions with offset0 < offset1
>
> This will help reduce the number of changes needed to the test
> cases when the MI scheduler is enabled.
> ---
>   lib/Target/R600/SILoadStoreOptimizer.cpp | 6 ++++++
>   1 file changed, 6 insertions(+)
>
> diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
> index 4140196..006b869 100644
> --- a/lib/Target/R600/SILoadStoreOptimizer.cpp
> +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp
> @@ -255,6 +255,9 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
>       = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
>     unsigned DestReg = MRI->createVirtualRegister(SuperRC);
>   
> +  if (NewOffset0 > NewOffset1)
> +    std::swap(NewOffset0, NewOffset1);
I don't think this is correct. It will still write offset0 into the low
register and offset1 into the high register of the destination pair, so
just swapping the offsets will change which load each extracted result
comes from. I would expect the scheduler to order the offsets so that
this ends up happening anyway (I thought there was already a check to
prevent folding if offset1 > offset0).
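
If the offsets really can come in swapped, I'd expect the sub-register
indices used to extract the two results to need swapping too, so each
original load still reads from its own address. Roughly this, assuming
the SubRegIdx0/SubRegIdx1 values the pass already uses to split the
destination pair:

  if (NewOffset0 > NewOffset1) {
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1); // keep each result mapped to its load
  }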
> +
>     DebugLoc DL = I->getDebugLoc();
>     MachineInstrBuilder Read2
>       = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
> @@ -322,6 +325,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
>            (NewOffset0 != NewOffset1) &&
>            "Computed offset doesn't fit");
>   
> +  if (NewOffset0 > NewOffset1)
> +    std::swap(NewOffset0, NewOffset1);
> +
>     const MCInstrDesc &Write2Desc = TII->get(Opc);
>     DebugLoc DL = I->getDebugLoc();
>   
> -- 1.8.5.5
>
> 0003-R600-SI-Add-SIFoldOperands-pass.patch
>
>
>  From 41409a4ba20a0a74acc58ae07a5130144bff8c39 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Tue, 18 Nov 2014 11:45:28 -0500
> Subject: [PATCH 3/6] R600/SI: Add SIFoldOperands pass
>
> This pass attempts to fold the source operands of mov and copy
> instructions into their uses.

Is this supposed to replace what's left of
SITargetLowering::legalizeOperands? This doesn't delete it, so this
probably isn't going to hit many cases. I'm also slightly worried this
might break my f64 inline immediate patches.
> ---
>   lib/Target/R600/AMDGPU.h                |   4 +
>   lib/Target/R600/AMDGPUTargetMachine.cpp |   2 +
>   lib/Target/R600/CMakeLists.txt          |   1 +
>   lib/Target/R600/SIFoldOperands.cpp      | 200 ++++++++++++++++++++++++++++++++
>   test/CodeGen/R600/extload.ll            |  21 ++--
>   test/CodeGen/R600/local-atomics.ll      |  24 ++--
>   test/CodeGen/R600/operand-folding.ll    |  40 +++++++
>   7 files changed, 264 insertions(+), 28 deletions(-)
>   create mode 100644 lib/Target/R600/SIFoldOperands.cpp
>   create mode 100644 test/CodeGen/R600/operand-folding.ll
>
> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> index 261075e..13379e7 100644
> --- a/lib/Target/R600/AMDGPU.h
> +++ b/lib/Target/R600/AMDGPU.h
> @@ -38,6 +38,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
>   // SI Passes
>   FunctionPass *createSITypeRewriter();
>   FunctionPass *createSIAnnotateControlFlowPass();
> +FunctionPass *createSIFoldOperandsPass();
>   FunctionPass *createSILowerI1CopiesPass();
>   FunctionPass *createSIShrinkInstructionsPass();
>   FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
> @@ -47,6 +48,9 @@ FunctionPass *createSIFixSGPRLiveRangesPass();
>   FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
>   FunctionPass *createSIInsertWaits(TargetMachine &tm);
>   
> +void initializeSIFoldOperandsPass(PassRegistry &);
> +extern char &SIFoldOperandsID;
> +
>   void initializeSILowerI1CopiesPass(PassRegistry &);
>   extern char &SILowerI1CopiesID;
>   
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index b2cd988..80142f0 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -159,6 +159,8 @@ bool AMDGPUPassConfig::addInstSelector() {
>       addPass(createSIFixSGPRCopiesPass(*TM));
>     }
>   
> +  addPass(createSILowerI1CopiesPass());
> +  addPass(createSIFoldOperandsPass());
>     return false;
>   }
>   
> diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> index ed0a216..3b703e7 100644
> --- a/lib/Target/R600/CMakeLists.txt
> +++ b/lib/Target/R600/CMakeLists.txt
> @@ -43,6 +43,7 @@ add_llvm_target(R600CodeGen
>     SIAnnotateControlFlow.cpp
>     SIFixSGPRCopies.cpp
>     SIFixSGPRLiveRanges.cpp
> +  SIFoldOperands.cpp
>     SIInsertWaits.cpp
>     SIInstrInfo.cpp
>     SIISelLowering.cpp
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> new file mode 100644
> index 0000000..1b0ba24
> --- /dev/null
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -0,0 +1,200 @@
> +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +/// \file
> +//===----------------------------------------------------------------------===//
> +//
> +
> +#include "AMDGPU.h"
> +#include "AMDGPUSubtarget.h"
> +#include "SIInstrInfo.h"
> +#include "llvm/CodeGen/LiveIntervalAnalysis.h"
> +#include "llvm/CodeGen/MachineDominators.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> +#include "llvm/CodeGen/MachineRegisterInfo.h"
> +#include "llvm/IR/LLVMContext.h"
> +#include "llvm/IR/Function.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Target/TargetMachine.h"
> +
> +#define DEBUG_TYPE "si-fold-operands"
> +using namespace llvm;
> +
> +namespace {
> +
> +class SIFoldOperands : public MachineFunctionPass {
> +public:
> +  static char ID;
> +
> +public:
> +  SIFoldOperands() : MachineFunctionPass(ID) {
> +    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
> +  }
> +
> +  bool runOnMachineFunction(MachineFunction &MF) override;
> +
> +  const char *getPassName() const override {
> +    return "SI Fold Operands";
> +  }
> +
> +  void getAnalysisUsage(AnalysisUsage &AU) const override {
> +    AU.addRequired<MachineDominatorTree>();
> +    AU.setPreservesCFG();
> +    MachineFunctionPass::getAnalysisUsage(AU);
> +  }
> +};
> +
> +} // End anonymous namespace.
> +
> +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
> +                      "SI Fold Operands", false, false)
> +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
> +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE,
> +                    "SI Fold Operands", false, false)
> +
> +char SIFoldOperands::ID = 0;
> +
> +char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
> +
> +FunctionPass *llvm::createSIFoldOperandsPass() {
> +  return new SIFoldOperands();
> +}
> +
> +static bool isSafeToFold(unsigned Opcode) {
> +  switch(Opcode) {
> +  case AMDGPU::V_MOV_B32_e32:
> +  case AMDGPU::V_MOV_B32_e64:
> +  case AMDGPU::S_MOV_B32:
> +  case AMDGPU::S_MOV_B64:
> +  case AMDGPU::COPY:
> +    return true;
> +  default:
> +    return false;
> +  }
> +}
> +
> +static bool updateOperand(MachineInstr *MI, unsigned OpNo,
> +                          const MachineOperand &New,
> +                          const TargetRegisterInfo &TRI) {
> +  MachineOperand &Old = MI->getOperand(OpNo);
> +  assert(Old.isReg());
> +
> +  if (New.isImm()) {
> +    Old.ChangeToImmediate(New.getImm());
> +    return true;
> +  }
> +
> +  if (New.isFPImm()) {
> +    Old.ChangeToFPImmediate(New.getFPImm());
> +    return true;
> +  }
I've been considering replacing all fp immediates with the bitcasted
integers, since handling fp immediates everywhere is a pain and there
isn't much point in distinguishing them. Where do you think the best
place to do this would be?
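
Wherever it ends up, the canonicalization itself would just be
something like (sketch, using the Old/New operands from this function):

  // Replace an FP immediate with its raw integer bits.
  const APFloat &Val = New.getFPImm()->getValueAPF();
  Old.ChangeToImmediate(Val.bitcastToAPInt().getZExtValue());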

> +
> +  if (New.isReg())  {
> +    if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
> +        TargetRegisterInfo::isVirtualRegister(New.getReg())) {
> +      Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
> +      return true;
> +    }
> +  }
> +
> +  // FIXME: Handle physical registers.

What cases that can be folded will ever have physical registers at this 
point?

> +
> +  return false;
> +}
> +
> +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> +  MachineRegisterInfo &MRI = MF.getRegInfo();
> +  const SIInstrInfo *TII =
> +      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
> +  const SIRegisterInfo &TRI = TII->getRegisterInfo();
> +
> +  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
> +                                                  BI != BE; ++BI) {
> +
> +    MachineBasicBlock &MBB = *BI;
> +    MachineBasicBlock::iterator I, Next;
> +    for (I = MBB.begin(); I != MBB.end(); I = Next) {
> +      Next = std::next(I);
> +      MachineInstr &MI = *I;
> +
> +      if (!isSafeToFold(MI.getOpcode()))
> +        continue;
> +
> +      const MachineOperand &OpToFold = MI.getOperand(1);
> +
> +      // FIXME: Fold operands with subregs.
> +      if (OpToFold.isReg() &&
> +          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
> +           OpToFold.getSubReg()))
> +        continue;
> +
> +      std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
> +      for (MachineRegisterInfo::use_iterator
> +           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
> +           Use != E; ++Use) {
I think you can use a range-based for loop here with MRI.use_instructions(Op.getReg()).
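
Sketch of the range-based version; since the body also needs the
operand index, iterating over the use operands may be more convenient
(this assumes getOperandNo(&Use) recovers the index):

  for (MachineOperand &Use : MRI.use_operands(MI.getOperand(0).getReg())) {
    MachineInstr *UseMI = Use.getParent();
    unsigned OpNo = UseMI->getOperandNo(&Use);
    // ... rest of the folding logic, using UseMI and OpNo ...
  }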
> +
> +        MachineInstr *UseMI = Use->getParent();
> +
> +        // FIXME: Fold operands with subregs.
> +        if (UseMI->getOperand(Use.getOperandNo()).isReg() &&
> +            UseMI->getOperand(Use.getOperandNo()).getSubReg()) {
> +          continue;
> +        }
Should use an intermediate const MachineOperand & rather than calling
UseMI->getOperand(Use.getOperandNo()) twice.
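
i.e. something like:

  const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && UseOp.getSubReg())
    continue;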
> +
> +        // In order to fold immediates into copies, we need to change the
> +        // copy to a MOV.
> +        if ((OpToFold.isImm() || OpToFold.isFPImm()) &&
> +             UseMI->getOpcode() == AMDGPU::COPY) {
> +          const TargetRegisterClass *TRC =
> +              MRI.getRegClass(UseMI->getOperand(0).getReg());
> +          unsigned CopyOp;
> +
> +          if (TRI.hasVGPRs(TRC) && TRC->getSize() == 4) {
> +            CopyOp = AMDGPU::V_MOV_B32_e64;
> +          } else if (TRI.isSGPRClass(TRC) && TRC->getSize() == 4) {
> +            CopyOp = AMDGPU::S_MOV_B32;
> +          } else if (TRI.isSGPRClass(TRC) && TRC->getSize() == 8) {
> +            CopyOp = AMDGPU::S_MOV_B64;
> +          } else {
> +            continue;
> +          }
> +          UseMI->setDesc(TII->get(CopyOp));
I think it would be better to restructure this condition: an outer check
on the size, with the register class checks inside it. Then maybe do the
TII->get / setDesc directly there rather than going through the CopyOp
assignment.
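
Something along these lines (sketch only, same cases as the original):

  if (TRC->getSize() == 4) {
    if (TRI.hasVGPRs(TRC))
      UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e64));
    else if (TRI.isSGPRClass(TRC))
      UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
    else
      continue;
  } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
    UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
  } else {
    continue;
  }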

> +        }
> +
> +        const MCInstrDesc &UseDesc = UseMI->getDesc();
> +
> +        // Don't fold into target independent nodes.  Target independent opcodes
> +        // don't have defined register classes.
> +        if (UseDesc.isVariadic() ||
> +            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
> +          continue;
> +
> +        // Normal substitution
> +        if (TII->isOperandLegal(UseMI, Use.getOperandNo(),  &OpToFold)) {
Extra space after ,
> +          FoldList.push_back(std::make_pair(UseMI, Use.getOperandNo()));
> +          continue;
> +        }
> +
> +        // FIXME: We could commute the instruction to create more opportunites
> +        // for folding.  This will only be useful if we have 32-bit instructions.
> +
> +        // FIXME: We could try to change the instruction from 64-bit to 32-bit
> +        // to enable more folding opportunites.  The shrink operands pass
> +        // already does this.
> +      }
> +
> +      for (std::pair<MachineInstr *, unsigned> Fold : FoldList) {
> +        if (updateOperand(Fold.first, Fold.second, OpToFold, TRI)) {
> +          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
> +                Fold.second << " of " << *Fold.first << "\n");
Single quotes around '\n'
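
i.e. the line would become:

  DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
        Fold.second << " of " << *Fold.first << '\n');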
> +        }
> +      }
> +    }
> +  }
> +  return false;
> +}
> diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
> index 5bda8f8..10a307f 100644
> --- a/test/CodeGen/R600/extload.ll
> +++ b/test/CodeGen/R600/extload.ll
> @@ -87,10 +87,9 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
>   }
>   
>   ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
> -; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
> -; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
> -; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
> -; SI: buffer_store_dwordx2
> +; SI: buffer_load_ubyte v[[LO:[0-9]+]],
> +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
> +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
>   define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
>     %a = load i8 addrspace(1)* %in, align 8
>     %ext = zext i8 %a to i64
> @@ -99,10 +98,9 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
>   }
>   
>   ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
> -; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
> -; SI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
> -; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
> -; SI: buffer_store_dwordx2
> +; SI: buffer_load_ushort v[[LO:[0-9]+]],
> +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
> +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
>   define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
>     %a = load i16 addrspace(1)* %in, align 8
>     %ext = zext i16 %a to i64
> @@ -111,10 +109,9 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
>   }
>   
>   ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
> -; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
> -; SI-DAG: buffer_load_dword [[LOAD:v[0-9]+]],
> -; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
> -; SI: buffer_store_dwordx2
> +; SI: buffer_load_dword v[[LO:[0-9]+]],
> +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
> +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
>   define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
>     %a = load i32 addrspace(1)* %in, align 8
>     %ext = zext i32 %a to i64
> diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
> index 2ac811f..e9baa08 100644
> --- a/test/CodeGen/R600/local-atomics.ll
> +++ b/test/CodeGen/R600/local-atomics.ll
> @@ -69,8 +69,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
>   ; EG: LDS_ADD_RET *
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
>   ; SI: s_endpgm
>   define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
> @@ -81,8 +80,7 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
>   ; EG: LDS_ADD_RET *
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
>   ; SI: s_endpgm
>   define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
> @@ -129,8 +127,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
>   ; EG: LDS_SUB_RET *
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
>   ; SI: s_endpgm
>   define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
> @@ -141,8 +138,7 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
>   ; EG: LDS_SUB_RET *
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
>   ; SI: s_endpgm
>   define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
> @@ -361,8 +357,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
>   ; SI: s_endpgm
>   define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
> @@ -371,8 +366,7 @@ define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
>   ; SI: s_endpgm
>   define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
> @@ -411,8 +405,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
>   ; SI: s_endpgm
>   define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
> @@ -421,8 +414,7 @@ define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
> -; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
> -; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
> +; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
>   ; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
>   ; SI: s_endpgm
>   define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
> diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
> new file mode 100644
> index 0000000..05177b4
> --- /dev/null
> +++ b/test/CodeGen/R600/operand-folding.ll
> @@ -0,0 +1,40 @@
> +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
> +
> +; CHECK-LABEL: {{^}}fold_sgpr:
> +; CHECK: v_add_i32_e32 v{{[0-9]+}}, s
> +define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
> +entry:
> +  %tmp0 = icmp ne i32 %fold, 0
> +  br i1 %tmp0, label %if, label %endif
> +
> +if:
> +  %id = call i32 @llvm.r600.read.tidig.x()
> +  %offset = add i32 %fold, %id
> +  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 %offset
> +  store i32 0, i32 addrspace(1)* %tmp1
> +  br label %endif
> +
> +endif:
> +  ret void
> +}
> +
> +; CHECK-LABEL: {{^}}fold_imm:
> +; CHECK v_or_i32_e32 v{{[0-9]+}}, 5
> +define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
> +entry:
> +  %fold = add i32 3, 2
> +  %tmp0 = icmp ne i32 %cmp, 0
> +  br i1 %tmp0, label %if, label %endif
> +
> +if:
> +  %id = call i32 @llvm.r600.read.tidig.x()
> +  %val = or i32 %id, %fold
> +  store i32 %val, i32 addrspace(1)* %out
> +  br label %endif
> +
> +endif:
> +  ret void
> +}
> +
> +declare i32 @llvm.r600.read.tidig.x() #0
> +attributes #0 = { readnone }
> -- 1.8.5.5
>
> 0004-R600-SI-Mark-s_mov_b32-and-s_mov_b64-as-rematerializ.patch
>
>
>  From 664d550b79252f2caeeb8a58d702b325cfb866c0 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 14 Nov 2014 18:10:26 -0500
> Subject: [PATCH 4/6] R600/SI: Mark s_mov_b32 and s_mov_b64 as rematerializable
>
> ---
LGTM

>   lib/Target/R600/SIInstructions.td | 2 ++
>   1 file changed, 2 insertions(+)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 90da7a9..bd91577 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -96,8 +96,10 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
>   //===----------------------------------------------------------------------===//
>   
>   let isMoveImm = 1 in {
> +let isReMaterializable = 1 in {
>   def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
>   def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
> +} // let isRematerializeable = 1
>   def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
>   def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
>   } // End isMoveImm = 1
> -- 1.8.5.5
>
> 0005-R600-SI-Emit-s_mov_b32-m0-1-before-every-DS-instruct.patch
>
>
>  From ff94d3a2465d6e0ce50455974938006663529573 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:10:22 -0400
> Subject: [PATCH 5/6] R600/SI: Emit s_mov_b32 m0, -1 before every DS
>   instruction
>
> The s_mov_b32 will write to a virtual register from the M0Reg
> class and all the ds instructions now take an extra M0Reg explicit
> argument.
>
> This change is necessary to prevent issues with the scheduler
> mixing together instructions that expect different values in the m0
> registers.

LGTM

> ---
>   lib/Target/R600/SIISelLowering.cpp       |  2 +-
>   lib/Target/R600/SIInstrFormats.td        |  1 +
>   lib/Target/R600/SIInstrInfo.td           | 17 +++++++++--------
>   lib/Target/R600/SIInstructions.td        | 15 ++++++++-------
>   lib/Target/R600/SILoadStoreOptimizer.cpp | 10 +++++++++-
>   lib/Target/R600/SILowerControlFlow.cpp   | 23 -----------------------
>   test/CodeGen/R600/ds_read2.ll            |  8 ++++----
>   test/CodeGen/R600/ds_read2st64.ll        |  4 ++--
>   test/CodeGen/R600/shl_add_ptr.ll         |  3 ++-
>   9 files changed, 36 insertions(+), 47 deletions(-)
>
> diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
> index 8d4164a..fb45684 100644
> --- a/lib/Target/R600/SIISelLowering.cpp
> +++ b/lib/Target/R600/SIISelLowering.cpp
> @@ -1986,6 +1986,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
>     const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
>         getTargetMachine().getSubtargetImpl()->getInstrInfo());
>   
> +  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
>     TII->legalizeOperands(MI);
>   
>     if (TII->isMIMG(MI->getOpcode())) {
> @@ -2005,7 +2006,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
>   
>       unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
>       MI->setDesc(TII->get(NewOpcode));
> -    MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
>       MRI.setRegClass(VReg, RC);
>       return;
>     }
> diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
> index 10e0a3f..ee1a52b 100644
> --- a/lib/Target/R600/SIInstrFormats.td
> +++ b/lib/Target/R600/SIInstrFormats.td
> @@ -546,6 +546,7 @@ class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
>   
>     let LGKM_CNT = 1;
>     let UseNamedOperandTable = 1;
> +  let DisableEncoding = "$m0";
>   }
>   
>   class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 713e84e..392c272 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -948,7 +948,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
>   class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
>     op,
>     (outs regClass:$vdst),
> -  (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset, M0Reg:$m0),
>     asm#" $vdst, $addr"#"$offset"#" [M0]",
>     []> {
>     let data0 = 0;
> @@ -960,7 +960,8 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
>   class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
>     op,
>     (outs regClass:$vdst),
> -  (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1),
> +  (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
> +        M0Reg:$m0),
>     asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]",
>     []> {
>     let data0 = 0;
> @@ -973,7 +974,7 @@ class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
>   class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
>     op,
>     (outs),
> -  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset, M0Reg:$m0),
>     asm#" $addr, $data0"#"$offset"#" [M0]",
>     []> {
>     let data1 = 0;
> @@ -986,7 +987,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
>     op,
>     (outs),
>     (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1,
> -       ds_offset0:$offset0, ds_offset1:$offset1),
> +       ds_offset0:$offset0, ds_offset1:$offset1, M0Reg:$m0),
>     asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]",
>     []> {
>     let mayStore = 1;
> @@ -999,7 +1000,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
>   class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
>     op,
>     (outs rc:$vdst),
> -  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
>     asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>,
>     AtomicNoRet<noRetOp, 1> {
>   
> @@ -1014,7 +1015,7 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""
>   class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
>     op,
>     (outs rc:$vdst),
> -  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
>     asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]",
>     []>,
>     AtomicNoRet<noRetOp, 1> {
> @@ -1027,7 +1028,7 @@ class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""
>   class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
>     op,
>     (outs),
> -  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset, M0Reg:$m0),
>     asm#" $addr, $data0, $data1"#"$offset"#" [M0]",
>     []>,
>     AtomicNoRet<noRetOp, 0> {
> @@ -1039,7 +1040,7 @@ class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp =
>   class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
>     op,
>     (outs),
> -  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
> +  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset, M0Reg:$m0),
>     asm#" $addr, $data0"#"$offset"#" [M0]",
>     []>,
>     AtomicNoRet<noRetOp, 0> {
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index bd91577..e1eb955 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -2614,7 +2614,7 @@ def : ROTRPattern <V_ALIGNBIT_B32>;
>   
>   class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
>     (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
> -  (inst (i1 0), $ptr, (as_i16imm $offset))
> +  (inst (i1 0), $ptr, (as_i16imm $offset), (S_MOV_B32 -1))
>   >;
>   
>   def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
> @@ -2632,12 +2632,12 @@ def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
>   def : Pat <
>     (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
>                                                       i8:$offset1))),
> -  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1)
> +  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1, (S_MOV_B32 -1))
>   >;
>   
>   class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
>     (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
> -  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
> +  (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
>   >;
>   
>   def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
> @@ -2653,12 +2653,13 @@ def : Pat <
>     (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
>                                                               i8:$offset1)),
>     (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
> -                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1)
> +                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
> +                        (S_MOV_B32 -1))
>   >;
>   
>   class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
>     (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
> -  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
> +  (inst (i1 0), $ptr, $value, (as_i16imm $offset), (S_MOV_B32 -1))
>   >;
>   
>   // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
> @@ -2674,13 +2675,13 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
>   class DSAtomicIncRetPat<DS inst, ValueType vt,
>                           Instruction LoadImm, PatFrag frag> : Pat <
>     (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
> -  (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
> +  (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (S_MOV_B32 -1))
>   >;
>   
>   
>   class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
>     (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
> -  (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
> +  (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset), (S_MOV_B32 -1))
>   >;
>   
>   
> diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
> index 006b869..fc0703f 100644
> --- a/lib/Target/R600/SILoadStoreOptimizer.cpp
> +++ b/lib/Target/R600/SILoadStoreOptimizer.cpp
> @@ -222,6 +222,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
>     // Be careful, since the addresses could be subregisters themselves in weird
>     // cases, like vectors of pointers.
>     const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
> +  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
>   
>     unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
>     unsigned DestReg1
> @@ -265,6 +266,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
>       .addOperand(*AddrReg) // addr
>       .addImm(NewOffset0) // offset0
>       .addImm(NewOffset1) // offset1
> +    .addOperand(*M0Reg) // M0
>       .addMemOperand(*I->memoperands_begin())
>       .addMemOperand(*Paired->memoperands_begin());
>   
> @@ -283,6 +285,9 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
>     LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
>     LIS->shrinkToUses(&AddrRegLI);
>   
> +  LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
> +  LIS->shrinkToUses(&M0RegLI);
> +
>     LIS->getInterval(DestReg); // Create new LI
>   
>     DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
> @@ -298,6 +303,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
>     // Be sure to use .addOperand(), and not .addReg() with these. We want to be
>     // sure we preserve the subregister index and any register flags set on them.
>     const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
> +  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
>     const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
>     const MachineOperand *Data1
>       = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
> @@ -339,11 +345,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
>       .addOperand(*Data1) // data1
>       .addImm(NewOffset0) // offset0
>       .addImm(NewOffset1) // offset1
> +    .addOperand(*M0Reg)  // m0
>       .addMemOperand(*I->memoperands_begin())
>       .addMemOperand(*Paired->memoperands_begin());
>   
>     // XXX - How do we express subregisters here?
> -  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
> +  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
> +                          M0Reg->getReg()};
>   
>     LIS->RemoveMachineInstrFromMaps(I);
>     LIS->RemoveMachineInstrFromMaps(Paired);
> diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
> index 9702565..20e8cec 100644
> --- a/lib/Target/R600/SILowerControlFlow.cpp
> +++ b/lib/Target/R600/SILowerControlFlow.cpp
> @@ -88,7 +88,6 @@ private:
>     void Kill(MachineInstr &MI);
>     void Branch(MachineInstr &MI);
>   
> -  void InitM0ForLDS(MachineBasicBlock::iterator MI);
>     void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
>     void IndirectSrc(MachineInstr &MI);
>     void IndirectDst(MachineInstr &MI);
> @@ -325,14 +324,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
>     MI.eraseFromParent();
>   }
>   
> -/// The m0 register stores the maximum allowable address for LDS reads and
> -/// writes.  Its value must be at least the size in bytes of LDS allocated by
> -/// the shader.  For simplicity, we set it to the maximum possible value.
> -void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
> -    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),  TII->get(AMDGPU::S_MOV_B32),
> -            AMDGPU::M0).addImm(0xffffffff);
> -}
> -
>   void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
>   
>     MachineBasicBlock &MBB = *MI.getParent();
> @@ -391,12 +382,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
>               .addReg(Save);
>   
>     }
> -  // FIXME: Are there any values other than the LDS address clamp that need to
> -  // be stored in the m0 register and may be live for more than a few
> -  // instructions?  If so, we should save the m0 register at the beginning
> -  // of this function and restore it here.
> -  // FIXME: Add support for LDS direct loads.
> -  InitM0ForLDS(&MI);
>     MI.eraseFromParent();
>   }
>   
> @@ -465,7 +450,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>   
>         MachineInstr &MI = *I;
>         if (TII->isDS(MI.getOpcode())) {
> -        NeedM0 = true;
>           NeedWQM = true;
>         }
>   
> @@ -544,13 +528,6 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
>       }
>     }
>   
> -  if (NeedM0) {
> -    MachineBasicBlock &MBB = MF.front();
> -    // Initialize M0 to a value that won't cause LDS access to be discarded
> -    // due to offset clamping
> -    InitM0ForLDS(MBB.getFirstNonPHI());
> -  }
> -
>     if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
>       MachineBasicBlock &MBB = MF.front();
>       BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
> diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
> index 6e0c8be..45d7840 100644
> --- a/test/CodeGen/R600/ds_read2.ll
> +++ b/test/CodeGen/R600/ds_read2.ll
> @@ -46,8 +46,8 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
>   
>   ; SI-LABEL: @simple_read2_f32_too_far
>   ; SI-NOT ds_read2_b32
> -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
> -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
> +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
> +; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
Were these really reordered? I expected the scheduler would always
order accesses from low to high address. This should probably have an
end-of-line match on the one without an offset.
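
e.g. for the first check line, so it can't accidentally match the
offset form:

  ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}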

>   ; SI: s_endpgm
>   define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
>     %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
> @@ -348,8 +348,8 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
>   
>   ; SI-LABEL: @simple_read2_f64_too_far
>   ; SI-NOT ds_read2_b64
> -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
> -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
> +; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
> +; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
>   ; SI: s_endpgm
>   define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
>     %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
> diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
> index 3e98e59..57309ac 100644
> --- a/test/CodeGen/R600/ds_read2st64.ll
> +++ b/test/CodeGen/R600/ds_read2st64.ll
> @@ -158,8 +158,8 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
>   ; Alignment only
>   
>   ; SI-LABEL: @misaligned_read2st64_f64
> -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
> -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
> +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
> +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
>   ; SI: s_endpgm
>   define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
>     %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
> diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll
> index 047cf25..fdb3d39 100644
> --- a/test/CodeGen/R600/shl_add_ptr.ll
> +++ b/test/CodeGen/R600/shl_add_ptr.ll
> @@ -68,7 +68,8 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
>   ; pointer can be used with an offset into the second one.
>   
>   ; SI-LABEL: {{^}}load_shl_base_lds_2:
> -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
> +; SI: s_mov_b32 m0, -1
> +; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
>   ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0]
>   ; SI: s_endpgm
>   define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
> -- 1.8.5.5
>
> 0006-R600-SI-Add-an-s_mov_b32-to-patterns-which-use-the-M.patch
>
>
>  From 7eacc8b9248fd88901d884276ef9ce3b32f4b464 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 31 Oct 2014 13:15:24 -0400
> Subject: [PATCH 6/6] R600/SI: Add an s_mov_b32 to patterns which use the
>   M0RegClass
>
> Without an s_mov_b32 instruction in the pattern, the InstrEmitter will
> emit COPY instruction which CSE will ignore.  We were previously trying
> to CSE these copies in SIInstrInfo::copyPhysReg(), but since this was
> happening after register allocation, so if the scheduler group a
> whole bunch for s_mov_b32 m0, ... instruction together, then we would
> end up spilling SGPRs.
LGTM

The commit message sentence and the comment are run-ons that are kind
of hard to follow. Maybe something like: "We were previously trying to
CSE these copies in SIInstrInfo::copyPhysReg(), but that ran after
register allocation, so if the scheduler grouped a bunch of s_mov_b32
m0, ... instructions together we would end up spilling SGPRs."

> ---
>   lib/Target/R600/SIInstrInfo.cpp   | 20 --------------------
>   lib/Target/R600/SIInstructions.td | 12 ++++++++----
>   2 files changed, 8 insertions(+), 24 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 68d5872..ae80aec 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -326,26 +326,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
>     unsigned Opcode;
>     const int16_t *SubIndices;
>   
> -  if (AMDGPU::M0 == DestReg) {
> -    // Check if M0 isn't already set to this value
> -    for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
> -      I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
> -
> -      if (!I->definesRegister(AMDGPU::M0))
> -        continue;
> -
> -      unsigned Opc = I->getOpcode();
> -      if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
> -        break;
> -
> -      if (!I->readsRegister(SrcReg))
> -        break;
> -
> -      // The copy isn't necessary
> -      return;
> -    }
> -  }
> -
>     if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
>       assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
>       BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index e1eb955..00ce9bf 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -2499,17 +2499,21 @@ def : Pat <
>   /********** Interpolation Paterns **********/
>   /********** ===================== **********/
>   
> +// The value of $params is constant through out the entire kernel.
> +// We need to use S_MOV_B32 $params, because CSE ignores copies, so
> +// without it we end up with a lot of redundant moves.
> +
>   def : Pat <
>     (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
> -  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
> +  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>   >;
>   
>   def : Pat <
> -  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
> +  (int_SI_fs_interp imm:$attr_chan, imm:$attr, i32:$params, v2i32:$ij),
>     (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
> -                                    imm:$attr_chan, imm:$attr, i32:$params),
> +                                    imm:$attr_chan, imm:$attr, (S_MOV_B32 $params)),
>                      (EXTRACT_SUBREG $ij, sub1),
> -                   imm:$attr_chan, imm:$attr, $params)
> +                   imm:$attr_chan, imm:$attr, (S_MOV_B32 $params))
>   >;
>   
>   /********** ================== **********/
> -- 1.8.5.5
