PATCHES: R600/SI: Remove SelectionDAG operand folding

Matt Arsenault Matthew.Arsenault at amd.com
Fri Dec 19 08:14:57 PST 2014


On 12/18/2014 08:02 PM, Tom Stellard wrote:
> Hi,
>
> This series of patches removes the legacy operand folding that was done on
> the SelectionDAG.  The SIFoldOperands MachineInstr pass now provides the
> same functionality.
>
> -Tom
>
> 0001-R600-SI-Use-immediates-in-the-first-operand-in-fabs-.patch
>
>
>  From d6dc4e6bbf378ad86e925982d38297a2622b4ffa Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:34:11 -0500
> Subject: [PATCH 1/7] R600/SI: Use immediates in the first operand in fabs/fneg
>   patterns
>
> This is for the stand-alone patterns that are lowered using v_or_b32
> and v_xor_b32.  Putting the immediate in the first operand ensures
> that it will be folded into the instruction.

I looked at doing this before and wasn't sure it was the best idea. In 
general I think the folding of literals should be smarter and consider 
the uses the literal has. For example, in the test update, an unrolled 
vector xor with an immediate will now carry a copy of the 32-bit 
literal for every vector component. It would probably be smarter to 
move the immediate into a register in this case and re-use it for every 
operand, roughly as sketched below. It's 8 * N vs. 8 + 4 * (N - 1) bytes 
for the whole vector to save only 4 cycles, so the re-use version ends 
up being smaller. Maybe this should be an -Os vs. -O3 kind of option. 
This also applies to SGPR uses.
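
For reference, the two forms for an unrolled 4-component xor would look 
roughly like this (illustrative sketch only; register numbers are made up):

  ; literal folded into every use; each v_xor carries the 32-bit
  ; literal, so roughly 8 bytes per instruction
  v_xor_b32_e32 v0, 0x80000000, v0
  v_xor_b32_e32 v1, 0x80000000, v1
  v_xor_b32_e32 v2, 0x80000000, v2
  v_xor_b32_e32 v3, 0x80000000, v3

  ; literal materialized once and re-used; roughly one 8-byte mov plus
  ; 4-byte xors
  v_mov_b32_e32 v4, 0x80000000
  v_xor_b32_e32 v0, v4, v0
  v_xor_b32_e32 v1, v4, v1
  v_xor_b32_e32 v2, v4, v2
  v_xor_b32_e32 v3, v4, v3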
> ---
>   lib/Target/R600/SIInstructions.td  | 18 +++++++++---------
>   test/CodeGen/R600/fneg-fabs.f64.ll | 26 ++++++++++----------------
>   test/CodeGen/R600/fneg-fabs.ll     | 26 +++++++++-----------------
>   3 files changed, 28 insertions(+), 42 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 463287e..96f75f9 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -2491,7 +2491,7 @@ def : Pat <
>   // FIXME: Should use S_OR_B32
>   def : Pat <
>     (fneg (fabs f32:$src)),
> -  (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
> +  (V_OR_B32_e32 (V_MOV_B32_e32 0x80000000), $src) /* Set sign bit */
>   >;
>   
>   // FIXME: Should use S_OR_B32
> @@ -2500,19 +2500,19 @@ def : Pat <
>     (REG_SEQUENCE VReg_64,
>       (i32 (EXTRACT_SUBREG f64:$src, sub0)),
>       sub0,
> -    (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> -                  (V_MOV_B32_e32 0x80000000)), // Set sign bit.
> +    (V_OR_B32_e32 (V_MOV_B32_e32 0x80000000),  // Set sign bit.
> +                  (EXTRACT_SUBREG f64:$src, sub1)),
>       sub1)
>   >;
>   
>   def : Pat <
>     (fabs f32:$src),
> -  (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
> +  (V_AND_B32_e32 (V_MOV_B32_e32 0x7fffffff), $src)
>   >;
>   
>   def : Pat <
>     (fneg f32:$src),
> -  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
> +  (V_XOR_B32_e32 (V_MOV_B32_e32 0x80000000), $src)
>   >;
>   
>   def : Pat <
> @@ -2520,8 +2520,8 @@ def : Pat <
>     (REG_SEQUENCE VReg_64,
>       (i32 (EXTRACT_SUBREG f64:$src, sub0)),
>       sub0,
> -    (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> -                   (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
> +    (V_AND_B32_e32 (V_MOV_B32_e32 0x7fffffff), // Set sign bit.
> +                   (EXTRACT_SUBREG f64:$src, sub1)),
>        sub1)
>   >;
>   
> @@ -2530,8 +2530,8 @@ def : Pat <
>     (REG_SEQUENCE VReg_64,
>       (i32 (EXTRACT_SUBREG f64:$src, sub0)),
>       sub0,
> -    (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> -                   (V_MOV_B32_e32 0x80000000)),
> +    (V_XOR_B32_e32 (V_MOV_B32_e32 0x80000000),
> +                   (EXTRACT_SUBREG f64:$src, sub1)),
>       sub1)
>   >;
>   
> diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
> index 555f4cc..6584108 100644
> --- a/test/CodeGen/R600/fneg-fabs.f64.ll
> +++ b/test/CodeGen/R600/fneg-fabs.f64.ll
> @@ -4,8 +4,7 @@
>   ; into 2 modifiers, although theoretically that should work.
>   
>   ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
> -; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
> +; SI: v_and_b32_e32 v[[FABS:[0-9]+]], 0x7fffffff, {{v[0-9]+}}
>   ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
>   define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
>     %fabs = call double @llvm.fabs.f64(double %x)
> @@ -45,8 +44,7 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
>   }
>   
>   ; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
>     %bc = bitcast i64 %in to double
>     %fabs = call double @fabs(double %bc)
> @@ -58,8 +56,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
>   ; FUNC-LABEL: {{^}}fneg_fabs_f64:
>   ; SI: s_load_dwordx2
>   ; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
> +; SI: v_mov_b32_e32 [[HI_X_V:v[0-9]+]], s[[HI_X]]
> +; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], 0x80000000, [[HI_X_V]]
>   ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
>   ; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
>   define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
> @@ -70,10 +68,8 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
>   }
>   
>   ; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
>     %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
>     %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
> @@ -82,12 +78,10 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
>   }
>   
>   ; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
>     %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
>     %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
> diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
> index 3cc832f..12cc2a6 100644
> --- a/test/CodeGen/R600/fneg-fabs.ll
> +++ b/test/CodeGen/R600/fneg-fabs.ll
> @@ -33,8 +33,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
>   ; R600: |PV.{{[XYZW]}}|
>   ; R600: -PV
>   
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
>     %bc = bitcast i32 %in to float
>     %fabs = call float @llvm.fabs.f32(float %bc)
> @@ -48,8 +47,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
>   ; R600: |PV.{{[XYZW]}}|
>   ; R600: -PV
>   
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
>     %bc = bitcast i32 %in to float
>     %fabs = call float @fabs(float %bc)
> @@ -59,8 +57,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
>   }
>   
>   ; FUNC-LABEL: {{^}}fneg_fabs_f32:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
>     %fabs = call float @llvm.fabs.f32(float %in)
>     %fsub = fsub float -0.000000e+00, %fabs
> @@ -84,11 +81,8 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in)
>   ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
>   ; R600: -PV
>   
> -; FIXME: SGPR should be used directly for first src operand.
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
>     %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
>     %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
> @@ -98,12 +92,10 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
>   
>   ; FIXME: SGPR should be used directly for first src operand.
>   ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
>   define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
>     %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
>     %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
> -- 1.8.5.5
>
> 0002-R600-SI-Make-sure-non-inline-constants-aren-t-folded.patch
>
>
>  From 9dacc28f2ff57dafa62c091cb3bc5b995a7c2255 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 18 Dec 2014 16:42:54 -0500
> Subject: [PATCH 2/7] R600/SI: Make sure non-inline constants aren't folded
>   into mubuf soffset operand
>
> mubuf instructions now define the soffset field using the SCSrc_32
> register class which indicates that only SGPRs and inline constants
> are allowed.
> ---
>   lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp |  5 ++--
>   lib/Target/R600/SIInstrInfo.td                   | 30 ++++++++++++------------
>   lib/Target/R600/SIRegisterInfo.cpp               |  1 +
>   lib/Target/R600/SIRegisterInfo.td                |  6 +++++
>   test/CodeGen/R600/mubuf.ll                       | 25 ++++++++++++++++++++
>   5 files changed, 50 insertions(+), 17 deletions(-)


LGTM. I didn't know soffset could be a constant. I think more tests 
should be included that use inline immediate values for the constant, 
as well as values at the limits of the offset range (those might 
already be there).
>
>
> 0003-R600-SI-isLegalOperand-shouldn-t-check-constant-bus-.patch
>
>
>  From 47155e7618b04c2640f3dc1b2f77cfa6c70f6b2a Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:47:07 -0500
> Subject: [PATCH 3/7] R600/SI: isLegalOperand() shouldn't check constant bus
>   for SALU instructions
>
> The constant bus restrictions only apply to VALU instructions.  This
> enables SIFoldOperands to fold immediates into SALU instructions.
> ---
>   lib/Target/R600/SIInstrInfo.cpp | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 08bfc5e..a58a46e 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -1405,7 +1405,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
>     if (!MO)
>       MO = &MI->getOperand(OpIdx);
>   
> -  if (usesConstantBus(MRI, *MO)) {
> +  if (isVALU(InstDesc.Opcode) && usesConstantBus(MRI, *MO)) {
>       unsigned SGPRUsed =
>           MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
>       for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
> -- 1.8.5.5
LGTM

>
> 0004-R600-SI-Refactor-SIFoldOperands-to-simplify-immediat.patch
>
>
>  From b25bd5edfda7d9a16682e45ff40faa4411f81a73 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 17 Dec 2014 11:41:55 -0500
> Subject: [PATCH 4/7] R600/SI: Refactor SIFoldOperands to simplify immediate
>   folding
>
> This will make a future patch much less intrusive.
> ---
>   lib/Target/R600/SIFoldOperands.cpp | 79 ++++++++++++++++++++++++++------------
>   1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index 761e866..b172ee8 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -49,6 +49,23 @@ public:
>     }
>   };
>   
> +struct FoldCandidate {
> +  MachineInstr *UseMI;
> +  unsigned UseOpNo;
> +  MachineOperand *OpToFold;
> +  uint64_t ImmToFold;
> +
> +  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
> +      UseMI(MI), UseOpNo(OpNo), OpToFold(FoldOp), ImmToFold(0) { }
> +
> +  FoldCandidate(MachineInstr *MI, unsigned OpNo, uint64_t Imm) :
> +      UseMI(MI), UseOpNo(OpNo), OpToFold(nullptr), ImmToFold(Imm) { }
> +
> +  bool IsImm() const {
Should be isImm()
> +    return !OpToFold;
> +  }
> +};
> +
>   } // End anonymous namespace.
>   
>   INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
> @@ -78,30 +95,24 @@ static bool isSafeToFold(unsigned Opcode) {
>     }
>   }
>   
> -static bool updateOperand(MachineInstr *MI, unsigned OpNo,
> -                          const MachineOperand &New,
> +static bool updateOperand(FoldCandidate &Fold,
>                             const TargetRegisterInfo &TRI) {
> -  MachineOperand &Old = MI->getOperand(OpNo);
> +  MachineInstr *MI = Fold.UseMI;
> +  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
>     assert(Old.isReg());
>   
> -  if (New.isImm()) {
> -    Old.ChangeToImmediate(New.getImm());
> +  if (Fold.IsImm()) {
> +    Old.ChangeToImmediate(Fold.ImmToFold);
>       return true;
>     }
>   
> -  if (New.isFPImm()) {
> -    Old.ChangeToFPImmediate(New.getFPImm());
> +  MachineOperand *New = Fold.OpToFold;
> +  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
> +      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
> +    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
>       return true;
>     }
>   
> -  if (New.isReg())  {
> -    if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
> -        TargetRegisterInfo::isVirtualRegister(New.getReg())) {
> -      Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
> -      return true;
> -    }
> -  }
> -
>     // FIXME: Handle physical registers.
>   
>     return false;
> @@ -133,7 +144,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>              OpToFold.getSubReg()))
>           continue;
>   
> -      std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
> +      std::vector<FoldCandidate> FoldList;
>         for (MachineRegisterInfo::use_iterator
>              Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
>              Use != E; ++Use) {
> @@ -146,10 +157,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>             continue;
>           }
>   
> +        bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
> +
Have you tried replacing all FP immediates with integers instead?
>           // In order to fold immediates into copies, we need to change the
>           // copy to a MOV.
> -        if ((OpToFold.isImm() || OpToFold.isFPImm()) &&
> -             UseMI->getOpcode() == AMDGPU::COPY) {
> +        if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
>             const TargetRegisterClass *TRC =
>                 MRI.getRegClass(UseMI->getOperand(0).getReg());
>   
> @@ -173,9 +185,24 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>               UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
>             continue;
>   
> -        // Normal substitution
> +        if (FoldingImm) {
> +          uint64_t Imm;
> +          if (OpToFold.isFPImm()) {
> +            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
> +          } else {
> +            Imm = OpToFold.getImm();
> +          }
> +
> +          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
> +          if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
> +            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
> +            continue;
> +          }
> +        }
> +
> +        // Normal substitution with registers
>           if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &OpToFold)) {
> -          FoldList.push_back(std::make_pair(UseMI, Use.getOperandNo()));
> +          FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), &OpToFold));
>             continue;
>           }
>   
> @@ -187,13 +214,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>           // already does this.
>         }
>   
> -      for (std::pair<MachineInstr *, unsigned> Fold : FoldList) {
> -        if (updateOperand(Fold.first, Fold.second, OpToFold, TRI)) {
> +      for (FoldCandidate &Fold : FoldList) {
> +        if (updateOperand(Fold, TRI)) {
>             // Clear kill flags.
> -          if (OpToFold.isReg())
> -            OpToFold.setIsKill(false);
> +          if (!Fold.IsImm()) {
> +            assert(Fold.OpToFold && Fold.OpToFold->isReg());
> +            Fold.OpToFold->setIsKill(false);
> +          }
>             DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
> -                Fold.second << " of " << *Fold.first << '\n');
> +                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
>           }
>         }
>       }
> -- 1.8.5.5
>
> 0005-R600-SI-Teach-SIFoldOperands-to-split-64-bit-constan.patch
>
>
>  From a8f3e690476d07a0b87adc624c690a6a68b22736 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:49:19 -0500
> Subject: [PATCH 5/7] R600/SI: Teach SIFoldOperands to split 64-bit constants
>   when folding
>
> This allows folding of sequences like:
>
> s[0:1] = s_mov_b64 4
> v_add_i32 v0, s0, v0
> v_addc_u32 v1, s1, v1
>
> into
>
> v_add_i32 v0, 4, v0
> v_add_i32 v1, 0, v1
> ---
>   lib/Target/R600/SIFoldOperands.cpp   | 69 +++++++++++++++++++++++-------------
>   test/CodeGen/R600/operand-folding.ll | 17 +++++++++
>   test/CodeGen/R600/sint_to_fp.f64.ll  |  8 ++---
>   test/CodeGen/R600/uint_to_fp.f64.ll  |  8 ++---
>   4 files changed, 69 insertions(+), 33 deletions(-)
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index b172ee8..c661b0a 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -153,27 +153,51 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>           const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
>   
>           // FIXME: Fold operands with subregs.
> -        if (UseOp.isReg() && UseOp.getSubReg()) {
> +        if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
>             continue;
>           }
>   
>           bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
> +        APInt Imm;
>   
> -        // In order to fold immediates into copies, we need to change the
> -        // copy to a MOV.
> -        if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
> -          const TargetRegisterClass *TRC =
> -              MRI.getRegClass(UseMI->getOperand(0).getReg());
> -
> -          if (TRC->getSize() == 4) {
> -            if (TRI.isSGPRClass(TRC))
> -              UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
> -            else
> -              UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
> -          } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
> -            UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> +        if (FoldingImm) {
> +          const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
> +
> +          if (OpToFold.isFPImm()) {
> +            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
>             } else {
> -            continue;
> +            Imm = APInt(64, OpToFold.getImm());
> +          }
> +
> +          // Split 64-bit constants into 32-bits for folding.
> +          if (UseOp.getSubReg()) {
> +            if (UseRC->getSize() != 8)
> +              continue;
> +
> +            if (UseOp.getSubReg() == AMDGPU::sub0) {
> +              Imm = Imm.getLoBits(32);
> +            } else {
> +              assert(UseOp.getSubReg() == AMDGPU::sub1);
> +              Imm = Imm.getHiBits(32);
> +            }
> +          }
> +
> +          // In order to fold immediates into copies, we need to change the
> +          // copy to a MOV.
> +          if (UseMI->getOpcode() == AMDGPU::COPY) {
> +            const TargetRegisterClass *TRC =
> +                MRI.getRegClass(UseMI->getOperand(0).getReg());
> +
> +            if (TRC->getSize() == 4) {
> +              if (TRI.isSGPRClass(TRC))
> +                UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
> +              else
> +                UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
> +            } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
> +              UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> +            } else {
> +              continue;
> +            }
It would probably be useful to factor this into a getMovOpcode() helper somewhere.
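Something along these lines, maybe (just a sketch; the name, signature, and 
where it lives are up to you):

  // Possible helper; returns COPY when there is no single mov for this
  // register class, so the caller can keep the COPY and bail out.
  static unsigned getMovOpcode(const SIRegisterInfo &TRI,
                               const TargetRegisterClass *TRC) {
    if (TRC->getSize() == 4)
      return TRI.isSGPRClass(TRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
    if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC))
      return AMDGPU::S_MOV_B64;
    return AMDGPU::COPY;
  }

The COPY handling here then becomes a single lookup plus setDesc(), with a 
continue when COPY comes back.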
>             }
>           }
>   
> @@ -185,19 +209,14 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>               UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
>             continue;
>   
> -        if (FoldingImm) {
> -          uint64_t Imm;
> -          if (OpToFold.isFPImm()) {
> -            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
> -          } else {
> -            Imm = OpToFold.getImm();
> -          }
>   
> -          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
> +        if (FoldingImm) {
> +          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
>             if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
> -            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
> -            continue;
> +            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
> +                               Imm.getSExtValue()));
>             }
> +          continue;
>           }
>   
>           // Normal substitution with registers
> diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
> index 05177b4..f62aa09 100644
> --- a/test/CodeGen/R600/operand-folding.ll
> +++ b/test/CodeGen/R600/operand-folding.ll
> @@ -36,5 +36,22 @@ endif:
>     ret void
>   }
>   
> +; CHECK-LABEL: {{^}}fold_64bit_constant_add:
> +; CHECK-NOT: s_mov_b64
> +; FIXME: It would be better if we clud use v_add here and drop the extra
Typo 'clud'
> +; v_mov_b32 instructions.
> +; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
> +; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
> +; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
> +; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
> +; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
> +
> +define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
> +entry:
> +  %tmp0 = add i64 %val, 1
> +  store i64 %tmp0, i64 addrspace(1)* %out
> +  ret void
> +}
> +
A test that will use vector adds would also be useful, if one isn't there already.
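Roughly something like this, using the tid to make the value divergent so 
the add has to go through the VALU (untested sketch; the function name and 
CHECK lines are illustrative, not verified output):

  ; CHECK-LABEL: {{^}}fold_64bit_constant_add_vector:
  ; CHECK-NOT: s_mov_b64
  ; CHECK: v_add_i32
  ; CHECK: v_addc_u32
  define void @fold_64bit_constant_add_vector(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
  entry:
    %tid = call i32 @llvm.r600.read.tidig.x()
    %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
    %val = load i64 addrspace(1)* %gep
    %tmp0 = add i64 %val, 1
    store i64 %tmp0, i64 addrspace(1)* %out
    ret void
  }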

>   declare i32 @llvm.r600.read.tidig.x() #0
>   attributes #0 = { readnone }
> diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
> index 6e4f87c..efbdf25 100644
> --- a/test/CodeGen/R600/sint_to_fp.f64.ll
> +++ b/test/CodeGen/R600/sint_to_fp.f64.ll
> @@ -12,10 +12,10 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
>   
>   ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
>   ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
> -; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
> -; we should be able to fold the SGPRs into the V_CNDMASK instructions.
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
> +; uses an SGPR for [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
>   ; SI: buffer_store_dwordx2
>   ; SI: s_endpgm
>   define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
> diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
> index d16872b..fa70bdf 100644
> --- a/test/CodeGen/R600/uint_to_fp.f64.ll
> +++ b/test/CodeGen/R600/uint_to_fp.f64.ll
> @@ -72,10 +72,10 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
>   
>   ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
>   ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
> -; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
> -; we should be able to fold the SGPRs into the V_CNDMASK instructions.
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
> +; uses an SGPR for [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
>   ; SI: buffer_store_dwordx2
>   ; SI: s_endpgm
>   define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
> -- 1.8.5.5
>
> 0006-R600-SI-Add-a-V_MOV_B64-pseudo-instruction.patch
>
>
>  From 8ff850ac46ab082a06882e743946297c2791d6e3 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 17 Dec 2014 16:35:19 -0500
> Subject: [PATCH 6/7] R600/SI: Add a V_MOV_B64 pseudo instruction
>
> This is used to simplify the SIFoldOperands pass and make it easier to
> fold immediates.
> ---
>   lib/Target/R600/SIFoldOperands.cpp         |  3 +++
>   lib/Target/R600/SIInstrInfo.cpp            | 29 +++++++++++++++++++++++++++
>   lib/Target/R600/SIInstructions.td          |  6 ++++++
>   test/CodeGen/R600/atomic_cmp_swap_local.ll | 10 ++++------
>   test/CodeGen/R600/imm.ll                   |  7 ++-----
>   test/CodeGen/R600/local-atomics64.ll       | 32 ++++++++++++------------------
>   6 files changed, 57 insertions(+), 30 deletions(-)

I thought you were avoiding this with the previous patch? Does the pass 
handle constants specially, and this is for SGPRs only?

I think the name needs something to indicate that it's a 
pseudoinstruction. v_mov_b64 sounds like a plausible real instruction 
name (one I wish would be added), so it might confuse people who aren't 
aware it doesn't exist.
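For example, just renaming the def added further down (the exact name is 
only a suggestion):

  def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;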
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index c661b0a..1b0e09e 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -86,6 +86,7 @@ static bool isSafeToFold(unsigned Opcode) {
>     switch(Opcode) {
>     case AMDGPU::V_MOV_B32_e32:
>     case AMDGPU::V_MOV_B32_e64:
> +  case AMDGPU::V_MOV_B64:
>     case AMDGPU::S_MOV_B32:
>     case AMDGPU::S_MOV_B64:
>     case AMDGPU::COPY:
> @@ -195,6 +196,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
>                   UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
>               } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
>                 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> +            } else if (TRC->getSize() == 8 && !TRI.isSGPRClass(TRC)) {
> +              UseMI->setDesc(TII->get(AMDGPU::V_MOV_B64));
>               } else {
>                 continue;
>               }
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index a58a46e..fe29a50 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -662,6 +662,35 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
>       // This is just a placeholder for register allocation.
>       MI->eraseFromParent();
>       break;
> +
> +  case AMDGPU::V_MOV_B64: {
> +    unsigned Dst = MI->getOperand(0).getReg();
> +    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
> +    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
> +
> +    const MachineOperand &SrcOp = MI->getOperand(1);
> +    // FIXME: Will this work for 64-bit floating point immediates?
> +    assert(!SrcOp.isFPImm());
> +    if (SrcOp.isImm()) {
> +      APInt Imm(64, SrcOp.getImm());
> +      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
> +              .addImm(Imm.getLoBits(32).getZExtValue())
> +              .addReg(Dst, RegState::Implicit);
> +      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
> +              .addImm(Imm.getHiBits(32).getZExtValue())
> +              .addReg(Dst, RegState::Implicit);
> +    } else {
> +      assert(SrcOp.isReg());
> +      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
> +              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
> +              .addReg(Dst, RegState::Implicit);
> +      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
> +              .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
> +              .addReg(Dst, RegState::Implicit);
> +    }
> +    MI->eraseFromParent();
> +    break;
> +  }
>     }
>     return true;
>   }
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 96f75f9..dbc090d 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1742,6 +1742,12 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
>   //===----------------------------------------------------------------------===//
>   let isCodeGenOnly = 1, isPseudo = 1 in {
>   
> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
> +// 64-bit vector move instruction.  This is mainly used by the SIFoldOperands
> +// pass to enable folding of inline immediates.
> +def V_MOV_B64 : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
> +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
> +
>   let hasSideEffects = 1 in {
>   def SGPR_USE : InstSI <(outs),(ins), "", []>;
>   }
> diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
> index 223f4d3..a77627c 100644
> --- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
> +++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
> @@ -20,9 +20,8 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
>   ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
>   ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
>   ; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
> -; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
> -; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
> +; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
>   ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
>   ; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
>   ; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
> @@ -69,9 +68,8 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
>   ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
>   ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
>   ; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
> -; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
> -; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
> +; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
>   ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
>   ; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
>   ; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
> diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
> index 79f36b6..1cc03b8 100644
> --- a/test/CodeGen/R600/imm.ll
> +++ b/test/CodeGen/R600/imm.ll
> @@ -474,12 +474,9 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
>   }
>   
>   
> -; FIXME: These shoudn't bother materializing in SGPRs
> -
>   ; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64
> -; CHECK: s_mov_b64 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]{{\]}}, 0{{$}}
> -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
> -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
> +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
> +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
>   ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
>   define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
>     store double 0.0, double addrspace(1)* %out
> diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
> index ce0cf59..b39581e 100644
> --- a/test/CodeGen/R600/local-atomics64.ll
> +++ b/test/CodeGen/R600/local-atomics64.ll
> @@ -30,9 +30,8 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
>   ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
> +; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
>   ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
>   ; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
>   ; SI: buffer_store_dwordx2 [[RESULT]],
> @@ -45,9 +44,8 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
>   ; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
>   ; SI: buffer_store_dwordx2 [[RESULT]],
>   ; SI: s_endpgm
> @@ -87,9 +85,8 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
>   ; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
>   ; SI: buffer_store_dwordx2 [[RESULT]],
>   ; SI: s_endpgm
> @@ -277,10 +274,9 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
>   ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> -; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
>   ; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
>   ; SI: s_endpgm
>   define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
> @@ -290,9 +286,8 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
>   ; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
>   ; SI: s_endpgm
>   define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
> @@ -327,9 +322,8 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
>   }
>   
>   ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
>   ; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
>   ; SI: s_endpgm
>   define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
> -- 1.8.5.5
>
> 0007-R600-SI-Remove-SIISelLowering-legalizeOperands.patch
>
>
>  From dd174475c08471d665d33c6dbe665dafb35fe47c Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 19:05:21 -0500
> Subject: [PATCH 7/7] R600/SI: Remove SIISelLowering::legalizeOperands()
>
> Its functionality has been replaced by calling
> SIInstrInfo::legalizeOperands() from
> SIISelLowering::AdjustInstrPostInstrSelection() and running the
> SIFoldOperands and SIShrinkInstructions passes.
> ---
>   lib/Target/R600/SIISelLowering.cpp  | 174 +-----------------------------------
>   lib/Target/R600/SIISelLowering.h    |   3 -
>   test/CodeGen/R600/fneg.ll           |   2 +-
>   test/CodeGen/R600/imm.ll            |   4 +-
>   test/CodeGen/R600/seto.ll           |   2 +-
>   test/CodeGen/R600/setuo.ll          |   2 +-
>   test/CodeGen/R600/sint_to_fp.f64.ll |   4 +-
>   test/CodeGen/R600/sint_to_fp.ll     |   2 +-
>   test/CodeGen/R600/uint_to_fp.f64.ll |   4 +-
>   test/CodeGen/R600/uint_to_fp.ll     |   2 +-
>   test/CodeGen/R600/xor.ll            |   2 +-
>   11 files changed, 13 insertions(+), 188 deletions(-)

LGTM