PATCH: R600/SI: Add ComplexPattern to match MUBUF variant with no VADDR

Mon Aug 11 12:36:09 PDT 2014

On 08/11/2014 07:09 AM, Tom Stellard wrote:
> Here are some updated patches.  Mostly the same, but I had to
> update the areLoadsFromSameBasePtr function to handle the new
> MUBUF variant.
>
> -Tom
LGTM with a few minor issues, noted inline below.


>
> 0001-R600-SI-Fix-broken-test.patch
>
>
>  From fa2f3ea9a592f1b27efe06820ad184c803aa99cd Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 11 Aug 2014 06:37:45 -0400
> Subject: [PATCH 1/5] R600/SI: Fix broken test
>
> ---
>   test/CodeGen/R600/smrd.ll | 8 +++++---
>   1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
> index dec6185..6f05d3e 100644
> --- a/test/CodeGen/R600/smrd.ll
> +++ b/test/CodeGen/R600/smrd.ll
> @@ -64,8 +64,8 @@ main_body:
>     ret void
>   }
>   
> -; SMRD load using the load.const intrinsic with an offset greater largest possible
> -; immediate offset.
> +; SMRD load using the load.const intrinsic with the largest possible immediate
> +; offset.
>   ; CHECK-LABEL: @smrd_load_const1
>   ; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
>   define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
> @@ -76,9 +76,11 @@ main_body:
>     call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
>     ret void
>   }
> -; SMRD load using the load.const intrinsic with the largetst possible
> +; SMRD load using the load.const intrinsic with an offset greater than the
> +; largets possible immediate.
>   ; immediate offset.
>   ; CHECK-LABEL: @smrd_load_const2
Typo: "largets" should be "largest".

>
> 0002-R600-SI-Clear-lds-bit-on-MUBUF-instructions-used-for.patch
>
>
>  From 48fb3affd4a5c7544ce5b59df14452773088813a Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 6 Aug 2014 11:42:14 -0400
> Subject: [PATCH 2/5] R600/SI: Clear lds bit on MUBUF instructions used for
>   private stores
>
> This bit was left uninitialized, which was causing some random failures
> of piglit tests.
>
> NOTE: This is a candidate for the 3.5 branch.
> ---
>   lib/Target/R600/SIInstrInfo.td      |  1 +
>   test/CodeGen/R600/private-memory.ll | 19 +++++++++----------
>   2 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index eacfbc6..dfc7a59 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -975,6 +975,7 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
>       name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#"$glc"#"$slc"#"$tfe",
>       []
>     > {
> +    let lds = 0;
>       let addr64 = 0;
>     }
>   
> diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
> index 9da3d32..b0f9c98 100644
> --- a/test/CodeGen/R600/private-memory.ll
> +++ b/test/CodeGen/R600/private-memory.ll
> @@ -1,6 +1,6 @@
>   ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
> -; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
> -; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
> +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
> +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
>   
>   declare i32 @llvm.r600.read.tidig.x() nounwind readnone
>   
> @@ -16,8 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
>   ; SI-PROMOTE: DS_READ_B32
>   ; SI-PROMOTE: DS_READ_B32
>   
> -; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
> +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
>   define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
>   entry:
>     %stack = alloca [5 x i32], align 4
> @@ -116,10 +116,9 @@ for.end:
>   
>   ; R600: MOVA_INT
>   
> -; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-PROMOTE-NOT: MOVREL
> -; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
> +; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
> +; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
>   define void @short_array(i32 addrspace(1)* %out, i32 %index) {
>   entry:
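Broken check line: the prefix is written SI_PROMOTE (underscore) instead of SI-PROMOTE, so FileCheck will silently ignore the BUFFER_LOAD_SSHORT check.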

>     %0 = alloca [2 x i16]
> @@ -138,8 +137,8 @@ entry:
>   
>   ; R600: MOVA_INT
>   
> -; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} {{offen$}}
> -; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1
> +; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
> +; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1 ; encoding: [0x01,0x10,0x60,0xe0
>   define void @char_array(i32 addrspace(1)* %out, i32 %index) {
>   entry:
>     %0 = alloca [2 x i8]
> -- 1.8.1.5
>
>

> 0005-R600-SI-Add-a-ComplexPattern-for-selecting-MUBUF-_OF.patch
>
>
>  From 9aca99829a941739c893a4b35e37e1188d411dec Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 1 Aug 2014 11:00:34 -0400
> Subject: [PATCH 5/5] R600/SI: Add a ComplexPattern for selecting MUBUF _OFFSET
>   variant
>
> This saves us from having to copy a 64-bit 0 value into VGPRs for
> BUFFER_* instruction which only have a 12-bit immediate offset.
> ---
>   lib/Target/R600/AMDGPUISelDAGToDAG.cpp     | 147 ++++++++++------
>   lib/Target/R600/SIInstrInfo.cpp            | 259 +++++++++++++++++------------
>   lib/Target/R600/SIInstrInfo.h              |   1 +
>   lib/Target/R600/SIInstrInfo.td             |  37 ++++-
>   test/CodeGen/R600/ctpop.ll                 |   4 +-
>   test/CodeGen/R600/extload.ll               |  12 +-
>   test/CodeGen/R600/mubuf.ll                 |  14 +-
>   test/CodeGen/R600/private-memory.ll        |   2 +-
>   test/CodeGen/R600/schedule-global-loads.ll |  19 ++-
>   test/CodeGen/R600/sext-in-reg.ll           |   6 +-
>   test/CodeGen/R600/zero_extend.ll           |   2 +-
>   11 files changed, 326 insertions(+), 177 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> index d4c0987..a624b70 100644
> --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> @@ -88,13 +88,16 @@ private:
>                                          SDValue& Offset);
>     bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
>     bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
> -  bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
> -                         SDValue &ImmOffset) const;
> +  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> +                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
> +                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
> +                   SDValue &TFE) const;
> +  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> +                         SDValue &Offset) const;
>     bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
>                             SDValue &SOffset, SDValue &ImmOffset) const;
> -  bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> -                         SDValue &SOffset, SDValue &Offset, SDValue &Offen,
> -                         SDValue &Idxen, SDValue &GLC, SDValue &SLC,
> +  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
> +                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
>                            SDValue &TFE) const;
>     bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
>     bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
> @@ -750,11 +753,23 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
>     return isUInt<12>(Imm->getZExtValue());
>   }
>   
> -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
> -                                           SDValue &Offset,
> -                                           SDValue &ImmOffset) const {
> +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
> +                                     SDValue &VAddr, SDValue &SOffset,
> +                                     SDValue &Offset, SDValue &Offen,
> +                                     SDValue &Idxen, SDValue &Addr64,
> +                                     SDValue &GLC, SDValue &SLC,
> +                                     SDValue &TFE) const {
>     SDLoc DL(Addr);
>   
> +  GLC = CurDAG->getTargetConstant(0, MVT::i1);
> +  SLC = CurDAG->getTargetConstant(0, MVT::i1);
> +  TFE = CurDAG->getTargetConstant(0, MVT::i1);
> +
> +  Idxen = CurDAG->getTargetConstant(0, MVT::i1);
> +  Offen = CurDAG->getTargetConstant(0, MVT::i1);
> +  Addr64 = CurDAG->getTargetConstant(0, MVT::i1);
> +  SOffset = CurDAG->getTargetConstant(0, MVT::i32);
> +
>     if (CurDAG->isBaseWithConstantOffset(Addr)) {
>       SDValue N0 = Addr.getOperand(0);
>       SDValue N1 = Addr.getOperand(1);
> @@ -763,59 +778,88 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
>       if (isLegalMUBUFImmOffset(C1)) {
>   
>         if (N0.getOpcode() == ISD::ADD) {
> -        // (add (add N2, N3), C1)
> +        // (add (add N2, N3), C1) -> addr64
>           SDValue N2 = N0.getOperand(0);
>           SDValue N3 = N0.getOperand(1);
> -        Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
> -        Offset = N3;
> -        ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> -        return true;
> +        Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
> +        Ptr = N2;
> +        VAddr = N3;
> +        Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> +        return;
>         }
>   
> -      // (add N0, C1)
> -      Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));;
> -      Offset = N0;
> -      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> -      return true;
> +      // (add N0, C1) -> offset
> +      VAddr = CurDAG->getTargetConstant(0, MVT::i32);
> +      Ptr = N0;
> +      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> +      return;
>       }
>     }
>     if (Addr.getOpcode() == ISD::ADD) {
> -    // (add N0, N1)
> +    // (add N0, N1) -> addr64
>       SDValue N0 = Addr.getOperand(0);
>       SDValue N1 = Addr.getOperand(1);
> -    Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
> -    Offset = N1;
> -    ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
> -    return true;
> +    Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
> +    Ptr = N0;
> +    VAddr = N1;
> +    Offset = CurDAG->getTargetConstant(0, MVT::i16);
> +    return;
>     }
>   
> -  // default case
> -  Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
> -  Offset = Addr;
> -  ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
> -  return true;
> +  // default case -> offset
> +  VAddr = CurDAG->getTargetConstant(0, MVT::i32);
> +  Ptr = Addr;
> +  Offset = CurDAG->getTargetConstant(0, MVT::i16);
> +
>   }
>   
> -/// \brief Return a resource descriptor with the 'Add TID' bit enabled
> -///        The TID (Thread ID) is multipled by the stride value (bits [61:48]
> -///        of the resource descriptor) to create an offset, which is added to the
> -///        resource ponter.
> -static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
> +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
> +                                           SDValue &VAddr,
> +                                           SDValue &Offset) const {
> +  SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
>   
> -  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
> -                  0xffffffff;
> +  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
> +              GLC, SLC, TFE);
> +
> +  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
> +  if (C->getSExtValue()) {
> +    SDLoc DL(Addr);
> +    SRsrc = wrapAddr64Rsrc(CurDAG, DL, Ptr);
> +    return true;
> +  }
> +  return false;
> +}
> +
> +static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr,
> +                         uint32_t RsrcDword1, uint64_t RsrcDword2And3) {
>   
>     SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
>     SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
> +  if (RsrcDword1)
> +    PtrHi = SDValue(DAG->getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
> +                                    DAG->getConstant(RsrcDword1, MVT::i32)), 0);
> +
>     SDValue DataLo = DAG->getTargetConstant(
> -      Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
> -  SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
> +      RsrcDword2And3 & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
> +  SDValue DataHi = DAG->getTargetConstant(RsrcDword2And3 >> 32, MVT::i32);
>   
>     const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
>     return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
>                                        MVT::v4i32, Ops), 0);
>   }
>   
> +/// \brief Return a resource descriptor with the 'Add TID' bit enabled
> +///        The TID (Thread ID) is multipled by the stride value (bits [61:48]
> +///        of the resource descriptor) to create an offset, which is added to the
> +///        resource ponter.
Typo: "ponter" should be "pointer".

> +static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
> +
> +  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
> +                  0xffffffff; // Size
> +
> +  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
> +}
> +
>   bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
>                                               SDValue &VAddr, SDValue &SOffset,
>                                               SDValue &ImmOffset) const {
> @@ -870,20 +914,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
>     return true;
>   }
>   
> -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
> -                                           SDValue &VAddr, SDValue &SOffset,
> -                                           SDValue &Offset, SDValue &Offen,
> -                                           SDValue &Idxen, SDValue &GLC,
> -                                           SDValue &SLC, SDValue &TFE) const {
> +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
> +                                           SDValue &SOffset, SDValue &Offset,
> +                                           SDValue &GLC, SDValue &SLC,
> +                                           SDValue &TFE) const {
> +  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
>   
> -  GLC = CurDAG->getTargetConstant(0, MVT::i1);
> -  SLC = CurDAG->getTargetConstant(0, MVT::i1);
> -  TFE = CurDAG->getTargetConstant(0, MVT::i1);
> +  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
> +              GLC, SLC, TFE);
>   
> -  Idxen = CurDAG->getTargetConstant(0, MVT::i1);
> -  Offen = CurDAG->getTargetConstant(1, MVT::i1);
> -
> -  return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
> +  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
> +      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
> +      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
> +    uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT |
> +                    APInt::getAllOnesValue(32).getZExtValue(); // Size
> +    SDLoc DL(Addr);
> +    SRsrc = buildRSRC(CurDAG, DL, Ptr, 0, Rsrc);
> +    return true;
> +  }
> +  return false;
>   }
>   
>   bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 28a60aa..c9e121f 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -45,6 +45,33 @@ static SDValue findChainOperand(SDNode *Load) {
>     return LastOp;
>   }
>   
> +/// \brief Returns true if both nodes have the same value for the given
> +///        operand \p Op, or if both nodes do not have this operand.
> +static bool nodesHaveSameOperandValue(SDNode*N0, SDNode*  N1, unsigned OpName) {
Nit: inconsistent '*' placement in the parameter list; it should be "SDNode *N0, SDNode *N1".

> +  unsigned Opc0 = N0->getMachineOpcode();
> +  unsigned Opc1 = N1->getMachineOpcode();
> +
> +  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
> +  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
> +
> +  if (Op0Idx == -1 && Op1Idx == -1)
> +    return true;
> +
> +
> +  if ((Op0Idx == -1 && Op1Idx != -1) ||
> +      (Op1Idx == -1 && Op0Idx != -1))
> +    return false;
> +
> +  // getNamedOperandIdx returns the index for the MachineInstr's operands,
> +  // which includes the result as the first operand. We are indexing into the
> +  // MachineSDNode's operands, so we need to skip the result operand to get
> +  // the real index.
> +  --Op0Idx;
> +  --Op1Idx;
> +
> +  return N0->getOperand(Op0Idx) == N0->getOperand(Op1Idx);
> +}
> +
>   bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
>                                             int64_t &Offset0,
>                                             int64_t &Offset1) const {
> @@ -98,32 +125,35 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
>   
>     // MUBUF and MTBUF can access the same addresses.
>     if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
> -    // Skip if an SGPR offset is applied. I don't think we ever emit any of
> -    // variants that use this currently.
> -    int SoffsetIdx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::soffset);
> -    if (SoffsetIdx != -1)
> +
> +    // MUBUF and MTBUF have vaddr at different indices.
> +    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
> +        findChainOperand(Load0) != findChainOperand(Load1) ||
> +        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
> +        !nodesHaveSameOperandValue(Load1, Load1, AMDGPU::OpName::srsrc))
>         return false;
>   
> -    // getNamedOperandIdx returns the index for the MachineInstr's operands,
> -    // which includes the result as the first operand. We are indexing into the
> -    // MachineSDNode's operands, so we need to skip the result operand to get
> -    // the real index.
> -    --SoffsetIdx;
> +    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
> +    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
>   
> -    // Check chain.
> -    if (findChainOperand(Load0) != findChainOperand(Load1))
> +    if (OffIdx0 == -1 || OffIdx1 == -1)
>         return false;
>   
> -    // MUBUF and MTBUF have vaddr at different indices.
> -    int VaddrIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::vaddr) - 1;
> -    int VaddrIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::vaddr) - 1;
> -    if (Load0->getOperand(VaddrIdx0) != Load1->getOperand(VaddrIdx1))
> +    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
> +    // inlcude the output in the operand list, but SDNodes don't, we need to
> +    // subtract the index by one.
> +    --OffIdx0;
> +    --OffIdx1;
> +
> +    SDValue Off0 = Load0->getOperand(OffIdx0);
> +    SDValue Off1 = Load1->getOperand(OffIdx1);
> +
> +    // The offset might be a FrameIndexSDNode.
> +    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
>         return false;
>   
> -    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset) - 1;
> -    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset) - 1;
> -    Offset0 = cast<ConstantSDNode>(Load0->getOperand(OffIdx0))->getZExtValue();
> -    Offset1 = cast<ConstantSDNode>(Load1->getOperand(OffIdx1))->getZExtValue();
> +    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
> +    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
>       return true;
>     }
>   
> @@ -1276,105 +1306,128 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
>     // Legalize MUBUF* instructions
>     // FIXME: If we start using the non-addr64 instructions for compute, we
>     // may need to legalize them here.
> +  int SRsrcIdx =
> +      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
> +  if (SRsrcIdx != -1) {
> +    // We have an MUBUF instruction
> +    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
> +    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
> +    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
> +                                             RI.getRegClass(SRsrcRC))) {
> +      // The operands are legal.
> +      // FIXME: We may need to legalize operands besided srsrc.
> +      return;
> +    }
>   
> -  int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
> -                                            AMDGPU::OpName::srsrc);
> -  int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
> -                                             AMDGPU::OpName::vaddr);
> -  if (SRsrcIdx != -1 && VAddrIdx != -1) {
> -    const TargetRegisterClass *VAddrRC =
> -        RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
> -
> -    if(VAddrRC->getSize() == 8 &&
> -       MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
> -      // We have a MUBUF instruction that uses a 64-bit vaddr register and
> -      // srsrc has the incorrect register class.  In order to fix this, we
> -      // need to extract the pointer from the resource descriptor (srsrc),
> -      // add it to the value of vadd,  then store the result in the vaddr
> -      // operand.  Then, we need to set the pointer field of the resource
> -      // descriptor to zero.
> +    MachineBasicBlock &MBB = *MI->getParent();
> +    // Extract the the ptr from the resource descriptor.
>   
> -      MachineBasicBlock &MBB = *MI->getParent();
> -      MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
> -      MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
> -      unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
> -      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> -      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> -      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
> -      unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> -      unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> -      unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> -      unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
> -
> -      // SRsrcPtrLo = srsrc:sub0
> -      SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
> -          &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> -
> -      // SRsrcPtrHi = srsrc:sub1
> -      SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
> -          &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> -
> -      // VAddrLo = vaddr:sub0
> -      VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
> -          &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> -
> -      // VAddrHi = vaddr:sub1
> -      VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
> -          &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> -
> -      // NewVaddrLo = SRsrcPtrLo + VAddrLo
> +    // SRsrcPtrLo = srsrc:sub0
> +    unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
> +        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> +
> +    // SRsrcPtrHi = srsrc:sub1
> +    unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
> +        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> +
> +    // Create an empty resource descriptor
> +    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> +    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> +    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> +    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
> +
> +    // Zero64 = 0
> +    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
> +            Zero64)
> +            .addImm(0);
> +
> +    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
> +    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> +            SRsrcFormatLo)
> +            .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
> +
> +    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
> +    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> +            SRsrcFormatHi)
> +            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
> +
> +    // NewSRsrc = {Zero64, SRsrcFormat}
> +    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> +            NewSRsrc)
> +            .addReg(Zero64)
> +            .addImm(AMDGPU::sub0_sub1)
> +            .addReg(SRsrcFormatLo)
> +            .addImm(AMDGPU::sub2)
> +            .addReg(SRsrcFormatHi)
> +            .addImm(AMDGPU::sub3);
> +
> +    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
> +    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
> +    unsigned NewVAddrLo;
> +    unsigned NewVAddrHi;
> +    if (VAddr) {
> +      // This is already an ADDR64 instruction so we need to add the pointer
> +      // extracted from the resource descriptor to the current value of VAddr.
> +      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> +      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> +
> +      // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
>         BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
>                 NewVAddrLo)
>                 .addReg(SRsrcPtrLo)
> -              .addReg(VAddrLo)
> -              .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
> +              .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
> +              .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
Can you store MI->getDebugLoc() in a local variable? Then the BuildMI calls
are short enough that you don't have to awkwardly wrap the output register
operand onto its own line.


>   
> -      // NewVaddrHi = SRsrcPtrHi + VAddrHi
> +      // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
>         BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
>                 NewVAddrHi)
>                 .addReg(SRsrcPtrHi)
> -              .addReg(VAddrHi)
> +              .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
>                 .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
>                 .addReg(AMDGPU::VCC, RegState::Implicit);
>   
> -      // NewVaddr = {NewVaddrHi, NewVaddrLo}
> -      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> -              NewVAddr)
> -              .addReg(NewVAddrLo)
> -              .addImm(AMDGPU::sub0)
> -              .addReg(NewVAddrHi)
> -              .addImm(AMDGPU::sub1);
> -
> -      // Zero64 = 0
> -      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
> -              Zero64)
> -              .addImm(0);
> +    } else {
> +      // This instructions is the _OFFSET variant, so we need to convert it to
> +      // ADDR64.
> +      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
> +      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
> +      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
> +      assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
> +             "with non-zero soffset is not implemented");
> +
> +      // Create the new instruction.
> +      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
> +      MachineInstr *Addr64 =
> +          BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
> +                  .addOperand(*VData)
> +                  .addOperand(*SRsrc)
> +                  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
> +                                              // This will be replaced later
> +                                              // with the new value of vaddr.
> +                  .addOperand(*Offset);
> +
> +      MI->removeFromParent();
> +      MI = Addr64;
> +
> +      NewVAddrLo = SRsrcPtrLo;
> +      NewVAddrHi = SRsrcPtrHi;
> +      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
> +      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
> +    }
>   
> -      // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
> -      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> -              SRsrcFormatLo)
> -              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
> +    // NewVaddr = {NewVaddrHi, NewVaddrLo}
> +    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> +            NewVAddr)
> +            .addReg(NewVAddrLo)
> +            .addImm(AMDGPU::sub0)
> +            .addReg(NewVAddrHi)
> +            .addImm(AMDGPU::sub1);
>   
> -      // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
> -      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> -              SRsrcFormatHi)
> -              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
>   
> -      // NewSRsrc = {Zero64, SRsrcFormat}
> -      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> -              NewSRsrc)
> -              .addReg(Zero64)
> -              .addImm(AMDGPU::sub0_sub1)
> -              .addReg(SRsrcFormatLo)
> -              .addImm(AMDGPU::sub2)
> -              .addReg(SRsrcFormatHi)
> -              .addImm(AMDGPU::sub3);
> -
> -      // Update the instruction to use NewVaddr
> -      MI->getOperand(VAddrIdx).setReg(NewVAddr);
> -      // Update the instruction to use NewSRsrc
> -      MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
> -    }
> +    // Update the instruction to use NewVaddr
> +    VAddr->setReg(NewVAddr);
> +    // Update the instruction to use NewSRsrc
> +    SRsrc->setReg(NewSRsrc);
>     }
>   }
>   
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index f106063..7085e98 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -209,6 +209,7 @@ namespace AMDGPU {
>     int getCommuteRev(uint16_t Opcode);
>     int getCommuteOrig(uint16_t Opcode);
>     int getMCOpcode(uint16_t Opcode, unsigned Gen);
> +  int getAddr64Inst(uint16_t Opcode);
>   
>     const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
>     const uint64_t RSRC_TID_ENABLE = 1LL << 55;
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 9214ecc..3c5a3a2 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -194,6 +194,7 @@ def tfe : Operand <i1> {
>   def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
>   def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
>   def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
> +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
>   
>   def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
>   def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
> @@ -901,6 +902,11 @@ class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
>     let mayLoad = 1;
>   }
>   
> +class MUBUFAddr64Table <bit is_addr64> {
> +
> +  bit IsAddr64 = is_addr64;
> +}
> +
>   class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
>     op,
>     (outs),
> @@ -927,7 +933,11 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
>                                (ins SReg_128:$srsrc,
>                                mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
>                                slc:$slc, tfe:$tfe),
> -                             asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
> +                             asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
> +                             [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
> +                                                       i32:$soffset, i16:$offset,
> +                                                       i1:$glc, i1:$slc, i1:$tfe)))]>,
> +                     MUBUFAddr64Table<0>;
>         }
>   
>         let offen = 1, idxen = 0  in {
> @@ -959,7 +969,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
>                              (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
>                              asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
>                              [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
> -                                                  i64:$vaddr, i16:$offset)))]>;
> +                                                  i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
>       }
>     }
>   }
> @@ -979,6 +989,18 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
>         []
>       >;
>   
> +    let offen = 0, idxen = 0, vaddr = 0 in {
> +      def _OFFSET : MUBUF <
> +        op, (outs),
> +        (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
> +              SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
> +        name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
> +        [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
> +                                           i16:$offset, i1:$glc, i1:$slc,
> +                                           i1:$tfe))]
> +      >, MUBUFAddr64Table<0>;
> +    } // offen = 0, idxen = 0, vaddr = 0
> +
>       let offen = 1, idxen = 0  in {
>         def _OFFEN  : MUBUF <
>           op, (outs),
> @@ -997,7 +1019,8 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
>       (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
>       name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
>       [(st store_vt:$vdata,
> -     (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]> {
> +     (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
> +     {
>   
>         let mayLoad = 0;
>         let mayStore = 1;
> @@ -1216,4 +1239,12 @@ def getMCOpcode : InstrMapping {
>     let ValueCols = [[!cast<string>(SISubtarget.SI)]];
>   }
>   
> +def getAddr64Inst : InstrMapping {
> +  let FilterClass = "MUBUFAddr64Table";
> +  let RowFields = ["NAME"];
> +  let ColFields = ["IsAddr64"];
> +  let KeyCol = ["0"];
> +  let ValueCols = [["1"]];
> +}
> +
>   include "SIInstructions.td"
> diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
> index f26d30d..c7c406a 100644
> --- a/test/CodeGen/R600/ctpop.ll
> +++ b/test/CodeGen/R600/ctpop.ll
> @@ -236,8 +236,8 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
>   }
>   
>   ; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
> -; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 {{addr64$}}
> -; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} offset:0x10
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:0x10
>   ; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
>   ; SI: BUFFER_STORE_DWORD [[RESULT]],
>   ; SI: S_ENDPGM
> diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
> index dc056e0..9725bbf 100644
> --- a/test/CodeGen/R600/extload.ll
> +++ b/test/CodeGen/R600/extload.ll
> @@ -87,8 +87,8 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
>   }
>   
>   ; FUNC-LABEL: @zextload_global_i8_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
>   ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
>   ; SI: BUFFER_STORE_DWORDX2
>   define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
> @@ -99,8 +99,8 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
>   }
>   
>   ; FUNC-LABEL: @zextload_global_i16_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
>   ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
>   ; SI: BUFFER_STORE_DWORDX2
>   define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
> @@ -111,8 +111,8 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
>   }
>   
>   ; FUNC-LABEL: @zextload_global_i32_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
>   ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
>   ; SI: BUFFER_STORE_DWORDX2
>   define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
> diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
> index 27faacf..bbfd329 100644
> --- a/test/CodeGen/R600/mubuf.ll
> +++ b/test/CodeGen/R600/mubuf.ll
> @@ -6,7 +6,7 @@
>   
>   ; MUBUF load with an immediate byte offset that fits into 12-bits
>   ; CHECK-LABEL: @mubuf_load0
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x30,0xe0
>   define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>   entry:
>     %0 = getelementptr i32 addrspace(1)* %in, i64 1
> @@ -17,7 +17,7 @@ entry:
>   
>   ; MUBUF load with the largest possible immediate offset
>   ; CHECK-LABEL: @mubuf_load1
> -; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0xfff ; encoding: [0xff,0x8f,0x20,0xe0
> +; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x20,0xe0
>   define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
>   entry:
>     %0 = getelementptr i8 addrspace(1)* %in, i64 4095
> @@ -28,7 +28,7 @@ entry:
>   
>   ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
>   ; CHECK-LABEL: @mubuf_load2
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
>   define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
>   entry:
>     %0 = getelementptr i32 addrspace(1)* %in, i64 1024
> @@ -40,7 +40,7 @@ entry:
>   ; MUBUF load with a 12-bit immediate offset and a register offset
>   ; CHECK-LABEL: @mubuf_load3
>   ; CHECK-NOT: ADD
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
>   define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
>   entry:
>     %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
> @@ -56,7 +56,7 @@ entry:
>   
>   ; MUBUF store with an immediate byte offset that fits into 12-bits
>   ; CHECK-LABEL: @mubuf_store0
> -; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x70,0xe0
> +; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x70,0xe0
>   define void @mubuf_store0(i32 addrspace(1)* %out) {
>   entry:
>     %0 = getelementptr i32 addrspace(1)* %out, i64 1
> @@ -66,7 +66,7 @@ entry:
>   
>   ; MUBUF store with the largest possible immediate offset
>   ; CHECK-LABEL: @mubuf_store1
> -; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0xfff ; encoding: [0xff,0x8f,0x60,0xe0
> +; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x60,0xe0
>   
>   define void @mubuf_store1(i8 addrspace(1)* %out) {
>   entry:
> @@ -77,7 +77,7 @@ entry:
>   
>   ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
>   ; CHECK-LABEL: @mubuf_store2
> -; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
> +; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
>   define void @mubuf_store2(i32 addrspace(1)* %out) {
>   entry:
>     %0 = getelementptr i32 addrspace(1)* %out, i64 1024
> diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
> index b0f9c98..505ef6c 100644
> --- a/test/CodeGen/R600/private-memory.ll
> +++ b/test/CodeGen/R600/private-memory.ll
> @@ -118,7 +118,7 @@ for.end:
>   
>   ; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
>   ; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
> -; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
>   define void @short_array(i32 addrspace(1)* %out, i32 %index) {
>   entry:
>     %0 = alloca [2 x i16]
> diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
> index f73d303..fcff65f 100644
> --- a/test/CodeGen/R600/schedule-global-loads.ll
> +++ b/test/CodeGen/R600/schedule-global-loads.ll
> @@ -9,8 +9,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
>   ; ordering the loads so that the lower address loads come first.
>   
>   ; FUNC-LABEL: @cluster_global_arg_loads
> -; SI: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
> -; SI: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
> +; SI-DAG: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x4
>   ; SI: BUFFER_STORE_DWORD [[REG0]]
>   ; SI: BUFFER_STORE_DWORD [[REG1]]
>   define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
> @@ -22,5 +22,20 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)*
>     ret void
>   }
>   
> +; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
> +; an MUBUF load which does not have a vaddr operand.
> +; FUNC-LABEL: @same_base_ptr_crash
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
> +entry:
> +  %out1 = getelementptr i32 addrspace(1)* %out, i32 %offset
> +  %tmp0 = load i32 addrspace(1)* %out
> +  %tmp1 = load i32 addrspace(1)* %out1
> +  %tmp2 = add i32 %tmp0, %tmp1
> +  store i32 %tmp2, i32 addrspace(1)* %out
> +  ret void
> +}
> +
>   attributes #0 = { nounwind }
>   attributes #1 = { nounwind readnone }
> diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
> index 1b02e4b..14f1cdf 100644
> --- a/test/CodeGen/R600/sext-in-reg.ll
> +++ b/test/CodeGen/R600/sext-in-reg.ll
> @@ -75,9 +75,9 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
>   }
>   
>   ; FUNC-LABEL: @sext_in_reg_i1_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
>   ; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: BUFFER_STORE_DWORDX2
>   define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
>     %c = add i64 %a, %b
> @@ -88,9 +88,9 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
>   }
>   
>   ; FUNC-LABEL: @sext_in_reg_i8_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
>   ; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: BUFFER_STORE_DWORDX2
>   
>   ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
> @@ -112,9 +112,9 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
>   }
>   
>   ; FUNC-LABEL: @sext_in_reg_i16_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
>   ; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
>   ; SI: BUFFER_STORE_DWORDX2
>   
>   ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
> diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
> index 8585d4a..1a0fd73 100644
> --- a/test/CodeGen/R600/zero_extend.ll
> +++ b/test/CodeGen/R600/zero_extend.ll
> @@ -6,7 +6,7 @@
>   ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
>   
>   ; SI-CHECK: @test
> -; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0
> +; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0{{$}}
>   ; SI-CHECK: V_MOV_B32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
>   ; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[V_ZERO]]{{\]}}
>   define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
> -- 1.8.1.5

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140811/bb28414e/attachment.html>