PATCH: R600/SI: Add ComplexPattern to match MUBUF variant with no VADDR
Matt Arsenault
Matthew.Arsenault at amd.com
Mon Aug 11 12:36:09 PDT 2014
On 08/11/2014 07:09 AM, Tom Stellard wrote:
> Here are some updated patches. Mostly the same, but I had to
> update the areLoadsFromSameBasePtr function to handle the new
> MUBUF variant.
>
> -Tom
LGTM with a few minor issues
>
> 0001-R600-SI-Fix-broken-test.patch
>
>
> From fa2f3ea9a592f1b27efe06820ad184c803aa99cd Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 11 Aug 2014 06:37:45 -0400
> Subject: [PATCH 1/5] R600/SI: Fix broken test
>
> ---
> test/CodeGen/R600/smrd.ll | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
> index dec6185..6f05d3e 100644
> --- a/test/CodeGen/R600/smrd.ll
> +++ b/test/CodeGen/R600/smrd.ll
> @@ -64,8 +64,8 @@ main_body:
> ret void
> }
>
> -; SMRD load using the load.const intrinsic with an offset greater largest possible
> -; immediate offset.
> +; SMRD load using the load.const intrinsic with the largest possible immediate
> +; offset.
> ; CHECK-LABEL: @smrd_load_const1
> ; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
> define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
> @@ -76,9 +76,11 @@ main_body:
> call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
> ret void
> }
> -; SMRD load using the load.const intrinsic with the largetst possible
> +; SMRD load using the load.const intrinsic with an offset greater than the
> +; largets possible immediate.
> ; immediate offset.
> ; CHECK-LABEL: @smrd_load_const2
Typo: "largets" should be "largest".
>
> 0002-R600-SI-Clear-lds-bit-on-MUBUF-instructions-used-for.patch
>
>
> From 48fb3affd4a5c7544ce5b59df14452773088813a Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 6 Aug 2014 11:42:14 -0400
> Subject: [PATCH 2/5] R600/SI: Clear lds bit on MUBUF instructions used for
> private stores
>
> This bit was left uninitialized, which was causing some random failures
> of piglit tests.
>
> NOTE: This is a candidate for the 3.5 branch.
> ---
> lib/Target/R600/SIInstrInfo.td | 1 +
> test/CodeGen/R600/private-memory.ll | 19 +++++++++----------
> 2 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index eacfbc6..dfc7a59 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -975,6 +975,7 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
> name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#"$glc"#"$slc"#"$tfe",
> []
> > {
> + let lds = 0;
> let addr64 = 0;
> }
>
> diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
> index 9da3d32..b0f9c98 100644
> --- a/test/CodeGen/R600/private-memory.ll
> +++ b/test/CodeGen/R600/private-memory.ll
> @@ -1,6 +1,6 @@
> ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
> -; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
> -; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
> +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
> +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
>
> declare i32 @llvm.r600.read.tidig.x() nounwind readnone
>
> @@ -16,8 +16,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
> ; SI-PROMOTE: DS_READ_B32
> ; SI-PROMOTE: DS_READ_B32
>
> -; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
> +; SI-ALLOCA: BUFFER_STORE_DWORD v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
> define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
> entry:
> %stack = alloca [5 x i32], align 4
> @@ -116,10 +116,9 @@ for.end:
>
> ; R600: MOVA_INT
>
> -; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-PROMOTE: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> -; SI-PROMOTE-NOT: MOVREL
> -; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
> +; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
> +; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> define void @short_array(i32 addrspace(1)* %out, i32 %index) {
> entry:
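Broken check line: the prefix is written SI_PROMOTE (underscore) instead of SI-PROMOTE, so FileCheck will silently ignore the BUFFER_LOAD_SSHORT check.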
> %0 = alloca [2 x i16]
> @@ -138,8 +137,8 @@ entry:
>
> ; R600: MOVA_INT
>
> -; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} {{offen$}}
> -; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1
> +; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
> +; SI-DAG: BUFFER_STORE_BYTE v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1 ; encoding: [0x01,0x10,0x60,0xe0
> define void @char_array(i32 addrspace(1)* %out, i32 %index) {
> entry:
> %0 = alloca [2 x i8]
> -- 1.8.1.5
>
>
> 0005-R600-SI-Add-a-ComplexPattern-for-selecting-MUBUF-_OF.patch
>
>
> From 9aca99829a941739c893a4b35e37e1188d411dec Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Fri, 1 Aug 2014 11:00:34 -0400
> Subject: [PATCH 5/5] R600/SI: Add a ComplexPattern for selecting MUBUF _OFFSET
> variant
>
> This saves us from having to copy a 64-bit 0 value into VGPRs for
> BUFFER_* instruction which only have a 12-bit immediate offset.
> ---
> lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 147 ++++++++++------
> lib/Target/R600/SIInstrInfo.cpp | 259 +++++++++++++++++------------
> lib/Target/R600/SIInstrInfo.h | 1 +
> lib/Target/R600/SIInstrInfo.td | 37 ++++-
> test/CodeGen/R600/ctpop.ll | 4 +-
> test/CodeGen/R600/extload.ll | 12 +-
> test/CodeGen/R600/mubuf.ll | 14 +-
> test/CodeGen/R600/private-memory.ll | 2 +-
> test/CodeGen/R600/schedule-global-loads.ll | 19 ++-
> test/CodeGen/R600/sext-in-reg.ll | 6 +-
> test/CodeGen/R600/zero_extend.ll | 2 +-
> 11 files changed, 326 insertions(+), 177 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> index d4c0987..a624b70 100644
> --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> @@ -88,13 +88,16 @@ private:
> SDValue& Offset);
> bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
> bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
> - bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
> - SDValue &ImmOffset) const;
> + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> + SDValue &SOffset, SDValue &Offset, SDValue &Offen,
> + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
> + SDValue &TFE) const;
> + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> + SDValue &Offset) const;
> bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
> SDValue &SOffset, SDValue &ImmOffset) const;
> - bool SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
> - SDValue &SOffset, SDValue &Offset, SDValue &Offen,
> - SDValue &Idxen, SDValue &GLC, SDValue &SLC,
> + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
> + SDValue &Offset, SDValue &GLC, SDValue &SLC,
> SDValue &TFE) const;
> bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
> bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
> @@ -750,11 +753,23 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
> return isUInt<12>(Imm->getZExtValue());
> }
>
> -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
> - SDValue &Offset,
> - SDValue &ImmOffset) const {
> +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
> + SDValue &VAddr, SDValue &SOffset,
> + SDValue &Offset, SDValue &Offen,
> + SDValue &Idxen, SDValue &Addr64,
> + SDValue &GLC, SDValue &SLC,
> + SDValue &TFE) const {
> SDLoc DL(Addr);
>
> + GLC = CurDAG->getTargetConstant(0, MVT::i1);
> + SLC = CurDAG->getTargetConstant(0, MVT::i1);
> + TFE = CurDAG->getTargetConstant(0, MVT::i1);
> +
> + Idxen = CurDAG->getTargetConstant(0, MVT::i1);
> + Offen = CurDAG->getTargetConstant(0, MVT::i1);
> + Addr64 = CurDAG->getTargetConstant(0, MVT::i1);
> + SOffset = CurDAG->getTargetConstant(0, MVT::i32);
> +
> if (CurDAG->isBaseWithConstantOffset(Addr)) {
> SDValue N0 = Addr.getOperand(0);
> SDValue N1 = Addr.getOperand(1);
> @@ -763,59 +778,88 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
> if (isLegalMUBUFImmOffset(C1)) {
>
> if (N0.getOpcode() == ISD::ADD) {
> - // (add (add N2, N3), C1)
> + // (add (add N2, N3), C1) -> addr64
> SDValue N2 = N0.getOperand(0);
> SDValue N3 = N0.getOperand(1);
> - Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
> - Offset = N3;
> - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> - return true;
> + Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
> + Ptr = N2;
> + VAddr = N3;
> + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> + return;
> }
>
> - // (add N0, C1)
> - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));;
> - Offset = N0;
> - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> - return true;
> + // (add N0, C1) -> offset
> + VAddr = CurDAG->getTargetConstant(0, MVT::i32);
> + Ptr = N0;
> + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
> + return;
> }
> }
> if (Addr.getOpcode() == ISD::ADD) {
> - // (add N0, N1)
> + // (add N0, N1) -> addr64
> SDValue N0 = Addr.getOperand(0);
> SDValue N1 = Addr.getOperand(1);
> - Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
> - Offset = N1;
> - ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
> - return true;
> + Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
> + Ptr = N0;
> + VAddr = N1;
> + Offset = CurDAG->getTargetConstant(0, MVT::i16);
> + return;
> }
>
> - // default case
> - Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
> - Offset = Addr;
> - ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
> - return true;
> + // default case -> offset
> + VAddr = CurDAG->getTargetConstant(0, MVT::i32);
> + Ptr = Addr;
> + Offset = CurDAG->getTargetConstant(0, MVT::i16);
> +
> }
>
> -/// \brief Return a resource descriptor with the 'Add TID' bit enabled
> -/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
> -/// of the resource descriptor) to create an offset, which is added to the
> -/// resource ponter.
> -static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
> +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
> + SDValue &VAddr,
> + SDValue &Offset) const {
> + SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
>
> - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
> - 0xffffffff;
> + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
> + GLC, SLC, TFE);
> +
> + ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
> + if (C->getSExtValue()) {
> + SDLoc DL(Addr);
> + SRsrc = wrapAddr64Rsrc(CurDAG, DL, Ptr);
> + return true;
> + }
> + return false;
> +}
> +
> +static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr,
> + uint32_t RsrcDword1, uint64_t RsrcDword2And3) {
>
> SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
> SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
> + if (RsrcDword1)
> + PtrHi = SDValue(DAG->getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
> + DAG->getConstant(RsrcDword1, MVT::i32)), 0);
> +
> SDValue DataLo = DAG->getTargetConstant(
> - Rsrc & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
> - SDValue DataHi = DAG->getTargetConstant(Rsrc >> 32, MVT::i32);
> + RsrcDword2And3 & APInt::getAllOnesValue(32).getZExtValue(), MVT::i32);
> + SDValue DataHi = DAG->getTargetConstant(RsrcDword2And3 >> 32, MVT::i32);
>
> const SDValue Ops[] = { PtrLo, PtrHi, DataLo, DataHi };
> return SDValue(DAG->getMachineNode(AMDGPU::SI_BUFFER_RSRC, DL,
> MVT::v4i32, Ops), 0);
> }
>
> +/// \brief Return a resource descriptor with the 'Add TID' bit enabled
> +/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
> +/// of the resource descriptor) to create an offset, which is added to the
> +/// resource ponter.
Typo: "ponter" should be "pointer".
> +static SDValue buildScratchRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
> +
> + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
> + 0xffffffff; // Size
> +
> + return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
> +}
> +
> bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
> SDValue &VAddr, SDValue &SOffset,
> SDValue &ImmOffset) const {
> @@ -870,20 +914,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
> return true;
> }
>
> -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr32(SDValue Addr, SDValue &SRsrc,
> - SDValue &VAddr, SDValue &SOffset,
> - SDValue &Offset, SDValue &Offen,
> - SDValue &Idxen, SDValue &GLC,
> - SDValue &SLC, SDValue &TFE) const {
> +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
> + SDValue &SOffset, SDValue &Offset,
> + SDValue &GLC, SDValue &SLC,
> + SDValue &TFE) const {
> + SDValue Ptr, VAddr, Offen, Idxen, Addr64;
>
> - GLC = CurDAG->getTargetConstant(0, MVT::i1);
> - SLC = CurDAG->getTargetConstant(0, MVT::i1);
> - TFE = CurDAG->getTargetConstant(0, MVT::i1);
> + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
> + GLC, SLC, TFE);
>
> - Idxen = CurDAG->getTargetConstant(0, MVT::i1);
> - Offen = CurDAG->getTargetConstant(1, MVT::i1);
> -
> - return SelectMUBUFScratch(Addr, SRsrc, VAddr, SOffset, Offset);
> + if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
> + !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
> + !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
> + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT |
> + APInt::getAllOnesValue(32).getZExtValue(); // Size
> + SDLoc DL(Addr);
> + SRsrc = buildRSRC(CurDAG, DL, Ptr, 0, Rsrc);
> + return true;
> + }
> + return false;
> }
>
> bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 28a60aa..c9e121f 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -45,6 +45,33 @@ static SDValue findChainOperand(SDNode *Load) {
> return LastOp;
> }
>
> +/// \brief Returns true if both nodes have the same value for the given
> +/// operand \p Op, or if both nodes do not have this operand.
> +static bool nodesHaveSameOperandValue(SDNode*N0, SDNode* N1, unsigned OpName) {
Nit: inconsistent '*' placement ("SDNode*N0, SDNode* N1"); it should be "SDNode *N0, SDNode *N1".
> + unsigned Opc0 = N0->getMachineOpcode();
> + unsigned Opc1 = N1->getMachineOpcode();
> +
> + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
> + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
> +
> + if (Op0Idx == -1 && Op1Idx == -1)
> + return true;
> +
> +
> + if ((Op0Idx == -1 && Op1Idx != -1) ||
> + (Op1Idx == -1 && Op0Idx != -1))
> + return false;
> +
> + // getNamedOperandIdx returns the index for the MachineInstr's operands,
> + // which includes the result as the first operand. We are indexing into the
> + // MachineSDNode's operands, so we need to skip the result operand to get
> + // the real index.
> + --Op0Idx;
> + --Op1Idx;
> +
> + return N0->getOperand(Op0Idx) == N0->getOperand(Op1Idx);
> +}
> +
> bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
> int64_t &Offset0,
> int64_t &Offset1) const {
> @@ -98,32 +125,35 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
>
> // MUBUF and MTBUF can access the same addresses.
> if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
> - // Skip if an SGPR offset is applied. I don't think we ever emit any of
> - // variants that use this currently.
> - int SoffsetIdx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::soffset);
> - if (SoffsetIdx != -1)
> +
> + // MUBUF and MTBUF have vaddr at different indices.
> + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
> + findChainOperand(Load0) != findChainOperand(Load1) ||
> + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
> + !nodesHaveSameOperandValue(Load1, Load1, AMDGPU::OpName::srsrc))
> return false;
>
> - // getNamedOperandIdx returns the index for the MachineInstr's operands,
> - // which includes the result as the first operand. We are indexing into the
> - // MachineSDNode's operands, so we need to skip the result operand to get
> - // the real index.
> - --SoffsetIdx;
> + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
> + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
>
> - // Check chain.
> - if (findChainOperand(Load0) != findChainOperand(Load1))
> + if (OffIdx0 == -1 || OffIdx1 == -1)
> return false;
>
> - // MUBUF and MTBUF have vaddr at different indices.
> - int VaddrIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::vaddr) - 1;
> - int VaddrIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::vaddr) - 1;
> - if (Load0->getOperand(VaddrIdx0) != Load1->getOperand(VaddrIdx1))
> + // getNamedOperandIdx returns the index for MachineInstrs. Since they
> + // inlcude the output in the operand list, but SDNodes don't, we need to
> + // subtract the index by one.
> + --OffIdx0;
> + --OffIdx1;
> +
> + SDValue Off0 = Load0->getOperand(OffIdx0);
> + SDValue Off1 = Load1->getOperand(OffIdx1);
> +
> + // The offset might be a FrameIndexSDNode.
> + if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
> return false;
>
> - int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset) - 1;
> - int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset) - 1;
> - Offset0 = cast<ConstantSDNode>(Load0->getOperand(OffIdx0))->getZExtValue();
> - Offset1 = cast<ConstantSDNode>(Load1->getOperand(OffIdx1))->getZExtValue();
> + Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
> + Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
> return true;
> }
>
> @@ -1276,105 +1306,128 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
> // Legalize MUBUF* instructions
> // FIXME: If we start using the non-addr64 instructions for compute, we
> // may need to legalize them here.
> + int SRsrcIdx =
> + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
> + if (SRsrcIdx != -1) {
> + // We have an MUBUF instruction
> + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
> + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
> + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
> + RI.getRegClass(SRsrcRC))) {
> + // The operands are legal.
> + // FIXME: We may need to legalize operands besided srsrc.
> + return;
> + }
>
> - int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
> - AMDGPU::OpName::srsrc);
> - int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
> - AMDGPU::OpName::vaddr);
> - if (SRsrcIdx != -1 && VAddrIdx != -1) {
> - const TargetRegisterClass *VAddrRC =
> - RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
> -
> - if(VAddrRC->getSize() == 8 &&
> - MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
> - // We have a MUBUF instruction that uses a 64-bit vaddr register and
> - // srsrc has the incorrect register class. In order to fix this, we
> - // need to extract the pointer from the resource descriptor (srsrc),
> - // add it to the value of vadd, then store the result in the vaddr
> - // operand. Then, we need to set the pointer field of the resource
> - // descriptor to zero.
> + MachineBasicBlock &MBB = *MI->getParent();
> + // Extract the the ptr from the resource descriptor.
>
> - MachineBasicBlock &MBB = *MI->getParent();
> - MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
> - MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
> - unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
> - unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> - unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
> - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
> -
> - // SRsrcPtrLo = srsrc:sub0
> - SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
> - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> -
> - // SRsrcPtrHi = srsrc:sub1
> - SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
> - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> -
> - // VAddrLo = vaddr:sub0
> - VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
> - &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> -
> - // VAddrHi = vaddr:sub1
> - VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
> - &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> -
> - // NewVaddrLo = SRsrcPtrLo + VAddrLo
> + // SRsrcPtrLo = srsrc:sub0
> + unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
> + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
> +
> + // SRsrcPtrHi = srsrc:sub1
> + unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
> + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
> +
> + // Create an empty resource descriptor
> + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
> + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
> + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
> +
> + // Zero64 = 0
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
> + Zero64)
> + .addImm(0);
> +
> + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> + SRsrcFormatLo)
> + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
> +
> + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> + SRsrcFormatHi)
> + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
> +
> + // NewSRsrc = {Zero64, SRsrcFormat}
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> + NewSRsrc)
> + .addReg(Zero64)
> + .addImm(AMDGPU::sub0_sub1)
> + .addReg(SRsrcFormatLo)
> + .addImm(AMDGPU::sub2)
> + .addReg(SRsrcFormatHi)
> + .addImm(AMDGPU::sub3);
> +
> + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
> + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
> + unsigned NewVAddrLo;
> + unsigned NewVAddrHi;
> + if (VAddr) {
> + // This is already an ADDR64 instruction so we need to add the pointer
> + // extracted from the resource descriptor to the current value of VAddr.
> + NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> +
> + // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
> BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
> NewVAddrLo)
> .addReg(SRsrcPtrLo)
> - .addReg(VAddrLo)
> - .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
> + .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
> + .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
Can you use a local variable for MI->getDebugLoc()? Then you wouldn't have to
awkwardly wrap the output register operand onto its own line in each BuildMI call.
>
> - // NewVaddrHi = SRsrcPtrHi + VAddrHi
> + // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
> BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
> NewVAddrHi)
> .addReg(SRsrcPtrHi)
> - .addReg(VAddrHi)
> + .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
> .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
> .addReg(AMDGPU::VCC, RegState::Implicit);
>
> - // NewVaddr = {NewVaddrHi, NewVaddrLo}
> - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> - NewVAddr)
> - .addReg(NewVAddrLo)
> - .addImm(AMDGPU::sub0)
> - .addReg(NewVAddrHi)
> - .addImm(AMDGPU::sub1);
> -
> - // Zero64 = 0
> - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
> - Zero64)
> - .addImm(0);
> + } else {
> + // This instructions is the _OFFSET variant, so we need to convert it to
> + // ADDR64.
> + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
> + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
> + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
> + assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
> + "with non-zero soffset is not implemented");
> +
> + // Create the new instruction.
> + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
> + MachineInstr *Addr64 =
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
> + .addOperand(*VData)
> + .addOperand(*SRsrc)
> + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
> + // This will be replaced later
> + // with the new value of vaddr.
> + .addOperand(*Offset);
> +
> + MI->removeFromParent();
> + MI = Addr64;
> +
> + NewVAddrLo = SRsrcPtrLo;
> + NewVAddrHi = SRsrcPtrHi;
> + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
> + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
> + }
>
> - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
> - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> - SRsrcFormatLo)
> - .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
> + // NewVaddr = {NewVaddrHi, NewVaddrLo}
> + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> + NewVAddr)
> + .addReg(NewVAddrLo)
> + .addImm(AMDGPU::sub0)
> + .addReg(NewVAddrHi)
> + .addImm(AMDGPU::sub1);
>
> - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
> - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
> - SRsrcFormatHi)
> - .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
>
> - // NewSRsrc = {Zero64, SRsrcFormat}
> - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
> - NewSRsrc)
> - .addReg(Zero64)
> - .addImm(AMDGPU::sub0_sub1)
> - .addReg(SRsrcFormatLo)
> - .addImm(AMDGPU::sub2)
> - .addReg(SRsrcFormatHi)
> - .addImm(AMDGPU::sub3);
> -
> - // Update the instruction to use NewVaddr
> - MI->getOperand(VAddrIdx).setReg(NewVAddr);
> - // Update the instruction to use NewSRsrc
> - MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
> - }
> + // Update the instruction to use NewVaddr
> + VAddr->setReg(NewVAddr);
> + // Update the instruction to use NewSRsrc
> + SRsrc->setReg(NewSRsrc);
> }
> }
>
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index f106063..7085e98 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -209,6 +209,7 @@ namespace AMDGPU {
> int getCommuteRev(uint16_t Opcode);
> int getCommuteOrig(uint16_t Opcode);
> int getMCOpcode(uint16_t Opcode, unsigned Gen);
> + int getAddr64Inst(uint16_t Opcode);
>
> const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
> const uint64_t RSRC_TID_ENABLE = 1LL << 55;
> diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
> index 9214ecc..3c5a3a2 100644
> --- a/lib/Target/R600/SIInstrInfo.td
> +++ b/lib/Target/R600/SIInstrInfo.td
> @@ -194,6 +194,7 @@ def tfe : Operand <i1> {
> def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
> def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
> def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
> +def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
>
> def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
> def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
> @@ -901,6 +902,11 @@ class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
> let mayLoad = 1;
> }
>
> +class MUBUFAddr64Table <bit is_addr64> {
> +
> + bit IsAddr64 = is_addr64;
> +}
> +
> class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
> op,
> (outs),
> @@ -927,7 +933,11 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
> (ins SReg_128:$srsrc,
> mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
> slc:$slc, tfe:$tfe),
> - asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
> + asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
> + [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
> + i32:$soffset, i16:$offset,
> + i1:$glc, i1:$slc, i1:$tfe)))]>,
> + MUBUFAddr64Table<0>;
> }
>
> let offen = 1, idxen = 0 in {
> @@ -959,7 +969,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
> (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
> asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
> [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
> - i64:$vaddr, i16:$offset)))]>;
> + i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
> }
> }
> }
> @@ -979,6 +989,18 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
> []
> >;
>
> + let offen = 0, idxen = 0, vaddr = 0 in {
> + def _OFFSET : MUBUF <
> + op, (outs),
> + (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
> + SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
> + name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
> + [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
> + i16:$offset, i1:$glc, i1:$slc,
> + i1:$tfe))]
> + >, MUBUFAddr64Table<0>;
> + } // offen = 0, idxen = 0, vaddr = 0
> +
> let offen = 1, idxen = 0 in {
> def _OFFEN : MUBUF <
> op, (outs),
> @@ -997,7 +1019,8 @@ multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass
> (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
> name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
> [(st store_vt:$vdata,
> - (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]> {
> + (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
> + {
>
> let mayLoad = 0;
> let mayStore = 1;
> @@ -1216,4 +1239,12 @@ def getMCOpcode : InstrMapping {
> let ValueCols = [[!cast<string>(SISubtarget.SI)]];
> }
>
> +def getAddr64Inst : InstrMapping {
> + let FilterClass = "MUBUFAddr64Table";
> + let RowFields = ["NAME"];
> + let ColFields = ["IsAddr64"];
> + let KeyCol = ["0"];
> + let ValueCols = [["1"]];
> +}
> +
> include "SIInstructions.td"
> diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
> index f26d30d..c7c406a 100644
> --- a/test/CodeGen/R600/ctpop.ll
> +++ b/test/CodeGen/R600/ctpop.ll
> @@ -236,8 +236,8 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
> }
>
> ; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
> -; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 {{addr64$}}
> -; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} offset:0x10
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:0x10
> ; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
> ; SI: BUFFER_STORE_DWORD [[RESULT]],
> ; SI: S_ENDPGM
> diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
> index dc056e0..9725bbf 100644
> --- a/test/CodeGen/R600/extload.ll
> +++ b/test/CodeGen/R600/extload.ll
> @@ -87,8 +87,8 @@ define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
> }
>
> ; FUNC-LABEL: @zextload_global_i8_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
> ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
> ; SI: BUFFER_STORE_DWORDX2
> define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
> @@ -99,8 +99,8 @@ define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
> }
>
> ; FUNC-LABEL: @zextload_global_i16_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
> ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
> ; SI: BUFFER_STORE_DWORDX2
> define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
> @@ -111,8 +111,8 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
> }
>
> ; FUNC-LABEL: @zextload_global_i32_to_i64
> -; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
> -; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
> +; SI-DAG: S_MOV_B32 [[ZERO:s[0-9]+]], 0{{$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
> ; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
> ; SI: BUFFER_STORE_DWORDX2
> define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
> diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
> index 27faacf..bbfd329 100644
> --- a/test/CodeGen/R600/mubuf.ll
> +++ b/test/CodeGen/R600/mubuf.ll
> @@ -6,7 +6,7 @@
>
> ; MUBUF load with an immediate byte offset that fits into 12-bits
> ; CHECK-LABEL: @mubuf_load0
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x30,0xe0
> define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
> entry:
> %0 = getelementptr i32 addrspace(1)* %in, i64 1
> @@ -17,7 +17,7 @@ entry:
>
> ; MUBUF load with the largest possible immediate offset
> ; CHECK-LABEL: @mubuf_load1
> -; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0xfff ; encoding: [0xff,0x8f,0x20,0xe0
> +; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x20,0xe0
> define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
> entry:
> %0 = getelementptr i8 addrspace(1)* %in, i64 4095
> @@ -28,7 +28,7 @@ entry:
>
> ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
> ; CHECK-LABEL: @mubuf_load2
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
> define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
> entry:
> %0 = getelementptr i32 addrspace(1)* %in, i64 1024
> @@ -40,7 +40,7 @@ entry:
> ; MUBUF load with a 12-bit immediate offset and a register offset
> ; CHECK-LABEL: @mubuf_load3
> ; CHECK-NOT: ADD
> -; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
> +; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
> define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
> entry:
> %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
> @@ -56,7 +56,7 @@ entry:
>
> ; MUBUF store with an immediate byte offset that fits into 12-bits
> ; CHECK-LABEL: @mubuf_store0
> -; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x70,0xe0
> +; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x70,0xe0
> define void @mubuf_store0(i32 addrspace(1)* %out) {
> entry:
> %0 = getelementptr i32 addrspace(1)* %out, i64 1
> @@ -66,7 +66,7 @@ entry:
>
> ; MUBUF store with the largest possible immediate offset
> ; CHECK-LABEL: @mubuf_store1
> -; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0xfff ; encoding: [0xff,0x8f,0x60,0xe0
> +; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x60,0xe0
>
> define void @mubuf_store1(i8 addrspace(1)* %out) {
> entry:
> @@ -77,7 +77,7 @@ entry:
>
> ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
> ; CHECK-LABEL: @mubuf_store2
> -; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
> +; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
> define void @mubuf_store2(i32 addrspace(1)* %out) {
> entry:
> %0 = getelementptr i32 addrspace(1)* %out, i64 1024
> diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
> index b0f9c98..505ef6c 100644
> --- a/test/CodeGen/R600/private-memory.ll
> +++ b/test/CodeGen/R600/private-memory.ll
> @@ -118,7 +118,7 @@ for.end:
>
> ; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
> ; SI-PROMOTE-DAG: BUFFER_STORE_SHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
> -; SI_PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> +; SI-PROMOTE: BUFFER_LOAD_SSHORT v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
> define void @short_array(i32 addrspace(1)* %out, i32 %index) {
> entry:
> %0 = alloca [2 x i16]
> diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
> index f73d303..fcff65f 100644
> --- a/test/CodeGen/R600/schedule-global-loads.ll
> +++ b/test/CodeGen/R600/schedule-global-loads.ll
> @@ -9,8 +9,8 @@ declare i32 @llvm.r600.read.tidig.x() #1
> ; ordering the loads so that the lower address loads come first.
>
> ; FUNC-LABEL: @cluster_global_arg_loads
> -; SI: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
> -; SI: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
> +; SI-DAG: BUFFER_LOAD_DWORD [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
> +; SI-DAG: BUFFER_LOAD_DWORD [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x4
> ; SI: BUFFER_STORE_DWORD [[REG0]]
> ; SI: BUFFER_STORE_DWORD [[REG1]]
> define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
> @@ -22,5 +22,20 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)*
> ret void
> }
>
> +; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
> +; an MUBUF load which does not have a vaddr operand.
> +; FUNC-LABEL: @same_base_ptr_crash
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
> +entry:
> + %out1 = getelementptr i32 addrspace(1)* %out, i32 %offset
> + %tmp0 = load i32 addrspace(1)* %out
> + %tmp1 = load i32 addrspace(1)* %out1
> + %tmp2 = add i32 %tmp0, %tmp1
> + store i32 %tmp2, i32 addrspace(1)* %out
> + ret void
> +}
> +
> attributes #0 = { nounwind }
> attributes #1 = { nounwind readnone }
> diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
> index 1b02e4b..14f1cdf 100644
> --- a/test/CodeGen/R600/sext-in-reg.ll
> +++ b/test/CodeGen/R600/sext-in-reg.ll
> @@ -75,9 +75,9 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
> }
>
> ; FUNC-LABEL: @sext_in_reg_i1_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
> ; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: BUFFER_STORE_DWORDX2
> define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
> %c = add i64 %a, %b
> @@ -88,9 +88,9 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
> }
>
> ; FUNC-LABEL: @sext_in_reg_i8_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
> ; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: BUFFER_STORE_DWORDX2
>
> ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
> @@ -112,9 +112,9 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
> }
>
> ; FUNC-LABEL: @sext_in_reg_i16_to_i64
> +; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: S_ADD_I32 [[VAL:s[0-9]+]],
> ; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
> -; SI: S_MOV_B32 {{s[0-9]+}}, -1
> ; SI: BUFFER_STORE_DWORDX2
>
> ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
> diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
> index 8585d4a..1a0fd73 100644
> --- a/test/CodeGen/R600/zero_extend.ll
> +++ b/test/CodeGen/R600/zero_extend.ll
> @@ -6,7 +6,7 @@
> ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
>
> ; SI-CHECK: @test
> -; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0
> +; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0{{$}}
> ; SI-CHECK: V_MOV_B32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
> ; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[V_ZERO]]{{\]}}
> define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
> -- 1.8.1.5
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140811/bb28414e/attachment.html>
More information about the llvm-commits
mailing list