[PATCH] R600: Fix mishandling of load / store chains.
Tom Stellard
tom at stellard.net
Fri Jul 4 13:21:07 PDT 2014
On Sat, Jun 28, 2014 at 03:28:18AM +0000, Matt Arsenault wrote:
> Fixes various bugs with reordering loads and stores. Scalarized vector loads weren't collecting the chains at all.
LGTM.
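
For anyone skimming the archive: the recurring fix in this patch is that a
custom-lowered load must produce two results, the loaded value and a chain,
and return them together with DAG.getMergeValues(). A minimal sketch of the
pattern (LowerMyLoad is a hypothetical hook, not code from this patch):

  SDValue LowerMyLoad(SDValue Op, SelectionDAG &DAG) {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
    SDLoc DL(Op);

    SDValue NewLoad = DAG.getLoad(Op.getValueType(), DL, Load->getChain(),
                                  Load->getBasePtr(), Load->getMemOperand());

    // Returning only NewLoad.getValue(0) would drop the chain result, so
    // nothing would keep later stores ordered after this load.
    SDValue Ops[] = { NewLoad.getValue(0), NewLoad.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }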
>
> http://reviews.llvm.org/D4335
>
> Files:
> lib/Target/R600/AMDGPUISelLowering.cpp
> lib/Target/R600/SIISelLowering.cpp
> test/CodeGen/R600/reorder-stores.ll
> Index: lib/Target/R600/AMDGPUISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/AMDGPUISelLowering.cpp
> +++ lib/Target/R600/AMDGPUISelLowering.cpp
> @@ -997,22 +997,36 @@
> SelectionDAG &DAG) const {
> LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
> EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
> + EVT LoadVT = Op.getValueType();
> EVT EltVT = Op.getValueType().getVectorElementType();
> EVT PtrVT = Load->getBasePtr().getValueType();
> +
> unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
> SmallVector<SDValue, 8> Loads;
> + SmallVector<SDValue, 8> Chains;
> +
> SDLoc SL(Op);
>
> for (unsigned i = 0, e = NumElts; i != e; ++i) {
> SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
> DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
> - Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
> - Load->getChain(), Ptr,
> - MachinePointerInfo(Load->getMemOperand()->getValue()),
> - MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
> - Load->getAlignment()));
> +
> + SDValue NewLoad
> + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
> + Load->getChain(), Ptr,
> + MachinePointerInfo(Load->getMemOperand()->getValue()),
> + MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
> + Load->getAlignment());
> + Loads.push_back(NewLoad.getValue(0));
> + Chains.push_back(NewLoad.getValue(1));
> }
> - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
> +
> + SDValue Ops[] = {
> + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
> + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
> + };
> +
> + return DAG.getMergeValues(Ops, SL);
> }
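
This hunk is the "scalarized vector loads weren't collecting the chains at
all" fix from the summary. Each element load now contributes its own chain,
and ISD::TokenFactor joins them into one token: anything chained after a
TokenFactor is ordered after all of its operands. In sketch form (Chains as
built in the loop above):

  // One chain per element load, merged into a single ordering token.
  SDValue AllChains = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
  // Using AllChains as the load's chain result keeps dependent stores
  // from being scheduled in between the element loads.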
>
> SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
> @@ -1115,7 +1129,13 @@
> Load->getBasePtr(),
> MemVT,
> Load->getMemOperand());
> - return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
> +
> + SDValue Ops[] = {
> + DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
> + ExtLoad32.getValue(1)
> + };
> +
> + return DAG.getMergeValues(Ops, DL);
> }
>
> if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
> @@ -1129,7 +1149,13 @@
>
> SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
> BasePtr, MVT::i8, MMO);
> - return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
> +
> + SDValue Ops[] = {
> + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
> + NewLD.getValue(1)
> + };
> +
> + return DAG.getMergeValues(Ops, DL);
> }
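
Same idea in the two hunks above: ISD::TRUNCATE and the ext node from
getExtForLoadExtType carry no chain of their own, so the chain has to be
taken from the underlying new load (value number 1) and returned alongside
the value:

  // Wrong: the truncate has no chain result, so the load's chain is lost.
  //   return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
  // Right: pair the chainless value node with the new load's chain.
  SDValue Ops[] = { DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
                    NewLD.getValue(1) };
  return DAG.getMergeValues(Ops, DL);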
>
> // Lower loads constant address space global variable loads
> @@ -1137,11 +1163,12 @@
> isa<GlobalVariable>(
> GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
>
> +
> SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
> getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
> Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
> DAG.getConstant(2, MVT::i32));
> - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
> + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
> Load->getChain(), Ptr,
> DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
> }
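
The getVTList() change is the same bug in another form: Op.getValueType()
is only result 0 of the load, while Op->getVTList() is the node's full
result list (value type plus MVT::Other), so the REGISTER_LOAD created here
gets a chain result and can replace both results of the original load. For
contrast:

  EVT ValueOnly = Op.getValueType();        // result 0 only; no chain
  SDVTList ValueAndChain = Op->getVTList(); // (value type, MVT::Other)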
> @@ -1168,10 +1195,21 @@
> EVT MemEltVT = MemVT.getScalarType();
> if (ExtType == ISD::SEXTLOAD) {
> SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
> - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
> +
> + SDValue Ops[] = {
> + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
> + Load->getChain()
> + };
> +
> + return DAG.getMergeValues(Ops, DL);
> }
>
> - return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
> + SDValue Ops[] = {
> + DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
> + Load->getChain()
> + };
> +
> + return DAG.getMergeValues(Ops, DL);
> }
>
> SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
> Index: lib/Target/R600/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/SIISelLowering.cpp
> +++ lib/Target/R600/SIISelLowering.cpp
> @@ -607,13 +607,13 @@
> Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
> (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
> Op.getValueType().getVectorNumElements() > 4))) {
> - SDValue MergedValues[2] = {
> - SplitVectorLoad(Op, DAG),
> - Load->getChain()
> - };
> - return DAG.getMergeValues(MergedValues, SDLoc(Op));
> + return SplitVectorLoad(Op, DAG);
> } else {
> - return LowerLOAD(Op, DAG);
> + SDValue Result = LowerLOAD(Op, DAG);
> + assert((!Result.getNode() ||
> + Result.getNode()->getNumValues() == 2) &&
> + "Load should return a value and a chain");
> + return Result;
> }
> }
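
The deleted SI code above was one of the reordering bugs: it paired the
value from SplitVectorLoad with the original load's chain, so the chains of
the loads SplitVectorLoad creates were never connected to anything. The
removed pattern, for contrast:

  // Buggy: stores chained after Load->getChain() are not ordered against
  // the new loads that SplitVectorLoad just built.
  SDValue MergedValues[2] = {
    SplitVectorLoad(Op, DAG),
    Load->getChain()
  };
  return DAG.getMergeValues(MergedValues, SDLoc(Op));

With SplitVectorLoad now returning a value and a chain itself, the caller
can return its result directly.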
>
> @@ -830,13 +830,9 @@
> SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
> SDLoc DL(Op);
> LoadSDNode *Load = cast<LoadSDNode>(Op);
> - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
> - SDValue MergedValues[2];
> - MergedValues[1] = Load->getChain();
> - if (Ret.getNode()) {
> - MergedValues[0] = Ret;
> - return DAG.getMergeValues(MergedValues, DL);
> - }
> + SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
> + if (Lowered.getNode())
> + return Lowered;
>
> if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
> return SDValue();
> @@ -849,25 +845,38 @@
>
> SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
> DAG.getConstant(2, MVT::i32));
> - Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> - Load->getChain(), Ptr,
> - DAG.getTargetConstant(0, MVT::i32),
> - Op.getOperand(2));
> +
> + // FIXME: REGISTER_LOAD should probably have a chain result.
> + SDValue Chain = Load->getChain();
> + SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> + Chain, Ptr,
> + DAG.getTargetConstant(0, MVT::i32),
> + Op.getOperand(2));
> +
> + SDValue Ret = LoLoad.getValue(0);
> if (MemVT.getSizeInBits() == 64) {
> + // TODO: This needs a test to make sure the right thing is happening with
> + // the chain. That is hard without general function support.
> +
> SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
> DAG.getConstant(1, MVT::i32));
>
> - SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> - Load->getChain(), IncPtr,
> - DAG.getTargetConstant(0, MVT::i32),
> - Op.getOperand(2));
> + SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> + Chain, IncPtr,
> + DAG.getTargetConstant(0, MVT::i32),
> + Op.getOperand(2));
>
> - Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
> + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
> + // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
> + // LoLoad.getValue(1), HiLoad.getValue(1));
> }
>
> - MergedValues[0] = Ret;
> - return DAG.getMergeValues(MergedValues, DL);
> + SDValue Ops[] = {
> + Ret,
> + Chain
> + };
> +
> + return DAG.getMergeValues(Ops, DL);
> }
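
The FIXME above is worth keeping in mind: AMDGPUISD::REGISTER_LOAD is still
created with only an MVT::i32 result here, so the incoming chain is
returned unchanged. Once the node grows a chain result, the commented-out
line is the intended merge:

  // Future chain handling for the 64-bit case (assumes REGISTER_LOAD
  // produces a chain as value number 1, per the FIXME):
  SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                              LoLoad.getValue(1), HiLoad.getValue(1));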
>
> SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
> Index: test/CodeGen/R600/reorder-stores.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/R600/reorder-stores.ll
> @@ -0,0 +1,106 @@
> +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
> +
> +; SI-LABEL: @no_reorder_v2f64_global_load_store
> +; SI: BUFFER_LOAD_DWORDX2
> +; SI: BUFFER_LOAD_DWORDX2
> +; SI: BUFFER_LOAD_DWORDX2
> +; SI: BUFFER_LOAD_DWORDX2
> +; SI: BUFFER_STORE_DWORDX2
> +; SI: BUFFER_STORE_DWORDX2
> +; SI: BUFFER_STORE_DWORDX2
> +; SI: BUFFER_STORE_DWORDX2
> +; SI: S_ENDPGM
> +define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
> + %tmp1 = load <2 x double> addrspace(1)* %x, align 16
> + %tmp4 = load <2 x double> addrspace(1)* %y, align 16
> + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
> + store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
> + ret void
> +}
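
The tests are all plain swaps, which makes any reordering observable. A C++
analogue of the IR above (illustrative only):

  // If the loads from y were scheduled below the stores to x, the swap
  // would read values it had just written.
  void swap_v2f64(double (&x)[2], double (&y)[2]) {
    double a0 = x[0], a1 = x[1];
    double b0 = y[0], b1 = y[1];
    x[0] = b0; x[1] = b1;
    y[0] = a0; y[1] = a1;
  }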
> +
> +; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
> +; SI: DS_READ_B64
> +; SI: DS_READ_B64
> +; SI: DS_WRITE_B64
> +; SI: DS_WRITE_B64
> +; SI: S_ENDPGM
> +define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
> + %tmp1 = load <2 x double> addrspace(3)* %x, align 16
> + %tmp4 = load <2 x double> addrspace(3)* %y, align 16
> + store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
> + store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
> + ret void
> +}
> +
> +; SI-LABEL: @no_reorder_split_v8i32_global_load_store
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +; SI: BUFFER_LOAD_DWORD
> +
> +
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: BUFFER_STORE_DWORD
> +; SI: S_ENDPGM
> +define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
> + %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
> + %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
> + store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
> + store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
> + ret void
> +}
> +
> +; SI-LABEL: @no_reorder_extload_64
> +; SI: DS_READ_B32
> +; SI: DS_READ_B32
> +; SI: DS_READ_B32
> +; SI: DS_READ_B32
> +; SI: DS_WRITE_B64
> +; SI-NOT: DS_READ
> +; SI: DS_WRITE_B64
> +; SI: S_ENDPGM
> +define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
> + %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
> + %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
> + %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
> + %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
> + %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
> + %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
> + %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
> + %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
> + store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
> + store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
> + ret void
> +}
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits