[PATCH] R600/SI: Fix 64-bit private loads.
Tom Stellard
tom at stellard.net
Mon Mar 24 07:43:32 PDT 2014
On Fri, Mar 21, 2014 at 06:05:28PM -0700, Matt Arsenault wrote:
> http://llvm-reviews.chandlerc.com/D3145
>
LGTM.
> Files:
> lib/Target/R600/SIISelLowering.cpp
> test/CodeGen/R600/indirect-private-64.ll
>
> Index: lib/Target/R600/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/SIISelLowering.cpp
> +++ lib/Target/R600/SIISelLowering.cpp
> @@ -737,12 +737,28 @@
> return SDValue();
> }
>
> + EVT MemVT = Load->getMemoryVT();
> +
> + assert(!MemVT.isVector() && "Private loads should be scalarized");
> + assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");
> +
> SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
> DAG.getConstant(2, MVT::i32));
> - Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
> + Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> Load->getChain(), Ptr,
> DAG.getTargetConstant(0, MVT::i32),
> Op.getOperand(2));
> + if (MemVT.getSizeInBits() == 64) {
> + SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
> + DAG.getConstant(1, MVT::i32));
> +
> + SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> + Load->getChain(), IncPtr,
> + DAG.getTargetConstant(0, MVT::i32),
> + Op.getOperand(2));
> +
> + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
> + }
>
> MergedValues[0] = Ret;
> return DAG.getMergeValues(MergedValues, 2, DL);
> Index: test/CodeGen/R600/indirect-private-64.ll
> ===================================================================
> --- test/CodeGen/R600/indirect-private-64.ll
> +++ test/CodeGen/R600/indirect-private-64.ll
> @@ -1,12 +1,13 @@
> -; REQUIRES: asserts
> -; XFAIL: *
> ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
>
> declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
>
> -; SI-LABEL: @indirect_access_f64_alloca:
> -; SI: BUFFER_STORE_DWORD
> -define void @f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
> +; SI-LABEL: @private_access_f64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
> %val = load double addrspace(1)* %in, align 8
> %array = alloca double, i32 16, align 8
> %ptr = getelementptr double* %array, i32 %b
> @@ -17,9 +18,16 @@
> ret void
> }
>
> -; SI-LABEL: @indirect_access_v2f64_alloca:
> -; SI: BUFFER_STORE_DWORDX4
> -define void @v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
> +; SI-LABEL: @private_access_v2f64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
> %val = load <2 x double> addrspace(1)* %in, align 16
> %array = alloca <2 x double>, i32 16, align 16
> %ptr = getelementptr <2 x double>* %array, i32 %b
> @@ -29,3 +37,39 @@
> store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
> ret void
> }
> +
> +; SI-LABEL: @private_access_i64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
> + %val = load i64 addrspace(1)* %in, align 8
> + %array = alloca i64, i32 16, align 8
> + %ptr = getelementptr i64* %array, i32 %b
> + store i64 %val, i64* %ptr, align 8
> + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
> + %result = load i64* %ptr, align 8
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; SI-LABEL: @private_access_v2i64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
> + %val = load <2 x i64> addrspace(1)* %in, align 16
> + %array = alloca <2 x i64>, i32 16, align 16
> + %ptr = getelementptr <2 x i64>* %array, i32 %b
> + store <2 x i64> %val, <2 x i64>* %ptr, align 16
> + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
> + %result = load <2 x i64>* %ptr, align 16
> + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
> + ret void
> +}
> Index: lib/Target/R600/SIISelLowering.cpp
> ===================================================================
> --- lib/Target/R600/SIISelLowering.cpp
> +++ lib/Target/R600/SIISelLowering.cpp
> @@ -737,12 +737,28 @@
> return SDValue();
> }
>
> + EVT MemVT = Load->getMemoryVT();
> +
> + assert(!MemVT.isVector() && "Private loads should be scalarized");
> + assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");
> +
> SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
> DAG.getConstant(2, MVT::i32));
> - Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
> + Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> Load->getChain(), Ptr,
> DAG.getTargetConstant(0, MVT::i32),
> Op.getOperand(2));
> + if (MemVT.getSizeInBits() == 64) {
> + SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
> + DAG.getConstant(1, MVT::i32));
> +
> + SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
> + Load->getChain(), IncPtr,
> + DAG.getTargetConstant(0, MVT::i32),
> + Op.getOperand(2));
> +
> + Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
> + }
>
> MergedValues[0] = Ret;
> return DAG.getMergeValues(MergedValues, 2, DL);
> Index: test/CodeGen/R600/indirect-private-64.ll
> ===================================================================
> --- test/CodeGen/R600/indirect-private-64.ll
> +++ test/CodeGen/R600/indirect-private-64.ll
> @@ -1,12 +1,13 @@
> -; REQUIRES: asserts
> -; XFAIL: *
> ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
>
> declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
>
> -; SI-LABEL: @indirect_access_f64_alloca:
> -; SI: BUFFER_STORE_DWORD
> -define void @f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
> +; SI-LABEL: @private_access_f64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
> %val = load double addrspace(1)* %in, align 8
> %array = alloca double, i32 16, align 8
> %ptr = getelementptr double* %array, i32 %b
> @@ -17,9 +18,16 @@
> ret void
> }
>
> -; SI-LABEL: @indirect_access_v2f64_alloca:
> -; SI: BUFFER_STORE_DWORDX4
> -define void @v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
> +; SI-LABEL: @private_access_v2f64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
> %val = load <2 x double> addrspace(1)* %in, align 16
> %array = alloca <2 x double>, i32 16, align 16
> %ptr = getelementptr <2 x double>* %array, i32 %b
> @@ -29,3 +37,39 @@
> store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
> ret void
> }
> +
> +; SI-LABEL: @private_access_i64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
> + %val = load i64 addrspace(1)* %in, align 8
> + %array = alloca i64, i32 16, align 8
> + %ptr = getelementptr i64* %array, i32 %b
> + store i64 %val, i64* %ptr, align 8
> + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
> + %result = load i64* %ptr, align 8
> + store i64 %result, i64 addrspace(1)* %out, align 8
> + ret void
> +}
> +
> +; SI-LABEL: @private_access_v2i64_alloca:
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELD_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +; SI: V_MOVRELS_B32_e32
> +define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
> + %val = load <2 x i64> addrspace(1)* %in, align 16
> + %array = alloca <2 x i64>, i32 16, align 16
> + %ptr = getelementptr <2 x i64>* %array, i32 %b
> + store <2 x i64> %val, <2 x i64>* %ptr, align 16
> + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
> + %result = load <2 x i64>* %ptr, align 16
> + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
> + ret void
> +}
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits mailing list