PATCH: R600 OpenCV fixes
Tom Stellard
tom at stellard.net
Thu Dec 19 21:01:39 PST 2013
Hi,
These patches fix a few bugs uncovered by the OpenCV test suite.
Please review.
-Tom
-------------- next part --------------
From d80137396c6285ad315ca146d655d91e499d535e Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 11 Dec 2013 17:09:34 -0500
Subject: [PATCH 1/5] R600/SI: Add support for i8 and i16 private loads/stores
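Private memory is register-backed on R600/SI, so there is no byte
addressing: i8 and i16 accesses are emulated on the containing 32-bit
word. Extending loads shift and mask the dword; truncating stores do a
read-modify-write of it. Roughly, as a standalone C++ sketch of the
emitted arithmetic (illustrative names only, not the actual DAG code):

    #include <cstdint>

    // Extending i8/i16 load at byte address Addr; Mask is 0xff or 0xffff.
    uint32_t loadSubDword(const uint32_t *Regs, uint32_t Addr,
                          uint32_t Mask) {
      uint32_t Word = Regs[Addr >> 2];      // REGISTER_LOAD of the dword
      uint32_t ShiftAmt = (Addr & 3) * 8;   // byte index -> bit index
      uint32_t Val = (Word >> ShiftAmt) & Mask;
      // A SEXTLOAD additionally shifts left, then arithmetic-shifts
      // right, to sign-extend the selected byte or short.
      return Val;
    }

    // Truncating i8/i16 store: read-modify-write of the containing dword.
    void storeSubDword(uint32_t *Regs, uint32_t Addr, uint32_t Val,
                       uint32_t Mask) {
      uint32_t ShiftAmt = (Addr & 3) * 8;
      uint32_t Word = Regs[Addr >> 2];      // REGISTER_LOAD
      Word &= ~(Mask << ShiftAmt);          // clear the destination bits
      Word |= (Val & Mask) << ShiftAmt;     // insert the new value
      Regs[Addr >> 2] = Word;               // REGISTER_STORE
    }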
---
lib/Target/R600/AMDGPUISelLowering.cpp | 78 ++++++++++++++++++++++++++++++++++
lib/Target/R600/AMDGPUISelLowering.h | 1 +
lib/Target/R600/R600ISelLowering.cpp | 13 ++++++
lib/Target/R600/SIISelLowering.cpp | 60 +++++++++++++++++++++-----
test/CodeGen/R600/extload.ll | 14 +++---
test/CodeGen/R600/private-memory.ll | 59 +++++++++++++++++++++----
6 files changed, 200 insertions(+), 25 deletions(-)
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index d7ee8e6..1d1701f 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -587,18 +587,96 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
}
+SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+
+ if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+ ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
+ return SDValue();
+
+
+ EVT VT = Op.getValueType();
+ EVT MemVT = Load->getMemoryVT();
+ unsigned Mask = 0;
+ if (Load->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Load->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
+ DAG.getConstant(2, MVT::i32));
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
+ Load->getBasePtr(),
+ DAG.getConstant(0x3, MVT::i32));
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+ Ret = DAG.getNode(ISD::AND, DL, MVT::i32, Ret,
+ DAG.getConstant(Mask, MVT::i32));
+ if (ExtType == ISD::SEXTLOAD) {
+ SDValue SExtShift = DAG.getConstant(
+ VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+ Ret = DAG.getNode(ISD::SHL, DL, MVT::i32, Ret, SExtShift);
+ Ret = DAG.getNode(ISD::SRA, DL, MVT::i32, Ret, SExtShift);
+ }
+
+ return Ret;
+}
+
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
if (Result.getNode()) {
return Result;
}
StoreSDNode *Store = cast<StoreSDNode>(Op);
+ SDValue Chain = Store->getChain();
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
return SplitVectorStore(Op, DAG);
}
+
+ if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+ Store->getMemoryVT().bitsLT(MVT::i32)) {
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+ SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, TruncPtr,
+ DAG.getConstant(0x3, MVT::i32));
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+ SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+ Store->getValue());
+ SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, SExtValue,
+ DAG.getConstant(Mask, MVT::i32));
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32),
+ ShiftAmt);
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+ SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+ return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
+ }
return SDValue();
}
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2dfd3cf..fd6e3a5 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -54,6 +54,7 @@ protected:
/// \brief Split a vector load into multiple scalar loads.
SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 8d71919..03feabe 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -1113,6 +1113,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+ SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+ if (Ret.getNode()) {
+ return Ret;
+ }
// Lowering for indirect addressing
const MachineFunction &MF = DAG.getMachineFunction();
@@ -1204,6 +1208,15 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
+ SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Ret.getNode()) {
+ SDValue Ops[2];
+ Ops[0] = Ret;
+ Ops[1] = Chain;
+ return DAG.getMergeValues(Ops, 2, DL);
+ }
+
+
if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 4fb8444..9430689 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -125,11 +125,17 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i128, MVT::i64, Expand);
@@ -700,21 +706,26 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ SDValue MergedValues[2];
+ MergedValues[1] = Load->getChain();
+ if (Ret.getNode()) {
+ MergedValues[0] = Ret;
+ return DAG.getMergeValues(MergedValues, 2, DL);
+ }
- if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
return SDValue();
+ }
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
+ Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
- SDValue MergedValues[2] = {
- Ret,
- Load->getChain()
- };
+ MergedValues[0] = Ret;
return DAG.getMergeValues(MergedValues, 2, DL);
}
@@ -796,7 +807,34 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Store->getChain();
SmallVector<SDValue, 8> Values;
- if (VT == MVT::i64) {
+ if (Store->isTruncatingStore()) {
+ unsigned Mask = 0;
+ if (Store->getMemoryVT() == MVT::i8) {
+ Mask = 0xff;
+ } else if (Store->getMemoryVT() == MVT::i16) {
+ Mask = 0xffff;
+ }
+ SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Store->getBasePtr(),
+ DAG.getConstant(0, MVT::i32));
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
+ DAG.getConstant(0x3, MVT::i32));
+ SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+ DAG.getConstant(3, MVT::i32));
+ SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
+ DAG.getConstant(Mask, MVT::i32));
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ MaskedValue, ShiftAmt);
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(Mask, MVT::i32), ShiftAmt);
+ // Clear the destination bits so the shifted value can be inserted.
+ DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+ DAG.getConstant(0xffffffff, MVT::i32));
+ Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+
+ Values.push_back(Dst);
+ } else if (VT == MVT::i64) {
for (unsigned i = 0; i < 2; ++i) {
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
Store->getValue(), DAG.getConstant(i, MVT::i32)));
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index aa660b3..f78cdc4 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll
@@ -2,7 +2,7 @@
; EG-LABEL: @anyext_load_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
@@ -14,8 +14,9 @@ define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspac
; EG-LABEL: @anyext_load_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
%load = load i32 addrspace(1)* %cast, align 1
@@ -27,7 +28,7 @@ define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrs
; EG-LABEL: @anyext_load_lds_i8:
; EG: AND_INT
-; EG-NEXT: 255
+; EG: 255
define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
@@ -39,8 +40,9 @@ define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addr
; EG-LABEL: @anyext_load_lds_i16:
; EG: AND_INT
-; EG: LSHL
-; EG: 65535
+; EG: AND_INT
+; EG-DAG: 65535
+; EG-DAG: -65536
define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
%cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
%load = load i32 addrspace(3)* %cast, align 1
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 48a013c..3fd67d7 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
-; R600-CHECK-LABEL: @mova_same_clause
+; FUNC-LABEL: @mova_same_clause
+
; R600-CHECK: MOVA_INT
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
@@ -12,7 +13,6 @@
; R600-CHECK-NOT: ALU clause
; R600-CHECK: 0 + AR.x
-; SI-CHECK-LABEL: @mova_same_clause
; SI-CHECK: V_READFIRSTLANE
; SI-CHECK: V_MOVRELD
; SI-CHECK: S_CBRANCH
@@ -46,9 +46,8 @@ entry:
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
-; R600-CHECK-LABEL: @multiple_structs
+; FUNC-LABEL: @multiple_structs
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @multiple_structs
; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
@@ -77,9 +76,8 @@ entry:
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.
-; R600-CHECK-LABLE: @direct_loop
+; FUNC-LABEL: @direct_loop
; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-LABEL: @direct_loop
; SI-CHECK-NOT: V_MOVREL
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -113,3 +111,48 @@ for.end:
store i32 %value, i32 addrspace(1)* %out
ret void
}
+
+; FUNC-LABEL: @short_array
+
+; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
+; R600-CHECK: 65536
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i16]
+ %1 = getelementptr [2 x i16]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i16]* %0, i32 0, i32 1
+ store i16 0, i16* %1
+ store i16 1, i16* %2
+ %3 = getelementptr [2 x i16]* %0, i32 0, i32 %index
+ %4 = load i16* %3
+ %5 = sext i16 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @char_array
+
+; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
+; R600-CHECK: 256
+; R600-CHECK: MOVA_INT
+
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
+; SI-CHECK: V_MOVRELS_B32_e32
+define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [2 x i8]
+ %1 = getelementptr [2 x i8]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i8]* %0, i32 0, i32 1
+ store i8 0, i8* %1
+ store i8 1, i8* %2
+ %3 = getelementptr [2 x i8]* %0, i32 0, i32 %index
+ %4 = load i8* %3
+ %5 = sext i8 %4 to i32
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+
+}
--
1.8.1.4
-------------- next part --------------
From fef01503df7ff9868d64014058727ce381d96204 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 17 Dec 2013 13:06:34 -0800
Subject: [PATCH 2/5] R600: Begin private memory at the second GPR.
This way private memory does not overwrite work group information
stored in GPRs 0 and 1.
---
lib/Target/R600/AMDGPUFrameLowering.cpp | 5 ++++-
test/CodeGen/R600/private-memory.ll | 25 +++++++++++++++++++++++++
2 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 40f14d2..40cc908 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -74,7 +74,10 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
int FI) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
- unsigned Offset = 0;
+ // Start the offset at 2 so we don't overwrite work group information.
+ // XXX: We should only do this when the shader actually uses this
+ // information.
+ unsigned Offset = 2;
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 3fd67d7..b25fc7b 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -156,3 +156,28 @@ entry:
ret void
}
+
+; Make sure we don't overwrite work-item information with private memory
+
+; FUNC-LABEL: @work_item_info
+; R600-CHECK-NOT: MOV T0.X
+; Additional check in case the move ends up in the last slot
+; R600-CHECK-NOT: MOV * T0.X
+
+; SI-CHECK-NOT: V_MOV_B32_e{{(32|64)}} v0
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [2 x i32]
+ %1 = getelementptr [2 x i32]* %0, i32 0, i32 0
+ %2 = getelementptr [2 x i32]* %0, i32 0, i32 1
+ store i32 0, i32* %1
+ store i32 1, i32* %2
+ %3 = getelementptr [2 x i32]* %0, i32 0, i32 %in
+ %4 = load i32* %3
+ %5 = call i32 @llvm.r600.read.tidig.x()
+ %6 = add i32 %4, %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
--
1.8.1.4
-------------- next part --------------
From aefcc2b133f3da7dd7a2b823a463b2b2da42407c Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 17 Dec 2013 13:08:32 -0800
Subject: [PATCH 3/5] R600: Add support for global addresses with constant
initializers
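Constant-address-space globals are lowered by materializing their
initializer into a private-memory stack object and pointing loads at it.
In effect (a C++ analogy of the resulting code, using the @float_gv test
case below; this is not what the backend literally emits):

    // What a load from @float_gv behaves like after this lowering:
    float readFloatGv(unsigned Index) {
      // The initializer is stored element by element into private
      // (stack) memory, chained ahead of any pre-existing loads...
      float Table[5] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f};
      // ...and the constant-address load becomes an indirect private
      // read (MOVA_INT on R600).
      return Table[Index];
    }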
---
lib/Target/R600/AMDGPUISelLowering.cpp | 118 ++++++++++++++++++++++++++++----
lib/Target/R600/AMDGPUISelLowering.h | 4 ++
lib/Target/R600/SIISelLowering.cpp | 1 +
test/CodeGen/R600/gv-const-addrspace.ll | 41 +++++++++++
4 files changed, 149 insertions(+), 15 deletions(-)
create mode 100644 test/CodeGen/R600/gv-const-addrspace.ll
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 1d1701f..8ce4e44 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -21,6 +21,7 @@
#include "AMDILIntrinsicInfo.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -274,32 +275,106 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return Op;
}
+SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
+ const GlobalValue *GV,
+ const SDValue &InitPtr,
+ SDValue Chain,
+ SelectionDAG &DAG) const {
+ const DataLayout *TD = getTargetMachine().getDataLayout();
+ SDLoc DL(InitPtr);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
+ EVT VT = EVT::getEVT(CI->getType());
+ PointerType *PtrTy = PointerType::get(CI->getType(), 0);
+ return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(CI->getType()));
+ } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+ EVT VT = EVT::getEVT(CFP->getType());
+ PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
+ return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
+ TD->getPrefTypeAlignment(CFP->getType()));
+ } else if (Init->getType()->isAggregateType()) {
+ EVT PtrVT = InitPtr.getValueType();
+ unsigned NumElements = Init->getType()->getArrayNumElements();
+ SmallVector<SDValue, 8> Chains;
+ for (unsigned i = 0; i < NumElements; ++i) {
+ SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
+ Init->getType()->getArrayElementType()), PtrVT);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+ Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
+ GV, Ptr, Chain, DAG));
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
+ Chains.size());
+ } else {
+ Init->dump();
+ llvm_unreachable("Unhandled constant initializer");
+ }
+}
+
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
SDValue Op,
SelectionDAG &DAG) const {
const DataLayout *TD = getTargetMachine().getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = G->getGlobal();
- assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
- // XXX: What does the value of G->getOffset() mean?
- assert(G->getOffset() == 0 &&
+ switch (G->getAddressSpace()) {
+ default: llvm_unreachable("Global Address lowering not implemented for this "
+ "address space");
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ // XXX: What does the value of G->getOffset() mean?
+ assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
- const GlobalValue *GV = G->getGlobal();
+ unsigned Offset;
+ if (MFI->LocalMemoryObjects.count(GV) == 0) {
+ uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ Offset = MFI->LDSSize;
+ MFI->LocalMemoryObjects[GV] = Offset;
+ // XXX: Account for alignment?
+ MFI->LDSSize += Size;
+ } else {
+ Offset = MFI->LocalMemoryObjects[GV];
+ }
- unsigned Offset;
- if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
- Offset = MFI->LDSSize;
- MFI->LocalMemoryObjects[GV] = Offset;
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
- } else {
- Offset = MFI->LocalMemoryObjects[GV];
+ return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ Type *EltType = GV->getType()->getElementType();
+ unsigned Size = TD->getTypeAllocSize(EltType);
+ unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+
+ const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+ const Constant *Init = Var->getInitializer();
+ int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
+ SDValue InitPtr = DAG.getFrameIndex(FI,
+ getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ SmallVector<SDNode*, 8> WorkList;
+
+ for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
+ E = DAG.getEntryNode()->use_end(); I != E; ++I) {
+ if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
+ continue;
+ WorkList.push_back(*I);
+ }
+ SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
+ for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
+ Ops.push_back((*I)->getOperand(i));
+ }
+ DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
+ }
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
+ getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+ }
}
-
- return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
}
void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
@@ -592,6 +667,19 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
+ // Lower loads from constant address space global variables
+ if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {
+
+ SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
+ getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+ Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+ DAG.getConstant(2, MVT::i32));
+ return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+ }
+
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
return SDValue();
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index fd6e3a5..9782b5e 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -28,6 +28,10 @@ private:
void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Args,
unsigned Start, unsigned Count) const;
+ SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
+ const SDValue &InitPtr,
+ SDValue Chain,
+ SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 9430689..a66f289 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -143,6 +143,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setTargetDAGCombine(ISD::SELECT_CC);
diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
new file mode 100644
index 0000000..cda7ab1
--- /dev/null
+++ b/test/CodeGen/R600/gv-const-addrspace.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+
+; XXX: Test on SI once 64-bit adds are supported.
+
+ at float_gv = internal addrspace(2) unnamed_addr constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
+
+; FUNC-LABEL: @float
+
+; R600-DAG: MOV {{\** *}}T2.X
+; R600-DAG: MOV {{\** *}}T3.X
+; R600-DAG: MOV {{\** *}}T4.X
+; R600-DAG: MOV {{\** *}}T5.X
+; R600-DAG: MOV {{\** *}}T6.X
+; R600: MOVA_INT
+
+define void @float(float addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
+ %1 = load float addrspace(2)* %0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+ at i32_gv = internal addrspace(2) unnamed_addr constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
+
+; FUNC-LABEL: @i32
+
+; R600-DAG: MOV {{\** *}}T2.X
+; R600-DAG: MOV {{\** *}}T3.X
+; R600-DAG: MOV {{\** *}}T4.X
+; R600-DAG: MOV {{\** *}}T5.X
+; R600-DAG: MOV {{\** *}}T6.X
+; R600: MOVA_INT
+
+define void @i32(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
+ %1 = load i32 addrspace(2)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
--
1.8.1.4
-------------- next part --------------
From f59520423f341788ce8a6bdeae5580bc53072b85 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 17 Dec 2013 20:39:23 -0800
Subject: [PATCH 4/5] R600: Take alignment into account when calculating the
stack offset
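Frame objects are now laid out in bytes: each object's start is rounded
up to its alignment, the running offset is re-rounded to 4 bytes so two
objects never share a register, and the result is converted back to a
register index. A simplified standalone C++ sketch of the new arithmetic
(hypothetical helper, for illustration only):

    // RoundUpToAlignment lives in llvm/Support/MathExtras.h; reproduced
    // here so the sketch is self-contained.
    static unsigned roundUp(unsigned Value, unsigned Align) {
      return (Value + Align - 1) / Align * Align;
    }

    // Byte-accurate register index for frame object FI.
    unsigned frameIndexOffset(const unsigned *Sizes, const unsigned *Aligns,
                              unsigned FI, unsigned StackWidth) {
      unsigned RegBytes = StackWidth * 4;   // bytes per register "row"
      unsigned OffsetBytes = 2 * RegBytes;  // rows 0-1 keep work group info
      for (unsigned i = 0; i < FI; ++i) {
        OffsetBytes = roundUp(OffsetBytes, Aligns[i]);
        OffsetBytes += Sizes[i];
        // Two frame objects must not share a register, so always realign
        // the running offset to 4 bytes.
        OffsetBytes = roundUp(OffsetBytes, 4);
      }
      OffsetBytes = roundUp(OffsetBytes, Aligns[FI]);
      return OffsetBytes / RegBytes;        // register index, not bytes
    }

With the two allocas from the new @no_overlap test (sizes 3 and 2 bytes,
align 1, stack width 1) the first object lands at register 2 and the
second at register 3, matching the T3.X / v3 checks.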
---
lib/Target/R600/AMDGPUFrameLowering.cpp | 15 +++++++++++----
test/CodeGen/R600/private-memory.ll | 31 +++++++++++++++++++++++++++++++
2 files changed, 42 insertions(+), 4 deletions(-)
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 40cc908..0325a00 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -77,14 +77,21 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
// Start the offset at 2 so we don't overwrite work group information.
// XXX: We should only do this when the shader actually uses this
// information.
- unsigned Offset = 2;
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- unsigned Size = MFI->getObjectSize(i);
- Offset += (Size / (getStackWidth(MF) * 4));
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i));
+ OffsetBytes += MFI->getObjectSize(i);
+ // Each register holds 4 bytes, so we must always align the offset to at
+ // least 4 bytes, so that 2 frame objects won't share the same register.
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, 4);
}
- return Offset;
+
+ if (FI != -1)
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI));
+
+ return OffsetBytes / (getStackWidth(MF) * 4);
}
const TargetFrameLowering::SpillSlot *
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index b25fc7b..1bd17bf 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -180,4 +180,35 @@ entry:
ret void
}
+; Test that two stack objects are not stored in the same register
+; The second stack object should be in T3.X
+; FUNC-LABEL: @no_overlap
+; R600-CHECK: MOV {{\** *}}T3.X
+; SI-CHECK: V_MOV_B32_e32 v3
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+entry:
+ %0 = alloca [3 x i8], align 1
+ %1 = alloca [2 x i8], align 1
+ %2 = getelementptr [3 x i8]* %0, i32 0, i32 0
+ %3 = getelementptr [3 x i8]* %0, i32 0, i32 1
+ %4 = getelementptr [3 x i8]* %0, i32 0, i32 2
+ %5 = getelementptr [2 x i8]* %1, i32 0, i32 0
+ %6 = getelementptr [2 x i8]* %1, i32 0, i32 1
+ store i8 0, i8* %2
+ store i8 1, i8* %3
+ store i8 2, i8* %4
+ store i8 1, i8* %5
+ store i8 0, i8* %6
+ %7 = getelementptr [3 x i8]* %0, i32 0, i32 %in
+ %8 = getelementptr [2 x i8]* %1, i32 0, i32 %in
+ %9 = load i8* %7
+ %10 = load i8* %8
+ %11 = add i8 %9, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ ret void
+}
+
+
+
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
--
1.8.1.4
-------------- next part --------------
From 7b7ab610be8e5f0c25aecfde716edd22f9ea64d7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 17 Dec 2013 20:40:08 -0800
Subject: [PATCH 5/5] R600: MOVA is vector only
---
lib/Target/R600/R600Instructions.td | 2 +-
test/CodeGen/R600/private-memory.ll | 4 +++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 26dfc28..2b2794f 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1553,7 +1553,7 @@ let Predicates = [isEGorCayman] in {
defm CUBE_eg : CUBE_Common<0xC0>;
let hasSideEffects = 1 in {
- def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>;
+ def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
}
def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 1bd17bf..e22c718 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -116,6 +116,7 @@ for.end:
; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
; R600-CHECK: 65536
+; R600-CHECK: *
; R600-CHECK: MOVA_INT
; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 65536
@@ -138,7 +139,8 @@ entry:
; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
; R600-CHECK: 256
-; R600-CHECK: MOVA_INT
+; R600-CHECK: *
+; R600-CHECK-NEXT: MOVA_INT
; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 256
; SI-CHECK: V_MOVRELS_B32_e32
--
1.8.1.4