R600/SI Patches: Compute shader improvements
Tom Stellard
tom at stellard.net
Fri May 24 11:04:42 PDT 2013
Hi,
The attached patches improve compute shader support in the R600/SI
backend. With the associated mesa patches, it should be possible to
run bfgminer with the radeonsi gallium driver.
Please Review.
-Tom
-------------- next part --------------
>From be992b59b4206602db81738638e929ffcec356e3 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 17 May 2013 12:54:38 -0400
Subject: [PATCH 1/9] R600/SI: Rework MUBUF store instructions
The lowering of stores is now mostly handled in the tablegen files. No
more BUFFER_STORE nodes are generated during legalization.
---
lib/Target/R600/AMDGPUISelLowering.h | 1 -
lib/Target/R600/SIISelLowering.cpp | 60 ++++++++++++++++++------------------
lib/Target/R600/SIISelLowering.h | 1 -
lib/Target/R600/SIInstrInfo.td | 12 ++------
lib/Target/R600/SIInstructions.td | 39 ++++++++++++++++++++++-
5 files changed, 71 insertions(+), 42 deletions(-)
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 8a4db6f..45d585f 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -115,7 +115,6 @@ enum {
RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
- BUFFER_STORE,
DWORDADDR,
FRACT,
FMAX,
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 6bcc384..444adbe 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -25,6 +25,8 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
+const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+
using namespace llvm;
SITargetLowering::SITargetLowering(TargetMachine &TM) :
@@ -72,9 +74,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i64, Custom);
-
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
@@ -214,10 +213,38 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
+ MachineBasicBlock::iterator I = *MI;
+
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
+ case AMDGPU::SI_ADDR64_RSRC: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ unsigned SuperReg = MI->getOperand(0).getReg();
+ unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
+ .addOperand(MI->getOperand(1));
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
+ .addImm(0);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
+ .addImm(RSRC_DATA_FORMAT >> 32);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
+ .addReg(SubRegHiLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(SubRegHiHi)
+ .addImm(AMDGPU::sub1);
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
+ .addReg(SubRegLo)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SubRegHi)
+ .addImm(AMDGPU::sub2_sub3);
+ MI->eraseFromParent();
+ break;
+ }
}
return BB;
}
@@ -239,7 +266,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::STORE: return LowerSTORE(Op, DAG);
}
return SDValue();
}
@@ -338,32 +364,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
-const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-
-SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- SDValue Chain = Op.getOperand(0);
- SDValue Value = Op.getOperand(1);
- SDValue VirtualAddress = Op.getOperand(2);
- DebugLoc DL = Op.getDebugLoc();
-
- if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) {
- return SDValue();
- }
-
- SDValue Zero = DAG.getConstant(0, MVT::i64);
- SDValue Format = DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64);
- SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Zero, Format);
-
- SDValue Ops[2];
- Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain,
- Value, SrcSrc, VirtualAddress);
- Ops[1] = Chain;
-
- return DAG.getMergeValues(Ops, 2, DL);
-
-}
-
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 62dfeda..1389a1e 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,7 +24,6 @@ class SITargetLowering : public AMDGPUTargetLowering {
const SIInstrInfo * TII;
const TargetRegisterInfo * TRI;
- SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 59ab8d4..19d9de1 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -26,10 +26,6 @@ def HI32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
}]>;
-def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE",
- SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
- [SDNPHasChain, SDNPMayStore]>;
-
def IMM8bitDWORD : ImmLeaf <
i32, [{
return (Imm & ~0x3FC) == 0;
@@ -327,16 +323,14 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
ValueType VT> :
- MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr),
- name#" $vdata, $srsrc + $vaddr",
- [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc),
- (i64 VReg_64:$vaddr))]> {
+ MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+ name#" $vdata, $srsrc + $vaddr + $offset",
+ []> {
let mayLoad = 0;
let mayStore = 1;
// Encoding
- let offset = 0;
let offen = 0;
let idxen = 0;
let glc = 0;
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 03eced0..c739e2a 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -416,7 +416,10 @@ def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
>;
-//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
+
+def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+ 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32
+>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
@@ -1200,6 +1203,19 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+// This pseudo instruction takes a pointer as input and outputs a resource
+// constant that can be used with the ADDR64 MUBUF instructions.
+
+let usesCustomInserter = 1 in {
+
+def SI_ADDR64_RSRC : InstSI <
+ (outs SReg_128:$srsrc),
+ (ins SReg_64:$ptr),
+ "", []
+>;
+
+} // end usesCustomInserter
+
} // end IsCodeGenOnly, isPseudo
def : Pat<
@@ -1591,6 +1607,27 @@ defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt> {
+
+ def : Pat <
+ (global_store vt:$value, i64:$ptr),
+ (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ >;
+
+ def : Pat <
+ (global_store vt:$value, (add i64:$ptr, i64:$offset)),
+ (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32>;
+
/********** ====================== **********/
/********** Indirect adressing **********/
/********** ====================== **********/
--
1.8.1.5
-------------- next part --------------
>From 6bbac618e7c3cfdfbc67993e8053df00a4ac5226 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 7 May 2013 12:56:51 -0400
Subject: [PATCH 2/9] R600/SI: Add support for global loads
---
lib/Target/R600/AMDGPUInstructions.td | 4 +++
lib/Target/R600/SIInstrInfo.td | 9 ++++--
lib/Target/R600/SIInstructions.td | 30 +++++++++++++++++++-
test/CodeGen/R600/load.ll | 52 +++++++++++++++++++++++++++++++++--
4 files changed, 88 insertions(+), 7 deletions(-)
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 54df7d0..29df374 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -90,6 +90,10 @@ def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
+def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 19d9de1..cb159ba 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -35,9 +35,12 @@ def IMM8bitDWORD : ImmLeaf <
}]>
>;
-def IMM12bit : ImmLeaf <
- i16,
- [{return isUInt<12>(Imm);}]
+def as_i16imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
+}]>;
+
+def IMM12bit : PatLeaf <(imm),
+ [{return isUInt<12>(N->getZExtValue());}]
>;
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index c739e2a..b6db815 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -399,7 +399,7 @@ defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMA
//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
@@ -1611,6 +1611,34 @@ defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
// MUBUF Patterns
//===----------------------------------------------------------------------===//
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+ PatFrag global_ld, PatFrag constant_ld> {
+ def : Pat <
+ (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+ >;
+
+ def : Pat <
+ (vt (global_ld i64:$ptr)),
+ (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+ >;
+
+ def : Pat <
+ (vt (global_ld (add i64:$ptr, i64:$offset))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+
+ def : Pat <
+ (vt (constant_ld (add i64:$ptr, i64:$offset))),
+ (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+ >;
+}
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
+ global_load, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
+ zextloadi8_global, zextloadi8_constant>;
+
multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt> {
def : Pat <
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index b03245a..ff774ec 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
; Load an i8 value from the global address space.
-; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; R600-CHECK: @load_i8
+; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_i8
+; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
%2 = zext i8 %1 to i32
@@ -10,9 +14,51 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
ret void
}
+; load an i32 value from the global address space.
+; R600-CHECK: @load_i32
+; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI-CHECK: @load_i32
+; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = load i32 addrspace(1)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; load a f32 value from the global address space.
+; R600-CHECK: @load_f32
+; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI-CHECK: @load_f32
+; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+ %0 = load float addrspace(1)* %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; Load an i32 value from the constant address space.
+; R600-CHECK: @load_const_addrspace_i32
+; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+
+; SI-CHECK: @load_const_addrspace_i32
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
+define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+entry:
+ %0 = load i32 addrspace(2)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
; Load a f32 value from the constant address space.
-; CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}}
+; R600-CHECK: @load_const_addrspace_f32
+; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
+; SI-CHECK: @load_const_addrspace_f32
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
%1 = load float addrspace(2)* %in
store float %1, float addrspace(1)* %out
--
1.8.1.5
-------------- next part --------------
>From 4e18f017db4318e090d8fb9e84b6cb744435989c Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 19:00:07 -0400
Subject: [PATCH 3/9] R600/SI: Fixup CopyToReg register class in
PostprocessISelDAG()
The CopyToReg nodes will sometimes try to copy a value from a VGPR to an
SGPR. This kind of copy is not possible, so we need to detect
VGPR->SGPR copies and do something else. The current strategy is to
replace these copies with VGPR->VGPR copies and hope that all the users
of CopyToReg can accept VGPRs as arguments.
---
lib/Target/R600/AMDILISelDAGToDAG.cpp | 38 ++++++++++++++++++++++++++++++-----
1 file changed, 33 insertions(+), 5 deletions(-)
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index 9b0e518..83d94d3 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
#include "R600InstrInfo.h"
#include "SIISelLowering.h"
#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
@@ -649,18 +650,45 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+ if (Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX) {
+ return;
+ }
+
// Go over all selected nodes and try to fold them a bit more
const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI);
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ++I) {
- MachineSDNode *Node = dyn_cast<MachineSDNode>(I);
- if (!Node)
+ SDNode *Node = I;
+ switch (Node->getOpcode()) {
+ // Fix the register class in CopyToReg nodes - ISel will always
+ // use SReg classes for 64-bit copies, but this is not always what we want.
+ case ISD::CopyToReg: {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ SDValue Val = Node->getOperand(2);
+ const TargetRegisterClass *RC = RegInfo->getRegClass(Reg);
+ if (RC != &AMDGPU::SReg_64RegClass) {
+ continue;
+ }
+
+ if (!Val.getNode()->isMachineOpcode()) {
+ continue;
+ }
+
+ const MCInstrDesc Desc = TM.getInstrInfo()->get(Val.getNode()->getMachineOpcode());
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ RegInfo->setRegClass(Reg, TRI->getRegClass(Desc.OpInfo[0].RegClass));
+ continue;
+ }
+ }
+
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ if (!MachineNode)
continue;
- SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG);
- if (ResNode != Node)
+ SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+ if (ResNode != Node) {
ReplaceUses(Node, ResNode);
+ }
}
}
-
--
1.8.1.5
-------------- next part --------------
>From 7a7d9cdbbdf4d153f89625f997dcb361b5b93ac4 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 20 May 2013 12:32:42 -0400
Subject: [PATCH 4/9] R600/SI: Handle nodes with glue results correctly in
SITargetLowering::foldOperands()
---
lib/Target/R600/SIISelLowering.cpp | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 444adbe..d616632 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -700,6 +700,22 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
Ops.push_back(Node->getOperand(i));
+ // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
+ // this case a brand new node is always created, even if the operands
+ // are the same as before. So, manually check if anything has been changed.
+ if (Desc->Opcode == Opcode) {
+ bool Changed = false;
+ for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
+ if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
+ Changed = true;
+ break;
+ }
+ }
+ if (!Changed) {
+ return Node;
+ }
+ }
+
// Create a complete new instruction
return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
Node->getVTList(), Ops);
--
1.8.1.5
-------------- next part --------------
>From ee583443e600479b550f92ebfdcd7a98dbae5115 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 21:45:39 -0400
Subject: [PATCH 5/9] R600/SI: Handle REG_SEQUENCE in fitsRegClass()
---
lib/Target/R600/SIISelLowering.cpp | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index d616632..dfc0b79 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -523,10 +523,20 @@ bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
- if (OpClassID == -1)
- OpClass = getRegClassFor(Op.getSimpleValueType());
- else
+ if (OpClassID == -1) {
+ switch (MN->getMachineOpcode()) {
+ case AMDGPU::REG_SEQUENCE:
+ // Operand 0 is the register class id for REG_SEQUENCE instructions.
+ OpClass = TRI->getRegClass(
+ cast<ConstantSDNode>(MN->getOperand(0))->getZExtValue());
+ break;
+ default:
+ OpClass = getRegClassFor(Op.getSimpleValueType());
+ break;
+ }
+ } else {
OpClass = TRI->getRegClass(OpClassID);
+ }
} else if (Node->getOpcode() == ISD::CopyFromReg) {
RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
--
1.8.1.5
-------------- next part --------------
>From d4ec19f5674ec846ddba415428dc00f83bb11a91 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 21:48:28 -0400
Subject: [PATCH 6/9] R600/SI: Adjust some instructions' out register class
after ISel
This is necessary to avoid generating VGPR to SGPR copies in some
cases.
---
lib/Target/R600/SIISelLowering.cpp | 51 ++++++++++++++++++++++++++++++++++++++
lib/Target/R600/SIISelLowering.h | 1 +
2 files changed, 52 insertions(+)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index dfc0b79..e5fd7c8 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -810,6 +810,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
/// \brief Fold the instructions after slecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
+ Node = AdjustRegClass(Node, DAG);
if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
adjustWritemask(Node, DAG);
@@ -841,3 +842,53 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MRI.setRegClass(VReg, RC);
}
+
+MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
+ SelectionDAG &DAG) const {
+
+ DebugLoc DL = N->getDebugLoc();
+ unsigned NewOpcode = N->getMachineOpcode();
+
+ switch (N->getMachineOpcode()) {
+ default: return N;
+ case AMDGPU::REG_SEQUENCE: {
+ // MVT::i128 values only use SGPRs, so i128 REG_SEQUENCEs don't need to be
+ // rewritten.
+ if (N->getValueType(0) == MVT::i128) {
+ return N;
+ }
+ const SDValue Ops[] = {
+ DAG.getTargetConstant(AMDGPU::VReg_64RegClassID, MVT::i32),
+ N->getOperand(1) , N->getOperand(2),
+ N->getOperand(3), N->getOperand(4)
+ };
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::i64, Ops);
+ }
+
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+ // Fall-through
+ case AMDGPU::S_LOAD_DWORDX2_SGPR:
+ if (NewOpcode == N->getMachineOpcode()) {
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+ }
+ // Fall-through
+ case AMDGPU::S_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+ if (NewOpcode == N->getMachineOpcode()) {
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ }
+ if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
+ return N;
+ }
+ ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
+ SDValue Ops[] = {
+ SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
+ DAG.getConstant(0, MVT::i64)), 0),
+ N->getOperand(0),
+ DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
+ };
+ return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
+ }
+ }
+}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 1389a1e..284a6fc 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -36,6 +36,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+ MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
public:
SITargetLowering(TargetMachine &tm);
--
1.8.1.5
-------------- next part --------------
>From 6e47bb8ad6c44620b4d0768ba8ff42b8714381b0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 21:43:00 -0400
Subject: [PATCH 7/9] R600/SI: Custom lower i64 sign_extend
---
lib/Target/R600/SIISelLowering.cpp | 18 ++++++++++++++++++
lib/Target/R600/SIISelLowering.h | 1 +
test/CodeGen/R600/sign_extend.ll | 12 ++++++++++++
3 files changed, 31 insertions(+)
create mode 100644 test/CodeGen/R600/sign_extend.ll
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index e5fd7c8..127d056 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -74,6 +74,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
+
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
@@ -266,6 +268,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
}
return SDValue();
}
@@ -383,6 +386,21 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
}
+SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+
+ if (VT != MVT::i64) {
+ return SDValue();
+ }
+
+ SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
+ DAG.getConstant(31, MVT::i32));
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 284a6fc..be79404 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -25,6 +25,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
const TargetRegisterInfo * TRI;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
new file mode 100644
index 0000000..e4ef534
--- /dev/null
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -0,0 +1,12 @@
+
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+; CHECK: V_ASHR
+define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+ %0 = mul i32 %a, %b
+ %1 = add i32 %0, %c
+ %2 = sext i32 %1 to i64
+ store i64 %2, i64 addrspace(1)* %out
+ ret void
+}
--
1.8.1.5
-------------- next part --------------
>From 8c48a409739ae42f3aaf7a8edcda9e61496f7183 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 6 May 2013 22:12:41 -0400
Subject: [PATCH 8/9] R600/SI: Add a calling convention for compute shaders
---
lib/Target/R600/AMDGPUCallingConv.td | 18 +++++++++++-------
lib/Target/R600/AMDGPUISelLowering.cpp | 2 ++
lib/Target/R600/SIISelLowering.cpp | 28 ++++++++++++++++++++++++++--
test/CodeGen/R600/bfi_int.ll | 4 ++--
test/CodeGen/R600/lshl.ll | 2 +-
test/CodeGen/R600/lshr.ll | 2 +-
test/CodeGen/R600/mulhu.ll | 4 ++--
test/CodeGen/R600/rotr.ll | 4 ++--
test/CodeGen/R600/seto.ll | 2 +-
test/CodeGen/R600/setuo.ll | 2 +-
10 files changed, 49 insertions(+), 19 deletions(-)
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 9c30515..e57b5cd 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -32,17 +32,21 @@ def CC_SI : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
- ]>>>,
+ ]>>>
+
+]>;
- // This is the default for i64 values.
- // XXX: We should change this once clang understands the CC_AMDGPU.
- CCIfType<[i64], CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
- >>
+// Calling convention for SI compute kernels
+def CC_SI_Kernel : CallingConv<[
+ CCIfType<[i64], CCAssignToStack <8, 4>>,
+ CCIfType<[i32, f32], CCAssignToStack <4, 4>>,
+ CCIfType<[i16], CCAssignToStack <2, 4>>,
+ CCIfType<[i8], CCAssignToStack <1, 4>>
]>;
def CC_AMDGPU : CallingConv<[
+ CCIf<"State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
+ "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_SI_Kernel>>,
CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
"->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
]>;
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 128454c..25abe3d 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -14,9 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 127d056..a82898c 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -121,7 +121,7 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// Second split vertices into their elements
- if (Arg.VT.isVector()) {
+ if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
ISD::InputArg NewArg = Arg;
NewArg.Flags.setSplit();
NewArg.VT = Arg.VT.getVectorElementType();
@@ -153,6 +153,14 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
+ unsigned ArgReg = 0;
+ // The pointer to the list of arguments is stored in SGPR0, SGPR1
+ if (Info->ShaderType == ShaderType::COMPUTE) {
+ CCInfo.AllocateReg(AMDGPU::SGPR0);
+ CCInfo.AllocateReg(AMDGPU::SGPR1);
+ ArgReg = MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
@@ -164,10 +172,26 @@ SDValue SITargetLowering::LowerFormalArguments(
}
CCValAssign &VA = ArgLocs[ArgIdx++];
+ EVT VT = VA.getLocVT();
+
+ if (VA.isMemLoc()) {
+ assert(ArgReg);
+ PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
+ SDValue BasePtr = DAG.getCopyFromReg(DAG.getRoot(), DL,
+ ArgReg, MVT::i64);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(VA.getLocMemOffset(), MVT::i64));
+ SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), Ptr,
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ VA.getValVT(), false, false, ArgVT.getSizeInBits() >> 3);
+ InVals.push_back(Arg);
+ continue;
+ }
assert(VA.isRegLoc() && "Parameter must be in a register!");
unsigned Reg = VA.getLocReg();
- MVT VT = VA.getLocVT();
if (VT == MVT::i64) {
// For now assume it is a pointer
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index f51060f..e8790b4 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -38,8 +38,8 @@ entry:
; R600-CHECK: @bfi_sha256_ma
; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[xyzw]}}, {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: V_XOR_B32_e32 [[DST:VGPR[0-9]+]], {{VGPR[0-9]+, VGPR[0-9]+}}
-; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{VGPR[0-9]+, VGPR[0-9]+}}
+; SI-CHECK: V_XOR_B32_e64 [[DST:VGPR[0-9]+]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
+; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index fb698da..9e29b0d 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-;CHECK: V_LSHLREV_B32_e32 VGPR0, 1, VGPR0
+;CHECK: V_LSHL_B32_e64 VGPR{{[0-9]+}}, {{[SV]GPR[0-9]+}}, 1
define void @test(i32 %p) {
%i = mul i32 %p, 2
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index e0ed3ac..eab3fbf 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-;CHECK: V_LSHRREV_B32_e32 VGPR0, 1, VGPR0
+;CHECK: V_LSHR_B32_e64 {{VGPR[0-9]+}}, {{[SV]GPR[0-9]+}}, 1
define void @test(i32 %p) {
%i = udiv i32 %p, 2
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index bc17a59..eb379d1 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-;CHECK: V_MOV_B32_e32 VGPR1, -1431655765
-;CHECK-NEXT: V_MUL_HI_U32 VGPR0, VGPR0, VGPR1, 0, 0, 0, 0, 0
+;CHECK: V_MOV_B32_e32 VGPR{{[0-9]+}}, -1431655765
+;CHECK: V_MUL_HI_U32 VGPR0, {{[SV]GPR[0-9]+}}, {{VGPR[0-9]+}}
;CHECK-NEXT: V_LSHRREV_B32_e32 VGPR0, 1, VGPR0
define void @test(i32 %p) {
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index ff4da41..14af409 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -22,8 +22,8 @@ entry:
; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PV.[xyzw]}}
; SI-CHECK: @rotl
-; SI-CHECK: V_SUB_I32_e32 [[DST:VGPR[0-9]+]], 32, {{VGPR[0-9]+}}
-; SI-CHECK: V_ALIGNBIT_B32 {{VGPR[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}, [[DST]]
+; SI-CHECK: V_SUB_I32_e64 [[DST:VGPR[0-9]+]], 32, {{[SV]GPR[0-9]+}}
+; SI-CHECK: V_ALIGNBIT_B32 {{VGPR[0-9]+, [SV]GPR[0-9]+, VGPR[0-9]+}}, [[DST]]
define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
%0 = shl i32 %x, %y
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index 4622203..19716f8 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-;CHECK: V_CMP_O_F32_e64 SGPR0_SGPR1, VGPR0, VGPR0, 0, 0, 0, 0
+;CHECK: V_CMP_O_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
define void @main(float %p) {
main_body:
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index 0bf5801..929dbb1 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-;CHECK: V_CMP_U_F32_e64 SGPR0_SGPR1, VGPR0, VGPR0, 0, 0, 0, 0
+;CHECK: V_CMP_U_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
define void @main(float %p) {
main_body:
--
1.8.1.5
-------------- next part --------------
>From 3da9d65178e126e4ac14071c78e6fe63f769d1e8 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 13 May 2013 21:51:56 -0400
Subject: [PATCH 9/9] R600/SI: Add support for work item and work group
intrinsics
---
lib/Target/R600/AMDGPUISelLowering.h | 5 +-
lib/Target/R600/SIISelLowering.cpp | 94 +++++++++++++++++++++++++++++++-----
lib/Target/R600/SIISelLowering.h | 4 ++
3 files changed, 88 insertions(+), 15 deletions(-)
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 45d585f..9617725 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -33,8 +33,9 @@ protected:
/// MachineFunction.
///
/// \returns a RegisterSDNode representing Reg.
- SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index a82898c..a586743 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -76,6 +76,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
@@ -83,6 +85,23 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setSchedulingPreference(Sched::RegPressure);
}
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
+ DebugLoc DL, SDValue Chain,
+ unsigned Offset) const {
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
+ SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
+ MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(Offset, MVT::i64));
+ return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ VT, false, false, ArgVT.getSizeInBits() >> 3);
+
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain,
CallingConv::ID CallConv,
@@ -153,12 +172,11 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- unsigned ArgReg = 0;
// The pointer to the list of arguments is stored in SGPR0, SGPR1
if (Info->ShaderType == ShaderType::COMPUTE) {
CCInfo.AllocateReg(AMDGPU::SGPR0);
CCInfo.AllocateReg(AMDGPU::SGPR1);
- ArgReg = MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+ MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
}
AnalyzeFormalArguments(CCInfo, Splits);
@@ -175,17 +193,10 @@ SDValue SITargetLowering::LowerFormalArguments(
EVT VT = VA.getLocVT();
if (VA.isMemLoc()) {
- assert(ArgReg);
- PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_ADDRESS);
- EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
- SDValue BasePtr = DAG.getCopyFromReg(DAG.getRoot(), DL,
- ArgReg, MVT::i64);
- SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
- DAG.getConstant(VA.getLocMemOffset(), MVT::i64));
- SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(), Ptr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- VA.getValVT(), false, false, ArgVT.getSizeInBits() >> 3);
+ // The first 36 bytes of the input buffer contain information about
+ // thread group and global sizes.
+ SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+ 36 + VA.getLocMemOffset());
InVals.push_back(Arg);
continue;
}
@@ -293,6 +304,54 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+ //XXX: Hardcoded; we only use two user SGPRs to store the pointer to the parameters.
+ unsigned NumUserSGPRs = 2;
+ switch (IntrinsicID) {
+ default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case Intrinsic::r600_read_ngroups_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+ case Intrinsic::r600_read_ngroups_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+ case Intrinsic::r600_read_ngroups_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+ case Intrinsic::r600_read_global_size_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+ case Intrinsic::r600_read_global_size_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+ case Intrinsic::r600_read_global_size_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+ case Intrinsic::r600_read_local_size_x:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+ case Intrinsic::r600_read_local_size_y:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+ case Intrinsic::r600_read_local_size_z:
+ return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+ case Intrinsic::r600_read_tgid_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+ case Intrinsic::r600_read_tgid_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+ case Intrinsic::r600_read_tgid_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+ AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+ case Intrinsic::r600_read_tidig_x:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR0, VT);
+ case Intrinsic::r600_read_tidig_y:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR1, VT);
+ case Intrinsic::r600_read_tidig_z:
+ return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+ AMDGPU::VGPR2, VT);
+
+ }
+ }
}
return SDValue();
}
@@ -934,3 +993,12 @@ MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
}
}
}
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), DebugLoc(),
+ cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index be79404..002e48c 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,6 +24,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
const SIInstrInfo * TII;
const TargetRegisterInfo * TRI;
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, DebugLoc DL,
+ SDValue Chain, unsigned Offset) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -59,6 +61,8 @@ public:
SDNode *Node) const;
int32_t analyzeImmediate(const SDNode *N) const;
+ SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const;
};
} // End namespace llvm
--
1.8.1.5
More information about the llvm-commits
mailing list