R600 Patches (Plus one for SI): Various fixes to get the piglit all_cl tests to stop hanging
Tom Stellard
tom at stellard.net
Fri Apr 12 13:52:11 PDT 2013
Hi,
Attached are several patches to get the piglit test suite to complete a
full run with all_cl.tests without locking up on my Juniper.
Please Review.
-Tom
-------------- next part --------------
>From dbcfd9c4ae491745ece761d97a3495bde0e54660 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov <dcherkassov at gmail.com>
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/8] R600: Add 64-bit float load/store support
* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions
Signed-off-by: Dmitry Cherkassov <dcherkassov at gmail.com>
Tom Stellard:
- Mark vec2 operations as expand. The addition of a vec2 register
class made them all legal.
---
lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++
lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 ++
lib/Target/R600/R600ISelLowering.cpp | 17 +++++++++
lib/Target/R600/R600InstrInfo.cpp | 19 ++++++----
lib/Target/R600/R600Instructions.td | 44 ++++++++++++++++++++++
lib/Target/R600/R600RegisterInfo.td | 16 ++++++++
test/CodeGen/R600/64bit-kernel-args.ll | 41 ++++++++++++++++++++
test/CodeGen/R600/fadd.ll | 10 +++++
test/CodeGen/R600/fdiv.ll | 37 +++++++++++++-----
test/CodeGen/R600/fmul.ll | 10 +++++
test/CodeGen/R600/fp_to_sint.ll | 10 +++++
test/CodeGen/R600/fp_to_uint.ll | 10 +++++
test/CodeGen/R600/fsub.ll | 20 +++++++---
test/CodeGen/R600/setcc.ll | 18 +++++++--
test/CodeGen/R600/sint_to_fp.ll | 10 +++++
test/CodeGen/R600/udiv.ll | 20 +++++++---
test/CodeGen/R600/uint_to_fp.ll | 10 +++++
test/CodeGen/R600/urem.ll | 21 ++++++++---
18 files changed, 283 insertions(+), 39 deletions(-)
create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+ setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 469a8ad..e529f76 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -152,6 +152,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
EmitByte(INSTR_NATIVE, OS);
@@ -162,9 +163,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case AMDGPU::VTX_READ_PARAM_8_eg:
case AMDGPU::VTX_READ_PARAM_16_eg:
case AMDGPU::VTX_READ_PARAM_32_eg:
+ case AMDGPU::VTX_READ_PARAM_64_eg:
case AMDGPU::VTX_READ_PARAM_128_eg:
case AMDGPU::VTX_READ_GLOBAL_8_eg:
case AMDGPU::VTX_READ_GLOBAL_32_eg:
+ case AMDGPU::VTX_READ_GLOBAL_64_eg:
case AMDGPU::VTX_READ_GLOBAL_128_eg:
case AMDGPU::TEX_VTX_CONSTBUF:
case AMDGPU::TEX_VTX_TEXBUF : {
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 53e6e51..665909d 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -32,22 +32,38 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+ addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+ addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
computeRegisterProperties();
setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+ setOperationAction(ISD::FADD, MVT::v2f32, Expand);
setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+ setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+ setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
setOperationAction(ISD::ADD, MVT::v4i32, Expand);
+ setOperationAction(ISD::ADD, MVT::v2i32, Expand);
setOperationAction(ISD::AND, MVT::v4i32, Expand);
+ setOperationAction(ISD::AND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+ setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+ setOperationAction(ISD::UREM, MVT::v2i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -157,6 +173,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index b232188..3603401 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -49,9 +49,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
- if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
- for (unsigned I = 0; I < 4; I++) {
+ unsigned VectorComponents = 0;
+ if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+ VectorComponents = 4;
+ } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+ AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+ VectorComponents = 2;
+ }
+
+ if (VectorComponents > 0) {
+ for (unsigned I = 0; I < VectorComponents; I++) {
unsigned SubRegIndex = RI.getSubRegFromChannel(I);
buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
RI.getSubReg(DestReg, SubRegIndex),
@@ -60,11 +68,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
RegState::Define | RegState::Implicit);
}
} else {
-
- // We can't copy vec4 registers
- assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
- && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
DestReg, SrcReg);
NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8ede6cc..00f674f 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1719,6 +1719,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
[(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
>;
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+ (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+ 0x3, "RAT_WRITE_CACHELESS_64_eg",
+ [(global_store (v2i32 R600_Reg64:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
//128-bit store
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1824,6 +1831,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
let Constraints = "$ptr.ptr = $dst";
}
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+ : VTX_READ_eg <"VTX_READ_64", buffer_id, (outs R600_Reg64:$dst),
+ pattern> {
+
+ let MEGA_FETCH_COUNT = 8;
+ let DST_SEL_X = 0;
+ let DST_SEL_Y = 1;
+ let DST_SEL_Z = 7;
+ let DST_SEL_W = 7;
+ let DATA_FORMAT = 0x1D; // COLOR_32_32
+}
+
class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
: VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
pattern> {
@@ -1857,6 +1876,11 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
[(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
>;
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+ [(set (v2i32 R600_Reg64:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+
def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
[(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
>;
@@ -1875,6 +1899,12 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
[(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
>;
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+ [(set (v2i32 R600_Reg64:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+
// 128-bit reads
def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
[(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
@@ -2353,10 +2383,24 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 0, sub0>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, R600_Reg64, 0, sub0>;
+def : Extract_Element <i32, v2i32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 1, sub1>;
+
// bitconvert patterns
def : BitConvert <i32, f32, R600_Reg32>;
def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
def : BitConvert <v4f32, v4i32, R600_Reg128>;
def : BitConvert <v4i32, v4f32, R600_Reg128>;
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 03f4976..33593bc 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
let HWEncoding = encoding;
}
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+ RegisterWithSubRegs<n, subregs> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [sub0, sub1];
+ let HWEncoding = encoding;
+}
+
+
foreach Index = 0-127 in {
foreach Chan = [ "X", "Y", "Z", "W" ] in {
// 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
!cast<Register>("T"#Index#"_Z"),
!cast<Register>("T"#Index#"_W")],
Index>;
+
+ def T#Index#_XY : R600Reg_64 <"T"#Index#".XY",
+ [!cast<Register>("T"#Index#"_X"),
+ !cast<Register>("T"#Index#"_Y")],
+ Index>;
}
// KCACHE_BANK0
@@ -178,6 +191,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
let CopyCost = -1;
}
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+ (add (sequence "T%u_XY", 0, 63))>;
+
//===----------------------------------------------------------------------===//
// Register classes for indirect addressing
//===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
new file mode 100644
index 0000000..6f03b68
--- /dev/null
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -0,0 +1,41 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @v2i32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_extract_store(i32 addrspace(1)* nocapture %out, <2 x i32> %in) {
+entry:
+ %0 = extractelement <2 x i32> %in, i32 0
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ %1 = extractelement <2 x i32> %in, i32 1
+ %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+ store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+ ret void
+}
+
+; CHECK: @v2f32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_extract_store(float addrspace(1)* nocapture %out, <2 x float> %in) {
+entry:
+ %0 = extractelement <2 x float> %in, i32 0
+ store float %0, float addrspace(1)* %out, align 4
+ %1 = extractelement <2 x float> %in, i32 1
+ %arrayidx1 = getelementptr inbounds float addrspace(1)* %out, i32 1
+ store float %1, float addrspace(1)* %arrayidx1, align 4
+ ret void
+}
+
+; CHECK: @v2i32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_store(<2 x i32> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @v2f32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_store(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ store <2 x float> %in, <2 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 81a4fa5..1e51c35 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
+; CHECK: @fadd_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fadd <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fadd_v4f32
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 79e677f..240f1e5 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,15 +1,32 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; These tests check that fdiv is expanded correctly and also test that the
+; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; CHECK: @fdiv_v2f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fdiv <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @fdiv_v4f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
%b = load <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 7fd22d8..74c277d 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
+; CHECK: @fmul_v2f32
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fmul <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fmul_v4f32
; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 9c21ad2..dabfe41 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @fp_to_sint_v2i32
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptosi <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fp_to_sint_v4i32
; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index d91098f..95c62f7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @fp_to_uint_v2i32
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+ %result = fptoui <2 x float> %in to <2 x i32>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @fp_to_uint_v4i32
; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 812388b..f93212c 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -2,7 +2,6 @@
; CHECK: @fsub_f32
; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
define void @fsub_f32() {
%r0 = call float @llvm.R600.load.input(i32 0)
%r1 = call float @llvm.R600.load.input(i32 1)
@@ -15,12 +14,21 @@ declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fsub_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+ %0 = fsub <2 x float> %a, %b
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+; CHECK: @fsub_v4f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 0752f2e..ba8fca7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @setcc_v2i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = icmp eq <2 x i32> %a, %b
+ %sext = sext <2 x i1> %result to <2 x i32>
+ store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @setcc_v4i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 6a56db3..dc163da 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @sint_to_fp_v2i32
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = sitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @sint_to_fp_v4i32
; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 47657a6..f7820fe 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,11 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;The code generated by udiv is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 udiv
-;CHECK: RETURN
+; The code generated by udiv is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail on udiv
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @udiv_v2i32
+; CHECK: RETURN
+define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = udiv <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @udiv_v4i32
+; CHECK: RETURN
+define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index ae8fc8e..791f117 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,5 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; CHECK: @uint_to_fp_v2i32
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+ %result = uitofp <2 x i32> %in to <2 x float>
+ store <2 x float> %result, <2 x float> addrspace(1)* %out
+ ret void
+}
+
; CHECK: @uint_to_fp_v4i32
; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index 2e7388c..15957e8 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,11 +1,20 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;The code generated by urem is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 urem
-;CHECK: RETURN
+; The code generated by urem is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail when it gets
+; a urem.
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @urem_v2i32
+; CHECK: RETURN
+define void @urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+ %result = urem <2 x i32> %a, %b
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @urem_v4i32
+; CHECK: RETURN
+define void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32> addrspace(1) * %in
%b = load <4 x i32> addrspace(1) * %b_ptr
--
1.8.1.5
-------------- next part --------------
>From ed6a12cc1cc356258b44a7959017c7a787112056 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 9 Apr 2013 12:30:09 -0400
Subject: [PATCH 2/8] R600/SI: Use InstFlag for VOP3 modifier operands
InstFlag has a default value of 0 and will simplify the VOP3 patterns.
---
lib/Target/R600/SIInstrInfo.td | 4 ++--
lib/Target/R600/SIInstructions.td | 25 ++++++++++++-------------
2 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index a97dbaa..aafc331 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -259,14 +259,14 @@ multiclass VOPC_64 <bits<8> op, string opName,
class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
- i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+ InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
- i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+ InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index e481ef9..2ab3486 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -990,17 +990,17 @@ def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
def : Pat <
(mul VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
>;
def : Pat <
(mulhu VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
>;
def : Pat <
(mulhs VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
>;
def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
@@ -1475,20 +1475,20 @@ def : Pat <
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
(V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
(EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub0),
+ (EXTRACT_SUBREG VReg_128:$src, sub2)),
+ sub0),
(V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
(EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub1),
+ (EXTRACT_SUBREG VReg_128:$src, sub2)),
+ sub1),
(V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
(EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub2),
+ (EXTRACT_SUBREG VReg_128:$src, sub2)),
+ sub2),
(V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
(EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub3)
+ (EXTRACT_SUBREG VReg_128:$src, sub2)),
+ sub3)
>;
def : Pat <
@@ -1527,8 +1527,7 @@ def : Pat <
/********** ================== **********/
def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)),
- (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
- 0, 0, 0, 0)>;
+ (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2)>;
/********** ================== **********/
/********** SMRD Patterns **********/
--
1.8.1.5
-------------- next part --------------
>From 6136720f46a26693b961a849f897419d9599f9ba Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 9 Apr 2013 12:31:14 -0400
Subject: [PATCH 3/8] R600: Add pattern for the BFI_INT instruction
---
lib/Target/R600/AMDGPUInstructions.td | 20 ++++++++++++++++++++
lib/Target/R600/R600Instructions.td | 3 +++
lib/Target/R600/SIInstructions.td | 1 +
test/CodeGen/R600/bfi_int.ll | 34 ++++++++++++++++++++++++++++++++++
4 files changed, 58 insertions(+)
create mode 100644 test/CodeGen/R600/bfi_int.ll
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index fa890c1..4b37a53 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -261,6 +261,26 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
(vt rc:$addr)
>;
+// BFI_INT patterns
+
+multiclass BFIPatterns <Instruction BFI_INT> {
+
+ // Definition from ISA doc:
+ // (y & x) | (z & ~x)
+ def : Pat <
+ (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (BFI_INT $x, $y, $z)
+ >;
+
+ // SHA-256 Ch function
+ // z ^ (x & (y ^ z))
+ def : Pat <
+ (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (BFI_INT $x, $y, $z)
+ >;
+
+}
+
include "R600Instructions.td"
include "SIInstrInfo.td"
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 00f674f..fbf9f40 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1568,6 +1568,9 @@ let Predicates = [isEGorCayman] in {
VecALU
>;
+ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>;
+ defm : BFIPatterns <BFI_INT_eg>;
+
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
[(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
R600_Reg32:$src2))],
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 2ab3486..9faf89b 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -948,6 +948,7 @@ def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+defm : BFIPatterns <V_BFI_B32>;
def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
new file mode 100644
index 0000000..c9015a6
--- /dev/null
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+
+; BFI_INT Definition pattern from ISA docs
+; (y & x) | (z & ~x)
+;
+; R600-CHECK: @bfi_def
+; R600-CHECK: BFI_INT
+; SI-CHECK: @bfi_def
+; SI-CHECK: V_BFI_B32
+define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = xor i32 %x, -1
+ %1 = and i32 %z, %0
+ %2 = and i32 %y, %x
+ %3 = or i32 %1, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; SHA-256 Ch function
+; z ^ (x & (y ^ z))
+; R600-CHECK: @bfi_sha256_ch
+; R600-CHECK: BFI_INT
+; SI-CHECK: @bfi_sha256_ch
+; SI-CHECK: V_BFI_B32
+define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %0 = xor i32 %y, %z
+ %1 = and i32 %x, %0
+ %2 = xor i32 %z, %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
--
1.8.1.5
-------------- next part --------------
>From ca217819c518e4de2f2dcc3e9044ccb2375b1474 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 10 Apr 2013 14:53:58 -0400
Subject: [PATCH 4/8] R600: Emit RETURN instructions as a NOP with End of
Program bit set
---
lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index e529f76..1c27d2b 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -141,7 +141,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const {
if (isFCOp(MI.getOpcode())){
EmitFCInstr(MI, OS);
- } else if (MI.getOpcode() == AMDGPU::RETURN ||
+ } else if (MI.getOpcode() == AMDGPU::RETURN) {
+ EmitByte(INSTR_NATIVE, OS);
+ uint64_t Nop = 1ULL << 53;
+ Emit(Nop, OS);
+ } else if(
MI.getOpcode() == AMDGPU::BUNDLE ||
MI.getOpcode() == AMDGPU::KILL) {
return;
--
1.8.1.5
-------------- next part --------------
>From c0cb56a4e2bb43e0caaf0b6c080b72a34bc4a075 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 10 Apr 2013 14:55:16 -0400
Subject: [PATCH 5/8] R600: limit vtx clauses to one instructions
This fixes incorrect address calculations when multiple vtx instructions
are in the same clause.
---
lib/Target/R600/R600ControlFlowFinalizer.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 2350130..6a8fcb7 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -174,9 +174,9 @@ public:
ST(tm.getSubtarget<AMDGPUSubtarget>()) {
const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
- MaxFetchInst = 8;
+ MaxFetchInst = 1;
else
- MaxFetchInst = 16;
+ MaxFetchInst = 1;
}
virtual bool runOnMachineFunction(MachineFunction &MF) {
--
1.8.1.5
-------------- next part --------------
>From 4e17edf6af594ef103fdd2568864cd84bdadcd7a Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 11 Apr 2013 13:22:30 -0700
Subject: [PATCH 6/8] R600: CONSTANT_LOAD_eg should be considered a fetch
instruction
It is really a vertex fetch.
---
lib/Target/R600/R600ControlFlowFinalizer.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 6a8fcb7..1d3f4b3 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -49,6 +49,7 @@ private:
bool isFetch(const MachineInstr *MI) const {
switch (MI->getOpcode()) {
+ case AMDGPU::CONSTANT_LOAD_eg:
case AMDGPU::TEX_VTX_CONSTBUF:
case AMDGPU::TEX_VTX_TEXBUF:
case AMDGPU::TEX_LD:
--
1.8.1.5
-------------- next part --------------
>From 276532ce6cd42b2c900511727946b93a65c61292 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 12 Apr 2013 13:35:54 -0400
Subject: [PATCH 7/8] R600: Clean up instruction class definitions
---
lib/Target/R600/R600Instructions.td | 37 ++++++++++++++-----------------------
1 file changed, 14 insertions(+), 23 deletions(-)
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index fbf9f40..d42ad95 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -13,7 +13,7 @@
include "R600Intrinsics.td"
-class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin>
: AMDGPUInst <outs, ins, asm, pattern> {
@@ -26,8 +26,6 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
bit Op2 = 0;
bit HasNativeOperands = 0;
- bits<11> op_code = inst;
- //let Inst = inst;
let Namespace = "AMDGPU";
let OutOperandList = outs;
let InOperandList = ins;
@@ -48,8 +46,7 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst <outs, ins, asm, pattern> {
- field bits<64> Inst;
+ InstR600 <outs, ins, asm, pattern, NullALU> {
let Namespace = "AMDGPU";
}
@@ -346,8 +343,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// and R600InstrInfo::getOperandIdx().
class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <0,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
@@ -385,8 +381,7 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <inst,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
@@ -423,8 +418,7 @@ class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
// R600InstrInfo::getOperandIdx().
class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <0,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
@@ -450,8 +444,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
InstrItinClass itin = VecALU> :
- InstR600 <inst,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
ins,
asm,
pattern,
@@ -459,8 +452,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <inst,
- (outs R600_Reg128:$DST_GPR),
+ InstR600 <(outs R600_Reg128:$DST_GPR),
(ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
!strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
pattern,
@@ -1272,7 +1264,6 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
multiclass CUBE_Common <bits<11> inst> {
def _pseudo : InstR600 <
- inst,
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src),
"CUBE $dst $src",
@@ -1987,21 +1978,21 @@ def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
let isPseudo = 1 in {
def PRED_X : InstR600 <
- 0, (outs R600_Predicate_Bit:$dst),
+ (outs R600_Predicate_Bit:$dst),
(ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
"", [], NullALU> {
let FlagOperandIdx = 3;
}
let isTerminator = 1, isBranch = 1 in {
-def JUMP_COND : InstR600 <0x10,
+def JUMP_COND : InstR600 <
(outs),
(ins brtarget:$target, R600_Predicate_Bit:$p),
"JUMP $target ($p)",
[], AnyALU
>;
-def JUMP : InstR600 <0x10,
+def JUMP : InstR600 <
(outs),
(ins brtarget:$target),
"JUMP $target",
@@ -2028,18 +2019,18 @@ def MASK_WRITE : AMDGPUShaderInst <
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
-def TXD: AMDGPUShaderInst <
+def TXD: InstR600 <
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
+ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))], NullALU> {
>;
-def TXD_SHADOW: AMDGPUShaderInst <
+def TXD_SHADOW: InstR600 <
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
+ [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], NullALU
>;
} // End isPseudo = 1
--
1.8.1.5
-------------- next part --------------
>From 7720b0caf13aae5ac63ed70a4b94a466254b58a4 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 11 Apr 2013 13:47:43 -0700
Subject: [PATCH 8/8] R600: Add FetchInst bit to instruction defs to denote
vertex/tex instructions
---
lib/Target/R600/R600ControlFlowFinalizer.cpp | 38 ++--------------------------
lib/Target/R600/R600Defines.h | 12 ++++++++-
lib/Target/R600/R600Instructions.td | 16 +++++++++---
3 files changed, 26 insertions(+), 40 deletions(-)
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 1d3f4b3..d67d13b 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -47,40 +47,6 @@ private:
unsigned MaxFetchInst;
const AMDGPUSubtarget &ST;
- bool isFetch(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case AMDGPU::CONSTANT_LOAD_eg:
- case AMDGPU::TEX_VTX_CONSTBUF:
- case AMDGPU::TEX_VTX_TEXBUF:
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TXD:
- case AMDGPU::TXD_SHADOW:
- case AMDGPU::VTX_READ_GLOBAL_8_eg:
- case AMDGPU::VTX_READ_GLOBAL_32_eg:
- case AMDGPU::VTX_READ_GLOBAL_128_eg:
- case AMDGPU::VTX_READ_PARAM_8_eg:
- case AMDGPU::VTX_READ_PARAM_16_eg:
- case AMDGPU::VTX_READ_PARAM_32_eg:
- case AMDGPU::VTX_READ_PARAM_128_eg:
- return true;
- default:
- return false;
- }
- }
-
bool IsTrivialInst(MachineInstr *MI) const {
switch (MI->getOpcode()) {
case AMDGPU::KILL:
@@ -145,7 +111,7 @@ private:
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
if (IsTrivialInst(I))
continue;
- if (!isFetch(I))
+ if (!R600::isFetch(I))
break;
AluInstCount ++;
if (AluInstCount > MaxFetchInst)
@@ -197,7 +163,7 @@ public:
}
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
- if (isFetch(I)) {
+ if (R600::isFetch(I)) {
DEBUG(dbgs() << CfCount << ":"; I->dump(););
I = MakeFetchClause(MBB, I, 0);
CfCount++;
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 16cfcf5..be7f64b 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -12,6 +12,7 @@
#define R600DEFINES_H_
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
// Operand Flags
#define MO_FLAG_CLAMP (1 << 0)
@@ -39,7 +40,8 @@ namespace R600_InstFlag {
//FlagOperand bits 7, 8
NATIVE_OPERANDS = (1 << 9),
OP1 = (1 << 10),
- OP2 = (1 << 11)
+ OP2 = (1 << 11),
+ FETCH_INST = (1 << 12)
};
}
@@ -94,4 +96,12 @@ namespace R600Operands {
}
+namespace R600 {
+
+inline static bool isFetch(const llvm::MachineInstr *MI) {
+ return MI->getDesc().TSFlags & R600_InstFlag::FETCH_INST;
+}
+
+} // End namepsace R600
+
#endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index d42ad95..9941dd4 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -25,6 +25,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit Op1 = 0;
bit Op2 = 0;
bit HasNativeOperands = 0;
+ bit FetchInst = 0;
let Namespace = "AMDGPU";
let OutOperandList = outs;
@@ -43,6 +44,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
let TSFlags{9} = HasNativeOperands;
let TSFlags{10} = Op1;
let TSFlags{11} = Op2;
+ let TSFlags{12} = FetchInst;
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -478,6 +480,8 @@ class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
let COORD_TYPE_Y = 0;
let COORD_TYPE_Z = 0;
let COORD_TYPE_W = 0;
+
+ let FetchInst = 1;
}
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
@@ -1778,6 +1782,8 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+
+ let FetchInst = 1;
}
class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
@@ -2024,15 +2030,17 @@ def TXD: InstR600 <
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
[(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))], NullALU> {
->;
+ let FetchInst = 1;
+}
def TXD_SHADOW: InstR600 <
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
[(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], NullALU
->;
-
+> {
+ let FetchInst = 1;
+}
} // End isPseudo = 1
} // End usesCustomInserter = 1
@@ -2118,6 +2126,7 @@ def TEX_VTX_CONSTBUF :
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+ let FetchInst = 1;
}
def TEX_VTX_TEXBUF:
@@ -2171,6 +2180,7 @@ let Inst{63-32} = Word1;
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+ let FetchInst = 1;
}
--
1.8.1.5
More information about the llvm-commits
mailing list