R600 Patches (Plus one for SI): Various fixes to get the piglit all_cl tests to stop hanging

Tom Stellard tom at stellard.net
Fri Apr 12 13:52:11 PDT 2013


Hi,

Attached are several patches to get the piglit test suite to complete a
full run with all_cl.tests without locking up on my Juniper.

Please review.

-Tom
-------------- next part --------------
From dbcfd9c4ae491745ece761d97a3495bde0e54660 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov <dcherkassov at gmail.com>
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/8] R600: Add 64-bit float load/store support

* Added R600_Reg64 class
* Added T#Index#.XY register definitions
* Added v2i32 register reads from parameter and global space
* Added extraction of f32 and i32 elements from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Signed-off-by: Dmitry Cherkassov <dcherkassov at gmail.com>

Tom Stellard:
  - Mark vec2 operations as Expand.  The addition of a vec2 register
    class made them all legal.
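
For reference, a minimal IR sketch of the kind of kernel this enables (it is
the same body as the new fadd.ll test added below, just reformatted; run
through llc -march=r600 -mcpu=redwood): the <2 x float> argument is fetched
with the new VTX_READ_64 pattern into a T#.XY register pair, the Expand
marking splits the vector fadd into two scalar ADDs, and the result is
expected to go out through RAT_WRITE_CACHELESS_64_eg.

    ; v2f32 kernel arguments and global stores now use the 64-bit paths
    define void @fadd_v2f32(<2 x float> addrspace(1)* %out,
                            <2 x float> %a, <2 x float> %b) {
    entry:
      %sum = fadd <2 x float> %a, %b
      store <2 x float> %sum, <2 x float> addrspace(1)* %out
      ret void
    }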
---
 lib/Target/R600/AMDGPUISelLowering.cpp             |  6 +++
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  3 ++
 lib/Target/R600/R600ISelLowering.cpp               | 17 +++++++++
 lib/Target/R600/R600InstrInfo.cpp                  | 19 ++++++----
 lib/Target/R600/R600Instructions.td                | 44 ++++++++++++++++++++++
 lib/Target/R600/R600RegisterInfo.td                | 16 ++++++++
 test/CodeGen/R600/64bit-kernel-args.ll             | 41 ++++++++++++++++++++
 test/CodeGen/R600/fadd.ll                          | 10 +++++
 test/CodeGen/R600/fdiv.ll                          | 37 +++++++++++++-----
 test/CodeGen/R600/fmul.ll                          | 10 +++++
 test/CodeGen/R600/fp_to_sint.ll                    | 10 +++++
 test/CodeGen/R600/fp_to_uint.ll                    | 10 +++++
 test/CodeGen/R600/fsub.ll                          | 20 +++++++---
 test/CodeGen/R600/setcc.ll                         | 18 +++++++--
 test/CodeGen/R600/sint_to_fp.ll                    | 10 +++++
 test/CodeGen/R600/udiv.ll                          | 20 +++++++---
 test/CodeGen/R600/uint_to_fp.ll                    | 10 +++++
 test/CodeGen/R600/urem.ll                          | 21 ++++++++---
 18 files changed, 283 insertions(+), 39 deletions(-)
 create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 469a8ad..e529f76 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -152,6 +152,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
       break;
     }
     case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+    case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
     case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
       uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
       EmitByte(INSTR_NATIVE, OS);
@@ -162,9 +163,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     case AMDGPU::VTX_READ_PARAM_8_eg:
     case AMDGPU::VTX_READ_PARAM_16_eg:
     case AMDGPU::VTX_READ_PARAM_32_eg:
+    case AMDGPU::VTX_READ_PARAM_64_eg:
     case AMDGPU::VTX_READ_PARAM_128_eg:
     case AMDGPU::VTX_READ_GLOBAL_8_eg:
     case AMDGPU::VTX_READ_GLOBAL_32_eg:
+    case AMDGPU::VTX_READ_GLOBAL_64_eg:
     case AMDGPU::VTX_READ_GLOBAL_128_eg:
     case AMDGPU::TEX_VTX_CONSTBUF:
     case AMDGPU::TEX_VTX_TEXBUF : {
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 53e6e51..665909d 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -32,22 +32,38 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
   computeRegisterProperties();
 
   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
 
   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
+  setOperationAction(ISD::ADD,  MVT::v2i32, Expand);
   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
+  setOperationAction(ISD::AND,  MVT::v2i32, Expand);
   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+  setOperationAction(ISD::UREM, MVT::v2i32, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -157,6 +173,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index b232188..3603401 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -49,9 +49,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const {
-  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
-      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
-    for (unsigned I = 0; I < 4; I++) {
+  unsigned VectorComponents = 0;
+  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+    VectorComponents = 4;
+  } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+            AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
       unsigned SubRegIndex = RI.getSubRegFromChannel(I);
       buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                               RI.getSubReg(DestReg, SubRegIndex),
@@ -60,11 +68,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                       RegState::Define | RegState::Implicit);
     }
   } else {
-
-    // We can't copy vec4 registers
-    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
-           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
     MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                                                   DestReg, SrcReg);
     NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8ede6cc..00f674f 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1719,6 +1719,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
   [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
 >;
 
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  0x3, "RAT_WRITE_CACHELESS_64_eg",
+  [(global_store (v2i32 R600_Reg64:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
 //128-bit store
 def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
   (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1824,6 +1831,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
   let Constraints = "$ptr.ptr = $dst";
 }
 
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_64", buffer_id, (outs R600_Reg64:$dst),
+                   pattern> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  7;
+  let DST_SEL_W        =  7;
+  let DATA_FORMAT      =  0x1D; // COLOR_32_32
+}
+
 class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
     : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
                    pattern> {
@@ -1857,6 +1876,11 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
   [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
 >;
 
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+  [(set (v2i32 R600_Reg64:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+
 def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
   [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
 >;
@@ -1875,6 +1899,12 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
   [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
 >;
 
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+  [(set (v2i32 R600_Reg64:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+
 // 128-bit reads
 def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
   [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
@@ -2353,10 +2383,24 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
 def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
 def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
 
+def : Extract_Element <f32, v2f32, R600_Reg64, 0, sub0>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, R600_Reg64, 0, sub0>;
+def : Extract_Element <i32, v2i32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 1, sub1>;
+
 // bitconvert patterns
 
 def : BitConvert <i32, f32, R600_Reg32>;
 def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
 def : BitConvert <v4f32, v4i32, R600_Reg128>;
 def : BitConvert <v4i32, v4f32, R600_Reg128>;
 
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 03f4976..33593bc 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
   let HWEncoding = encoding;
 }
 
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+}
+
+
 foreach Index = 0-127 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
                                     !cast<Register>("T"#Index#"_Z"),
                                     !cast<Register>("T"#Index#"_W")],
                                    Index>;
+
+  def T#Index#_XY : R600Reg_64 <"T"#Index#".XY",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y")],
+                                   Index>;
 }
 
 // KCACHE_BANK0
@@ -178,6 +191,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
   let CopyCost = -1;
 }
 
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
+
 //===----------------------------------------------------------------------===//
 // Register classes for indirect addressing
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
new file mode 100644
index 0000000..6f03b68
--- /dev/null
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -0,0 +1,41 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @v2i32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_extract_store(i32 addrspace(1)* nocapture %out, <2 x i32> %in)  {
+entry:
+  %0 = extractelement <2 x i32> %in, i32 0
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  %1 = extractelement <2 x i32> %in, i32 1
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: @v2f32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_extract_store(float addrspace(1)* nocapture %out, <2 x float> %in)  {
+entry:
+  %0 = extractelement <2 x float> %in, i32 0
+  store float %0, float addrspace(1)* %out, align 4
+  %1 = extractelement <2 x float> %in, i32 1
+  %arrayidx1 = getelementptr inbounds float addrspace(1)* %out, i32 1
+  store float %1, float addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: @v2i32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_store(<2 x i32> addrspace(1)* %out, <2 x i32>  %in) {
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @v2f32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_store(<2 x float> addrspace(1)* %out, <2 x float>  %in) {
+entry:
+  store <2 x float> %in, <2 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 81a4fa5..1e51c35 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fadd_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fadd <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fadd_v4f32
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 79e677f..240f1e5 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,15 +1,32 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; These tests check that fdiv is expanded correctly and also that the
+; scheduler puts the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; CHECK: @fdiv_v2f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fdiv <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @fdiv_v4f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
   %b = load <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 7fd22d8..74c277d 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fmul_v2f32
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fmul <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fmul_v4f32
 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 9c21ad2..dabfe41 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @fp_to_sint_v2i32
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptosi <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fp_to_sint_v4i32
 ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index d91098f..95c62f7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @fp_to_uint_v2i32
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptoui <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fp_to_uint_v4i32
 ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 812388b..f93212c 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -2,7 +2,6 @@
 
 ; CHECK: @fsub_f32
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
 define void @fsub_f32() {
    %r0 = call float @llvm.R600.load.input(i32 0)
    %r1 = call float @llvm.R600.load.input(i32 1)
@@ -15,12 +14,21 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fsub_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fsub <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
 
+; CHECK: @fsub_v4f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 0752f2e..ba8fca7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @setcc_v2i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = icmp eq <2 x i32> %a, %b
+  %sext = sext <2 x i1> %result to <2 x i32>
+  store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @setcc_v4i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 6a56db3..dc163da 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @sint_to_fp_v2i32
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = sitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @sint_to_fp_v4i32
 ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 47657a6..f7820fe 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,11 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;The code generated by udiv is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 udiv
-;CHECK: RETURN
+; The code generated by udiv is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail on udiv
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @udiv_v2i32
+; CHECK: RETURN
+define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = udiv <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @udiv_v4i32
+; CHECK: RETURN
+define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index ae8fc8e..791f117 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @uint_to_fp_v2i32
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = uitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @uint_to_fp_v4i32
 ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index 2e7388c..15957e8 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,11 +1,20 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;The code generated by urem is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 urem
-;CHECK: RETURN
+; The code generated by urem is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail when it gets
+; a urem.
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @urem_v2i32
+; CHECK: RETURN
+define void @urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = urem <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @urem_v4i32
+; CHECK: RETURN
+define void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
-- 
1.8.1.5

-------------- next part --------------
From ed6a12cc1cc356258b44a7959017c7a787112056 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 9 Apr 2013 12:30:09 -0400
Subject: [PATCH 2/8] R600/SI: Use InstFlag for VOP3 modifier operands

InstFlag has a default value of 0 and will simplify the VOP3 patterns.
---
 lib/Target/R600/SIInstrInfo.td    |  4 ++--
 lib/Target/R600/SIInstructions.td | 25 ++++++++++++-------------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index a97dbaa..aafc331 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -259,14 +259,14 @@ multiclass VOPC_64 <bits<8> op, string opName,
 class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   op, (outs VReg_32:$dst),
   (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
-   i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
   opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
 >, VOP <opName>;
 
 class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   op, (outs VReg_64:$dst),
   (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
-   i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
   opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
 >, VOP <opName>;
 
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index e481ef9..2ab3486 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -990,17 +990,17 @@ def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
 
 def : Pat <
   (mul VSrc_32:$src0, VReg_32:$src1),
-  (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+  (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
 >;
 
 def : Pat <
   (mulhu VSrc_32:$src0, VReg_32:$src1),
-  (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+  (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
 >;
 
 def : Pat <
   (mulhs VSrc_32:$src0, VReg_32:$src1),
-  (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+  (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0))
 >;
 
 def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
@@ -1475,20 +1475,20 @@ def : Pat <
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
     (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
                   (EXTRACT_SUBREG VReg_128:$src, sub1),
-                  (EXTRACT_SUBREG VReg_128:$src, sub2),
-                  0, 0, 0, 0), sub0),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2)),
+                   sub0),
     (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
                   (EXTRACT_SUBREG VReg_128:$src, sub1),
-                  (EXTRACT_SUBREG VReg_128:$src, sub2),
-                  0, 0, 0, 0), sub1),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2)),
+                   sub1),
     (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
                   (EXTRACT_SUBREG VReg_128:$src, sub1),
-                  (EXTRACT_SUBREG VReg_128:$src, sub2),
-                  0, 0, 0, 0), sub2),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2)),
+                   sub2),
     (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
                   (EXTRACT_SUBREG VReg_128:$src, sub1),
-                  (EXTRACT_SUBREG VReg_128:$src, sub2),
-                  0, 0, 0, 0), sub3)
+                  (EXTRACT_SUBREG VReg_128:$src, sub2)),
+                   sub3)
 >;
 
 def : Pat <
@@ -1527,8 +1527,7 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)),
-           (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
-            0, 0, 0, 0)>;
+           (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2)>;
 
 /********** ================== **********/
 /**********   SMRD Patterns    **********/
-- 
1.8.1.5

-------------- next part --------------
From 6136720f46a26693b961a849f897419d9599f9ba Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 9 Apr 2013 12:31:14 -0400
Subject: [PATCH 3/8] R600: Add pattern for the BFI_INT instruction

---
 lib/Target/R600/AMDGPUInstructions.td | 20 ++++++++++++++++++++
 lib/Target/R600/R600Instructions.td   |  3 +++
 lib/Target/R600/SIInstructions.td     |  1 +
 test/CodeGen/R600/bfi_int.ll          | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+)
 create mode 100644 test/CodeGen/R600/bfi_int.ll

diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index fa890c1..4b37a53 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -261,6 +261,26 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
   (vt rc:$addr)
 >;
 
+// BFI_INT patterns
+
+multiclass BFIPatterns <Instruction BFI_INT> {
+
+  // Definition from ISA doc:
+  // (y & x) | (z & ~x)
+  def : Pat <
+    (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+  // SHA-256 Ch function
+  // z ^ (x & (y ^ z))
+  def : Pat <
+    (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+    (BFI_INT $x, $y, $z)
+  >;
+
+}
+
 include "R600Instructions.td"
 
 include "SIInstrInfo.td"
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 00f674f..fbf9f40 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1568,6 +1568,9 @@ let Predicates = [isEGorCayman] in {
     VecALU
   >;
 
+  def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>;
+  defm : BFIPatterns <BFI_INT_eg>;
+
   def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
     [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
                                           R600_Reg32:$src2))],
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 2ab3486..9faf89b 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -948,6 +948,7 @@ def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
 def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
 def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
 def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+defm : BFIPatterns <V_BFI_B32>;
 def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
 def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
new file mode 100644
index 0000000..c9015a6
--- /dev/null
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+
+; BFI_INT Definition pattern from ISA docs
+; (y & x) | (z & ~x)
+;
+; R600-CHECK: @bfi_def
+; R600-CHECK: BFI_INT
+; SI-CHECK:   @bfi_def
+; SI-CHECK:   V_BFI_B32
+define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+  %0 = xor i32 %x, -1
+  %1 = and i32 %z, %0
+  %2 = and i32 %y, %x
+  %3 = or i32 %1, %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+; SHA-256 Ch function
+; z ^ (x & (y ^ z))
+; R600-CHECK: @bfi_sha256_ch
+; R600-CHECK: BFI_INT
+; SI-CHECK:   @bfi_sha256_ch
+; SI-CHECK:   V_BFI_B32
+define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+  %0 = xor i32 %y, %z
+  %1 = and i32 %x, %0
+  %2 = xor i32 %z, %1
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.8.1.5

-------------- next part --------------
From ca217819c518e4de2f2dcc3e9044ccb2375b1474 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 10 Apr 2013 14:53:58 -0400
Subject: [PATCH 4/8] R600: Emit RETURN instructions as a NOP with End of
 Program bit set

---
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index e529f76..1c27d2b 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -141,7 +141,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                                        SmallVectorImpl<MCFixup> &Fixups) const {
   if (isFCOp(MI.getOpcode())){
     EmitFCInstr(MI, OS);
-  } else if (MI.getOpcode() == AMDGPU::RETURN ||
+  } else if (MI.getOpcode() == AMDGPU::RETURN) {
+    EmitByte(INSTR_NATIVE, OS);
+    uint64_t Nop = 1ULL << 53;
+    Emit(Nop, OS);
+  } else if(
     MI.getOpcode() == AMDGPU::BUNDLE ||
     MI.getOpcode() == AMDGPU::KILL) {
     return;
-- 
1.8.1.5

-------------- next part --------------
From c0cb56a4e2bb43e0caaf0b6c080b72a34bc4a075 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 10 Apr 2013 14:55:16 -0400
Subject: [PATCH 5/8] R600: Limit vtx clauses to one instruction

This fixes incorrect address calculations when multiple vtx instructions
are in the same clause.
---
 lib/Target/R600/R600ControlFlowFinalizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 2350130..6a8fcb7 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -174,9 +174,9 @@ public:
     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
       if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
-        MaxFetchInst = 8;
+        MaxFetchInst = 1;
       else
-        MaxFetchInst = 16;
+        MaxFetchInst = 1;
   }
 
   virtual bool runOnMachineFunction(MachineFunction &MF) {
-- 
1.8.1.5

-------------- next part --------------
From 4e17edf6af594ef103fdd2568864cd84bdadcd7a Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 11 Apr 2013 13:22:30 -0700
Subject: [PATCH 6/8] R600: CONSTANT_LOAD_eg should be considered a fetch
 instruction

It is really a vertex fetch.
---
 lib/Target/R600/R600ControlFlowFinalizer.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 6a8fcb7..1d3f4b3 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -49,6 +49,7 @@ private:
 
   bool isFetch(const MachineInstr *MI) const {
     switch (MI->getOpcode()) {
+    case AMDGPU::CONSTANT_LOAD_eg:
     case AMDGPU::TEX_VTX_CONSTBUF:
     case AMDGPU::TEX_VTX_TEXBUF:
     case AMDGPU::TEX_LD:
-- 
1.8.1.5

-------------- next part --------------
From 276532ce6cd42b2c900511727946b93a65c61292 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 12 Apr 2013 13:35:54 -0400
Subject: [PATCH 7/8] R600: Clean up instruction class definitions

---
 lib/Target/R600/R600Instructions.td | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index fbf9f40..d42ad95 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -13,7 +13,7 @@
 
 include "R600Intrinsics.td"
 
-class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
                 InstrItinClass itin>
     : AMDGPUInst <outs, ins, asm, pattern> {
 
@@ -26,8 +26,6 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
   bit Op2 = 0;
   bit HasNativeOperands = 0;
 
-  bits<11> op_code = inst;
-  //let Inst = inst;
   let Namespace = "AMDGPU";
   let OutOperandList = outs;
   let InOperandList = ins;
@@ -48,8 +46,7 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
 }
 
 class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst <outs, ins, asm, pattern> {
-  field bits<64> Inst;
+    InstR600 <outs, ins, asm, pattern, NullALU> {
 
   let Namespace = "AMDGPU";
 }
@@ -346,8 +343,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 // and R600InstrInfo::getOperandIdx().
 class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
                 InstrItinClass itin = AnyALU> :
-    InstR600 <0,
-              (outs R600_Reg32:$dst),
+    InstR600 <(outs R600_Reg32:$dst),
               (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
                    R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
                    LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
@@ -385,8 +381,7 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
 // R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
 class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
                 InstrItinClass itin = AnyALU> :
-  InstR600 <inst,
-          (outs R600_Reg32:$dst),
+  InstR600 <(outs R600_Reg32:$dst),
           (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
                OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
                R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
@@ -423,8 +418,7 @@ class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
 // R600InstrInfo::getOperandIdx().
 class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
                 InstrItinClass itin = AnyALU> :
-  InstR600 <0,
-          (outs R600_Reg32:$dst),
+  InstR600 <(outs R600_Reg32:$dst),
           (ins REL:$dst_rel, CLAMP:$clamp,
                R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
                R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
@@ -450,8 +444,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
 
 class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
                       InstrItinClass itin = VecALU> :
-  InstR600 <inst,
-          (outs R600_Reg32:$dst),
+  InstR600 <(outs R600_Reg32:$dst),
           ins,
           asm,
           pattern,
@@ -459,8 +452,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
 
 class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
                 InstrItinClass itin = AnyALU> :
-  InstR600 <inst,
-          (outs R600_Reg128:$DST_GPR),
+  InstR600 <(outs R600_Reg128:$DST_GPR),
           (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
           !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
           pattern,
@@ -1272,7 +1264,6 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 multiclass CUBE_Common <bits<11> inst> {
 
   def _pseudo : InstR600 <
-    inst,
     (outs R600_Reg128:$dst),
     (ins R600_Reg128:$src),
     "CUBE $dst $src",
@@ -1987,21 +1978,21 @@ def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
 let isPseudo = 1 in {
 
 def PRED_X : InstR600 <
-  0, (outs R600_Predicate_Bit:$dst),
+  (outs R600_Predicate_Bit:$dst),
   (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
   "", [], NullALU> {
   let FlagOperandIdx = 3;
 }
 
 let isTerminator = 1, isBranch = 1 in {
-def JUMP_COND : InstR600 <0x10,
+def JUMP_COND : InstR600 <
           (outs),
           (ins brtarget:$target, R600_Predicate_Bit:$p),
           "JUMP $target ($p)",
           [], AnyALU
   >;
 
-def JUMP : InstR600 <0x10,
+def JUMP : InstR600 <
           (outs),
           (ins brtarget:$target),
           "JUMP $target",
@@ -2028,18 +2019,18 @@ def MASK_WRITE : AMDGPUShaderInst <
 } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
 
 
-def TXD: AMDGPUShaderInst <
+def TXD: InstR600 <
   (outs R600_Reg128:$dst),
   (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
   "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))], NullALU> {
 >;
 
-def TXD_SHADOW: AMDGPUShaderInst <
+def TXD_SHADOW: InstR600 <
   (outs R600_Reg128:$dst),
   (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
   "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], NullALU
 >;
 
 } // End isPseudo = 1
-- 
1.8.1.5

-------------- next part --------------
From 7720b0caf13aae5ac63ed70a4b94a466254b58a4 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 11 Apr 2013 13:47:43 -0700
Subject: [PATCH 8/8] R600: Add FetchInst bit to instruction defs to denote
 vertex/tex instructions

---
 lib/Target/R600/R600ControlFlowFinalizer.cpp | 38 ++--------------------------
 lib/Target/R600/R600Defines.h                | 12 ++++++++-
 lib/Target/R600/R600Instructions.td          | 16 +++++++++---
 3 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 1d3f4b3..d67d13b 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -47,40 +47,6 @@ private:
   unsigned MaxFetchInst;
   const AMDGPUSubtarget &ST;
 
-  bool isFetch(const MachineInstr *MI) const {
-    switch (MI->getOpcode()) {
-    case AMDGPU::CONSTANT_LOAD_eg:
-    case AMDGPU::TEX_VTX_CONSTBUF:
-    case AMDGPU::TEX_VTX_TEXBUF:
-    case AMDGPU::TEX_LD:
-    case AMDGPU::TEX_GET_TEXTURE_RESINFO:
-    case AMDGPU::TEX_GET_GRADIENTS_H:
-    case AMDGPU::TEX_GET_GRADIENTS_V:
-    case AMDGPU::TEX_SET_GRADIENTS_H:
-    case AMDGPU::TEX_SET_GRADIENTS_V:
-    case AMDGPU::TEX_SAMPLE:
-    case AMDGPU::TEX_SAMPLE_C:
-    case AMDGPU::TEX_SAMPLE_L:
-    case AMDGPU::TEX_SAMPLE_C_L:
-    case AMDGPU::TEX_SAMPLE_LB:
-    case AMDGPU::TEX_SAMPLE_C_LB:
-    case AMDGPU::TEX_SAMPLE_G:
-    case AMDGPU::TEX_SAMPLE_C_G:
-    case AMDGPU::TXD:
-    case AMDGPU::TXD_SHADOW:
-    case AMDGPU::VTX_READ_GLOBAL_8_eg:
-    case AMDGPU::VTX_READ_GLOBAL_32_eg:
-    case AMDGPU::VTX_READ_GLOBAL_128_eg:
-    case AMDGPU::VTX_READ_PARAM_8_eg:
-    case AMDGPU::VTX_READ_PARAM_16_eg:
-    case AMDGPU::VTX_READ_PARAM_32_eg:
-    case AMDGPU::VTX_READ_PARAM_128_eg:
-     return true;
-    default:
-      return false;
-    }
-  }
-
   bool IsTrivialInst(MachineInstr *MI) const {
     switch (MI->getOpcode()) {
     case AMDGPU::KILL:
@@ -145,7 +111,7 @@ private:
     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
       if (IsTrivialInst(I))
         continue;
-      if (!isFetch(I))
+      if (!R600::isFetch(I))
         break;
       AluInstCount ++;
       if (AluInstCount > MaxFetchInst)
@@ -197,7 +163,7 @@ public:
       }
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
-        if (isFetch(I)) {
+        if (R600::isFetch(I)) {
           DEBUG(dbgs() << CfCount << ":"; I->dump(););
           I = MakeFetchClause(MBB, I, 0);
           CfCount++;
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 16cfcf5..be7f64b 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -12,6 +12,7 @@
 #define R600DEFINES_H_
 
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
 
 // Operand Flags
 #define MO_FLAG_CLAMP (1 << 0)
@@ -39,7 +40,8 @@ namespace R600_InstFlag {
     //FlagOperand bits 7, 8
     NATIVE_OPERANDS = (1 << 9),
     OP1 = (1 << 10),
-    OP2 = (1 << 11)
+    OP2 = (1 << 11),
+    FETCH_INST  = (1 << 12)
   };
 }
 
@@ -94,4 +96,12 @@ namespace R600Operands {
 
 }
 
+namespace R600 {
+
+inline static bool isFetch(const llvm::MachineInstr *MI) {
+  return MI->getDesc().TSFlags & R600_InstFlag::FETCH_INST;
+}
+
+} // End namespace R600
+
 #endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index d42ad95..9941dd4 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -25,6 +25,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   bit Op1 = 0;
   bit Op2 = 0;
   bit HasNativeOperands = 0;
+  bit FetchInst = 0;
 
   let Namespace = "AMDGPU";
   let OutOperandList = outs;
@@ -43,6 +44,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   let TSFlags{9} = HasNativeOperands;
   let TSFlags{10} = Op1;
   let TSFlags{11} = Op2;
+  let TSFlags{12} = FetchInst;
 }
 
 class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -478,6 +480,8 @@ class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
     let COORD_TYPE_Y = 0;
     let COORD_TYPE_Z = 0;
     let COORD_TYPE_W = 0;
+
+    let FetchInst = 1;
   }
 
 } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
@@ -1778,6 +1782,8 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
   // VTX_WORD3 (Padding)
   //
   // Inst{127-96} = 0;
+
+  let FetchInst = 1;
 }
 
 class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
@@ -2024,15 +2030,17 @@ def TXD: InstR600 <
   (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
   "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
   [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))], NullALU> {
->;
+  let FetchInst = 1;
+}
 
 def TXD_SHADOW: InstR600 <
   (outs R600_Reg128:$dst),
   (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
   "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
   [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], NullALU
->;
-
+> {
+  let FetchInst = 1;
+}
 } // End isPseudo = 1
 } // End usesCustomInserter = 1
 
@@ -2118,6 +2126,7 @@ def TEX_VTX_CONSTBUF :
 // VTX_WORD3 (Padding)
 //
 // Inst{127-96} = 0;
+  let FetchInst = 1;
 }
 
 def TEX_VTX_TEXBUF:
@@ -2171,6 +2180,7 @@ let Inst{63-32} = Word1;
 // VTX_WORD3 (Padding)
 //
 // Inst{127-96} = 0;
+  let FetchInst = 1;
 }
 
 
-- 
1.8.1.5


