R600 Patchset: Optimizations for bfgminer

Tom Stellard tom at stellard.net
Mon Apr 29 13:24:17 PDT 2013


Hi,

The attached patchset implements a few R600 backend optimizations that
improve the code generated for the bfgminer bitcoin mining program.

Please review.

-Tom
-------------- next part --------------
From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov <dcherkassov at gmail.com>
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/3] R600: Add 64-bit load/store support

* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions

Signed-off-by: Dmitry Cherkassov <dcherkassov at gmail.com>

Tom Stellard:
  - Mark vec2 operations as expand.  The addition of a vec2 register
    class made them all legal.
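
For illustration (this mirrors the new 64bit-kernel-args.ll test), a
simple copy kernel such as the sketch below should now go through the
new 64-bit path (VTX_READ_GLOBAL_64_eg for the load,
RAT_WRITE_CACHELESS_64_eg for the store) instead of being split into
two 32-bit accesses. The function name is made up for the example:

  define void @v2i32_copy(<2 x i32> addrspace(1)* %out,
                          <2 x i32> addrspace(1)* %in) {
  entry:
    ; 64-bit global load, selected to VTX_READ_GLOBAL_64_eg
    %val = load <2 x i32> addrspace(1) * %in
    ; 64-bit global store, selected to RAT_WRITE_CACHELESS_64_eg
    store <2 x i32> %val, <2 x i32> addrspace(1)* %out
    ret void
  }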
---
 lib/Target/R600/AMDGPUISelLowering.cpp             |  6 +++
 lib/Target/R600/AMDILISelDAGToDAG.cpp              | 10 ++++-
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  3 ++
 lib/Target/R600/R600ISelLowering.cpp               | 17 +++++++++
 lib/Target/R600/R600InstrInfo.cpp                  | 19 ++++++----
 lib/Target/R600/R600Instructions.td                | 44 ++++++++++++++++++++++
 lib/Target/R600/R600RegisterInfo.td                | 16 ++++++++
 test/CodeGen/R600/64bit-kernel-args.ll             | 41 ++++++++++++++++++++
 test/CodeGen/R600/fadd.ll                          | 10 +++++
 test/CodeGen/R600/fdiv.ll                          | 37 +++++++++++++-----
 test/CodeGen/R600/fmul.ll                          | 10 +++++
 test/CodeGen/R600/fp_to_sint.ll                    | 10 +++++
 test/CodeGen/R600/fp_to_uint.ll                    | 10 +++++
 test/CodeGen/R600/fsub.ll                          | 20 +++++++---
 test/CodeGen/R600/setcc.ll                         | 18 +++++++--
 test/CodeGen/R600/sint_to_fp.ll                    | 10 +++++
 test/CodeGen/R600/udiv.ll                          | 20 +++++++---
 test/CodeGen/R600/uint_to_fp.ll                    | 10 +++++
 test/CodeGen/R600/urem.ll                          | 21 ++++++++---
 19 files changed, 292 insertions(+), 40 deletions(-)
 create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index ba75a44..198cd7e 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
       break;
     }
+    unsigned RegSequenceClassID;
+    EVT VT = N->getValueType(0);
+    assert(VT.isVector());
+    switch (VT.getVectorNumElements()) {
+    case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break;
+    case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break;
+    default: llvm_unreachable("Unhandled vector width in BUILD_VECTOR");
+    }
     // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
     // that adds a 128 bits reg copy when going through TwoAddressInstructions
     // pass. We want to avoid 128 bits copies as much as possible because they
     // can't be bundled by our scheduler.
     SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+      CurDAG->getTargetConstant(RegSequenceClassID, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
       SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..030fc87 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   } else {
     switch(MI.getOpcode()) {
     case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+    case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
     case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
       uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
       EmitByte(INSTR_NATIVE, OS);
@@ -160,9 +161,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     case AMDGPU::VTX_READ_PARAM_8_eg:
     case AMDGPU::VTX_READ_PARAM_16_eg:
     case AMDGPU::VTX_READ_PARAM_32_eg:
+    case AMDGPU::VTX_READ_PARAM_64_eg:
     case AMDGPU::VTX_READ_PARAM_128_eg:
     case AMDGPU::VTX_READ_GLOBAL_8_eg:
     case AMDGPU::VTX_READ_GLOBAL_32_eg:
+    case AMDGPU::VTX_READ_GLOBAL_64_eg:
     case AMDGPU::VTX_READ_GLOBAL_128_eg:
     case AMDGPU::TEX_VTX_CONSTBUF:
     case AMDGPU::TEX_VTX_TEXBUF : {
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a66baca..b6b7c32 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -32,22 +32,38 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
   computeRegisterProperties();
 
   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
 
   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
+  setOperationAction(ISD::ADD,  MVT::v2i32, Expand);
   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
+  setOperationAction(ISD::AND,  MVT::v2i32, Expand);
   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+  setOperationAction(ISD::UREM, MVT::v2i32, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -158,6 +174,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 85288c9..eb09665 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -50,9 +50,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const {
-  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
-      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
-    for (unsigned I = 0; I < 4; I++) {
+  unsigned VectorComponents = 0;
+  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+    VectorComponents = 4;
+  } else if (AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+             AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
       unsigned SubRegIndex = RI.getSubRegFromChannel(I);
       buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                               RI.getSubReg(DestReg, SubRegIndex),
@@ -61,11 +69,6 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                       RegState::Define | RegState::Implicit);
     }
   } else {
-
-    // We can't copy vec4 registers
-    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
-           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
     MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                                                   DestReg, SrcReg);
     NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index ea8ee05..b1e8d1c 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1794,6 +1794,13 @@ def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
   [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
 >;
 
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  0x3, "RAT_WRITE_CACHELESS_64_eg",
+  [(global_store (v2i32 R600_Reg64:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
 //128-bit store
 def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
   (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
@@ -1901,6 +1908,18 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
   let Constraints = "$ptr.ptr = $dst";
 }
 
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_64", buffer_id, (outs R600_Reg64:$dst),
+                   pattern> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  7;
+  let DST_SEL_W        =  7;
+  let DATA_FORMAT      =  0x1D; // COLOR_32_32
+}
+
 class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
     : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
                    pattern> {
@@ -1934,6 +1953,11 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
   [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
 >;
 
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+  [(set (v2i32 R600_Reg64:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+
 def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
   [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
 >;
@@ -1952,6 +1976,12 @@ def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
   [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
 >;
 
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+  [(set (v2i32 R600_Reg64:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+
 // 128-bit reads
 def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
   [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
@@ -2439,10 +2469,24 @@ def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
 def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
 def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
 
+def : Extract_Element <f32, v2f32, R600_Reg64, 0, sub0>;
+def : Extract_Element <f32, v2f32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <f32, v2f32, R600_Reg32, R600_Reg64, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, R600_Reg64, 0, sub0>;
+def : Extract_Element <i32, v2i32, R600_Reg64, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 0, sub0>;
+def : Insert_Element <i32, v2i32, R600_Reg32, R600_Reg64, 1, sub1>;
+
 // bitconvert patterns
 
 def : BitConvert <i32, f32, R600_Reg32>;
 def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
 def : BitConvert <v4f32, v4i32, R600_Reg128>;
 def : BitConvert <v4i32, v4f32, R600_Reg128>;
 
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 5a2e65c..6bde923 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
   let HWEncoding = encoding;
 }
 
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+}
+
+
 foreach Index = 0-127 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
@@ -41,6 +49,11 @@ foreach Index = 0-127 in {
                                     !cast<Register>("T"#Index#"_Z"),
                                     !cast<Register>("T"#Index#"_W")],
                                    Index>;
+
+  def T#Index#_XY : R600Reg_64 <"T"#Index#".XY",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y")],
+                                   Index>;
 }
 
 // KCACHE_BANK0
@@ -184,6 +197,9 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
   let CopyCost = -1;
 }
 
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
+
 //===----------------------------------------------------------------------===//
 // Register classes for indirect addressing
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
new file mode 100644
index 0000000..6f03b68
--- /dev/null
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -0,0 +1,41 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @v2i32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_extract_store(i32 addrspace(1)* nocapture %out, <2 x i32> %in)  {
+entry:
+  %0 = extractelement <2 x i32> %in, i32 0
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  %1 = extractelement <2 x i32> %in, i32 1
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: @v2f32_load_extract_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_extract_store(float addrspace(1)* nocapture %out, <2 x float> %in)  {
+entry:
+  %0 = extractelement <2 x float> %in, i32 0
+  store float %0, float addrspace(1)* %out, align 4
+  %1 = extractelement <2 x float> %in, i32 1
+  %arrayidx1 = getelementptr inbounds float addrspace(1)* %out, i32 1
+  store float %1, float addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK: @v2i32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2i32_load_store(<2 x i32> addrspace(1)* %out, <2 x i32>  %in) {
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @v2f32_load_store
+; CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 40
+define void @v2f32_load_store(<2 x float> addrspace(1)* %out, <2 x float>  %in) {
+entry:
+  store <2 x float> %in, <2 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 81a4fa5..1e51c35 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fadd_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fadd <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fadd_v4f32
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 79e677f..240f1e5 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,15 +1,32 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; These tests check that fdiv is expanded correctly and also test that the
+; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; CHECK: @fdiv_v2f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fdiv <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @fdiv_v4f32
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
   %b = load <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 7fd22d8..74c277d 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fmul_v2f32
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fmul <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fmul_v4f32
 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 9c21ad2..dabfe41 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @fp_to_sint_v2i32
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptosi <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fp_to_sint_v4i32
 ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index d91098f..95c62f7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @fp_to_uint_v2i32
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptoui <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fp_to_uint_v4i32
 ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 812388b..f93212c 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -2,7 +2,6 @@
 
 ; CHECK: @fsub_f32
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
 define void @fsub_f32() {
    %r0 = call float @llvm.R600.load.input(i32 0)
    %r1 = call float @llvm.R600.load.input(i32 1)
@@ -15,12 +14,21 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fsub_v2f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fsub <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
 
+; CHECK: @fsub_v4f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 0752f2e..ba8fca7 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @setcc_v2i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = icmp eq <2 x i32> %a, %b
+  %sext = sext <2 x i1> %result to <2 x i32>
+  store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @setcc_v4i32
+; CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 6a56db3..dc163da 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @sint_to_fp_v2i32
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = sitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @sint_to_fp_v4i32
 ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index b81e366..0e91b2b 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,11 +1,19 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;The code generated by udiv is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 udiv
-;CHECK: CF_END
+; The code generated by udiv is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail when it gets a udiv.
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @udiv_v2i32
+; CHECK: CF_END
+define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = udiv <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @udiv_v4i32
+; CHECK: CF_END
+define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index ae8fc8e..791f117 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,5 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @uint_to_fp_v2i32
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = uitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @uint_to_fp_v4i32
 ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index a2cc0bd..1a50e65 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,11 +1,20 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;The code generated by urem is long and complex and may frequently change.
-;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 urem
-;CHECK: CF_END
+; The code generated by urem is long and complex and may frequently change.
+; The goal of these tests is to make sure the ISel doesn't fail when it gets
+; a urem.
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @urem_v2i32
+; CHECK: CF_END
+define void @urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = urem <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @urem_v4i32
+; CHECK: CF_END
+define void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
-- 
1.8.1.5

-------------- next part --------------
From b4a725a06bc00ec00c8d13e207d187d0fcca2ea2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 17 Apr 2013 11:55:44 -0400
Subject: [PATCH 2/3] R600: Add pattern for SHA-256 Ma function

This can be optimized using the BFI_INT instruction.
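
For reference, BFI_INT(m, a, b) computes (m & a) | (~m & b).  With
m = x ^ y, a = z, b = y, a per-bit case check shows the equivalence:

  * x == y: m is 0, so BFI_INT selects b = y = x, and
    Ma = (x & z) | (x & (x | z)) = (x & z) | x = x.
  * x != y: m is 1, so BFI_INT selects a = z, and
    Ma = (x & z) | (y & (x | z)) = z whichever of x, y is set.

Hence (x & z) | (y & (x | z)) == BFI_INT(x ^ y, z, y).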
---
 lib/Target/R600/AMDGPUInstructions.td |  8 ++++++++
 lib/Target/R600/R600Instructions.td   |  3 +++
 lib/Target/R600/SIInstructions.td     |  6 ++++++
 test/CodeGen/R600/bfi_int.ll          | 18 ++++++++++++++++++
 4 files changed, 35 insertions(+)

diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 4b37a53..d801b32 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -281,6 +281,14 @@ multiclass BFIPatterns <Instruction BFI_INT> {
 
 }
 
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+  (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+  (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
 include "R600Instructions.td"
 
 include "SIInstrInfo.td"
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index b1e8d1c..1630936 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1691,6 +1691,9 @@ let hasSideEffects = 1 in {
   def : Pat<(fp_to_uint R600_Reg32:$src0),
     (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
 
+  // SHA-256 Patterns
+  def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+
   def EG_ExportSwz : ExportSwzInst {
     let Word1{19-16} = 0; // BURST_COUNT
     let Word1{20} = 1; // VALID_PIXEL_MODE
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9faf89b..b6217e6 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1617,4 +1617,10 @@ def : Pat<
   (V_CMP_U_F32_e64 f32:$src0, f32:$src1)
 >;
 
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+
 } // End isSI predicate
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index c9015a6..abe2591 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -32,3 +32,21 @@ entry:
   store i32 %2, i32 addrspace(1)* %out
   ret void
 }
+
+; SHA-256 Ma function
+; ((x & z) | (y & (x | z)))
+; R600-CHECK: @bfi_sha256_ma
+; R600-CHECK: XOR_INT [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: BFI_INT {{T[0-9]+\.[XYZW]}}, [[DST]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: V_XOR_B32_e32 [[DST:VGPR[0-9]+]], {{VGPR[0-9]+, VGPR[0-9]+}}
+; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+entry:
+  %0 = and i32 %x, %z
+  %1 = or i32 %x, %z
+  %2 = and i32 %y, %1
+  %3 = or i32 %0, %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.8.1.5

-------------- next part --------------
From 2bf7212e559190087101b49e7b09a18f10a85db2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Wed, 17 Apr 2013 12:10:21 -0400
Subject: [PATCH 3/3] R600: BFI_INT is a vector-only instruction
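
BFI_INT can only be issued in the vector (X/Y/Z/W) ALU slots, not the
trans slot, so give it the VecALU itinerary to keep the scheduler from
placing it there.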

---
 lib/Target/R600/R600Instructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 1630936..78d2320 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1635,7 +1635,7 @@ let Predicates = [isEGorCayman] in {
     VecALU
   >;
 
-  def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", []>;
+  def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
   defm : BFIPatterns <BFI_INT_eg>;
 
   def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
-- 
1.8.1.5

-------------- next part --------------
From 05c4903985bbe60a54b43ce745d34143895cdf9a Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 29 Apr 2013 13:40:01 -0400
Subject: [PATCH] R600: Expand vector or, shl, srl, and xor nodes
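
The R600 ISA has no vector forms of these ALU operations.  Marking the
v2i32/v4i32 nodes as Expand scalarizes them into per-component
operations instead of leaving ISel to fail on them.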

---
 lib/Target/R600/R600ISelLowering.cpp |  8 ++++++++
 test/CodeGen/R600/or.ll              | 23 +++++++++++++++++++++++
 test/CodeGen/R600/shl.ll             | 23 +++++++++++++++++++++++
 test/CodeGen/R600/srl.ll             | 23 +++++++++++++++++++++++
 test/CodeGen/R600/xor.ll             | 23 +++++++++++++++++++++++
 5 files changed, 100 insertions(+)
 create mode 100644 test/CodeGen/R600/or.ll
 create mode 100644 test/CodeGen/R600/shl.ll
 create mode 100644 test/CodeGen/R600/srl.ll
 create mode 100644 test/CodeGen/R600/xor.ll

diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b6b7c32..ce3f16f 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -54,8 +54,14 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand);
+  setOperationAction(ISD::OR, MVT::v4i32, Expand);
+  setOperationAction(ISD::OR, MVT::v2i32, Expand);
   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
   setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand);
+  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
+  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
+  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
+  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand);
   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
@@ -64,6 +70,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::UREM, MVT::v2i32, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
+  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
+  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
new file mode 100644
index 0000000..e102a5e
--- /dev/null
+++ b/test/CodeGen/R600/or.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @or_v2i32
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = or <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @or_v4i32
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: OR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+  %result = or <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
new file mode 100644
index 0000000..88db44d
--- /dev/null
+++ b/test/CodeGen/R600/shl.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @shl_v2i32
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = shl <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @shl_v4i32
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+  %result = shl <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
new file mode 100644
index 0000000..ebfb9bc
--- /dev/null
+++ b/test/CodeGen/R600/srl.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @lshr_v2i32
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = lshr <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @lshr_v4i32
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: LSHR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+  %result = lshr <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
new file mode 100644
index 0000000..109019f
--- /dev/null
+++ b/test/CodeGen/R600/xor.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @xor_v2i32
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = xor <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @xor_v4i32
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: XOR_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+  %result = xor <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
-- 
1.8.1.5


