PATCH: R600/SI: Lower BUILD_VECTOR to REG_SEQUENCE

Tom Stellard tom at stellard.net
Tue Aug 6 16:12:08 PDT 2013


Hi,

The attached patches teach the SI backend to lower BUILD_VECTOR to
REG_SEQUENCE, which should help the register allocator produce better
code.  This also fixes 364 array indexing piglit tests.
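
For readers who want the shape of the change without the full diff: REG_SEQUENCE takes a target register class ID followed by (value, sub-register index) pairs. The sketch below condenses the new ISD::BUILD_VECTOR selection logic from the second patch; it is not a drop-in function, since it assumes the surrounding AMDGPUDAGToDAGISel::Select() context (N, CurDAG, TRI, RegClassID) shown in the diff.

    // Operand layout expected by REG_SEQUENCE:
    //   [RegClassID, Val0, SubReg0, Val1, SubReg1, ...]
    EVT VT = N->getValueType(0);
    unsigned NumElts = VT.getVectorNumElements();
    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs;
    RegSeqArgs.push_back(CurDAG->getTargetConstant(RegClassID, MVT::i32));
    for (unsigned i = 0; i < NumElts; ++i) {
      RegSeqArgs.push_back(N->getOperand(i));             // element value
      RegSeqArgs.push_back(CurDAG->getTargetConstant(     // its lane
          TRI->getSubRegFromChannel(i), MVT::i32));
    }
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, VT,
                                RegSeqArgs.data(), RegSeqArgs.size());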

-Tom
-------------- next part --------------
From 90dfb000600eae7a03f3f36fafcfaee1edde5613 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 6 Aug 2013 14:20:10 -0400
Subject: [PATCH 1/2] R600/SI: Assign a register class to the $vaddr operand
 for MIMG instructions

The previous code declared the operand as unknown:$vaddr, which made
it possible for scalar registers to be used instead of vector registers.
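
A related cleanup rides along with this patch: the TableGen isMIMG InstrMapping is replaced by a TSFlags bit, the usual mechanism for cheap per-opcode property tests. Condensed from the hunks below:

    // SIDefines.h: the flag bit; it must line up with "let TSFlags{3} = MIMG"
    // in SIInstrFormats.td.
    namespace SIInstrFlags {
    enum {
      MIMG = 1 << 3
    };
    }

    // SIInstrInfo.cpp: the query reads the bit back out of the generated
    // MCInstrDesc table for the given opcode.
    int SIInstrInfo::isMIMG(uint16_t Opcode) const {
      return get(Opcode).TSFlags & SIInstrFlags::MIMG;
    }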
---
 lib/Target/R600/SIDefines.h            |   6 ++
 lib/Target/R600/SIISelLowering.cpp     |   8 ++-
 lib/Target/R600/SIInstrFormats.td      |   3 +
 lib/Target/R600/SIInstrInfo.cpp        |   5 ++
 lib/Target/R600/SIInstrInfo.h          |   2 +-
 lib/Target/R600/SIInstrInfo.td         |  33 ++++++-----
 lib/Target/R600/SIInstructions.td      | 100 +++++++++++++++++++--------------
 test/CodeGen/R600/llvm.SI.imageload.ll |  44 +++++++++++++++
 8 files changed, 144 insertions(+), 57 deletions(-)

diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 147578c..572ed6a 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -11,6 +11,12 @@
 #ifndef SIDEFINES_H_
 #define SIDEFINES_H_
 
+namespace SIInstrFlags {
+enum {
+  MIMG = 1 << 3
+};
+}
+
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
 #define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index c64027f..fb8f257 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -1001,9 +1001,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 /// \brief Fold the instructions after selecting them
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
   Node = AdjustRegClass(Node, DAG);
 
-  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+  if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
 
   return foldOperands(Node, DAG);
@@ -1013,7 +1015,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!TII->isMIMG(MI->getOpcode()))
     return;
 
   unsigned VReg = MI->getOperand(0).getReg();
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 434aa7e..cd1bbcd 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,10 +17,12 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
   field bits<1> LGKM_CNT = 0;
+  field bits<1> MIMG = 0;
 
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
   let TSFlags{2} = LGKM_CNT;
+  let TSFlags{3} = MIMG;
 }
 
 class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -414,6 +416,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
+  let MIMG = 1;
 }
 
 def EXP : Enc64<
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 551ae86..9bb4ad9 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -15,6 +15,7 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -224,6 +225,10 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return RC != &AMDGPU::EXECRegRegClass;
 }
 
+int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+}
+
 //===----------------------------------------------------------------------===//
 // Indirect addressing callbacks
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 87eff4d..8d24ab4 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -47,6 +47,7 @@ public:
   virtual bool isMov(unsigned Opcode) const;
 
   virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+  int isMIMG(uint16_t Opcode) const;
 
   virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
 
@@ -80,7 +81,6 @@ namespace AMDGPU {
   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
-  int isMIMG(uint16_t Opcode);
 
 } // End namespace AMDGPU
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 52af79c..b6bbadc 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -391,11 +391,12 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
   let mayStore = 0;
 }
 
-class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+                             RegisterClass src_rc> : MIMG <
   op,
   (outs VReg_128:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc",
@@ -406,11 +407,18 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
-class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+  def _V1 : MIMG_NoSampler_Helper <op, asm, VReg_32>;
+  def _V2 : MIMG_NoSampler_Helper <op, asm, VReg_64>;
+  def _V4 : MIMG_NoSampler_Helper <op, asm, VReg_128>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+                           RegisterClass src_rc> : MIMG <
   op,
   (outs VReg_128:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc, SReg_128:$ssamp),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
@@ -420,6 +428,14 @@ class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
+multiclass MIMG_Sampler <bits<7> op, string asm> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, VReg_32>;
+  def _V2 : MIMG_Sampler_Helper <op, asm, VReg_64>;
+  def _V4 : MIMG_Sampler_Helper <op, asm, VReg_128>;
+  def _V8 : MIMG_Sampler_Helper <op, asm, VReg_256>;
+  def _V16 : MIMG_Sampler_Helper <op, asm, VReg_512>;
+}
+
 //===----------------------------------------------------------------------===//
 // Vector instruction mappings
 //===----------------------------------------------------------------------===//
@@ -451,13 +467,4 @@ def getCommuteOrig : InstrMapping {
   let ValueCols = [["1"]];
 }
 
-// Test if the supplied opcode is an MIMG instruction
-def isMIMG : InstrMapping {
-  let FilterClass = "MIMG";
-  let RowFields = ["Inst"];
-  let ColFields = ["Size"];
-  let KeyCol = ["8"];
-  let ValueCols = [["8"]];
-}
-
 include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500d15e..c7ba00b 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -501,7 +501,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
 //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
 //def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
 //def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
 //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
 //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -510,7 +510,7 @@ def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
 //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
 //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
+def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO", VReg_32>;
 //def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
 //def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
 //def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -528,20 +528,20 @@ def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
 //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
 //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
 //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">; 
+defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
 //def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
 //def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
 //def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper <0x0000002a, "IMAGE_SAMPLE_C_D">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
 //def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
 //def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
 //def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -1313,7 +1313,7 @@ def : Pat <
 /* int_SI_sample for simple 1D texture lookup */
 def : Pat <
   (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
-  (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+  (IMAGE_SAMPLE_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
@@ -1344,33 +1344,51 @@ class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
 >;
 
 /* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<ValueType addr_type> {
-  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-
-  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-
-  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-
-  def : SamplePattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
-  def : SampleArrayPattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
-  def : SampleShadowPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+                          MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+  def : SamplePattern <int_SI_sample, sample, addr_type>;
+  def : SampleRectPattern <int_SI_sample, sample, addr_type>;
+  def : SampleArrayPattern <int_SI_sample, sample, addr_type>;
+  def : SampleShadowPattern <int_SI_sample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sample, sample_c, addr_type>;
+
+  def : SamplePattern <int_SI_samplel, sample_l, addr_type>;
+  def : SampleArrayPattern <int_SI_samplel, sample_l, addr_type>;
+  def : SampleShadowPattern <int_SI_samplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_samplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <int_SI_sampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <int_SI_sampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <int_SI_sampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <int_SI_sampled, sample_d, addr_type>;
+  def : SampleArrayPattern <int_SI_sampled, sample_d, addr_type>;
+  def : SampleShadowPattern <int_SI_sampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sampled, sample_c_d, addr_type>;
 }
 
-defm : SamplePatterns<v2i32>;
-defm : SamplePatterns<v4i32>;
-defm : SamplePatterns<v8i32>;
-defm : SamplePatterns<v16i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V2, IMAGE_SAMPLE_C_V2,
+                      IMAGE_SAMPLE_L_V2, IMAGE_SAMPLE_C_L_V2,
+                      IMAGE_SAMPLE_B_V2, IMAGE_SAMPLE_C_B_V2,
+                      IMAGE_SAMPLE_D_V2, IMAGE_SAMPLE_C_D_V2,
+                      v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4, IMAGE_SAMPLE_C_V4,
+                      IMAGE_SAMPLE_L_V4, IMAGE_SAMPLE_C_L_V4,
+                      IMAGE_SAMPLE_B_V4, IMAGE_SAMPLE_C_B_V4,
+                      IMAGE_SAMPLE_D_V4, IMAGE_SAMPLE_C_D_V4,
+                      v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V8, IMAGE_SAMPLE_C_V8,
+                      IMAGE_SAMPLE_L_V8, IMAGE_SAMPLE_C_L_V8,
+                      IMAGE_SAMPLE_B_V8, IMAGE_SAMPLE_C_B_V8,
+                      IMAGE_SAMPLE_D_V8, IMAGE_SAMPLE_C_D_V8,
+                      v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V16, IMAGE_SAMPLE_C_V16,
+                      IMAGE_SAMPLE_L_V16, IMAGE_SAMPLE_C_L_V16,
+                      IMAGE_SAMPLE_B_V16, IMAGE_SAMPLE_C_B_V16,
+                      IMAGE_SAMPLE_D_V16, IMAGE_SAMPLE_C_D_V16,
+                      v16i32>;
 
 /* int_SI_imageload for texture fetches consuming varying address parameters */
 class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
@@ -1383,13 +1401,13 @@ class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> :
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
 >;
 
-multiclass ImageLoadPatterns<ValueType addr_type> {
-  def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
-  def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
 }
 
-defm : ImageLoadPatterns<v2i32>;
-defm : ImageLoadPatterns<v4i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V2, v2i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4, v4i32>;
 
 /* Image resource information */
 def : Pat <
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 0adcdfc..ca46fee 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -82,6 +82,50 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
+; Test that coordinates are stored in VGPRs and not SGPRs
+; CHECK: vgpr_coords
+; CHECK: IMAGE_LOAD_MIP VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 15, 0, 0, 0, 0, 0, 0, 0, VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}
+define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0
+  %21 = load float addrspace(2)* addrspace(2)* %20, !tbaa !0
+  %22 = getelementptr float addrspace(2)* %21, i32 0
+  %23 = load float addrspace(2)* %22, !tbaa !0, !invariant.load !1
+  %24 = getelementptr float addrspace(2)* %21, i32 1
+  %25 = load float addrspace(2)* %24, !tbaa !0, !invariant.load !1
+  %26 = getelementptr float addrspace(2)* %21, i32 4
+  %27 = load float addrspace(2)* %26, !tbaa !0, !invariant.load !1
+  %28 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %29 = load <32 x i8> addrspace(2)* %28, !tbaa !0
+  %30 = bitcast float %27 to i32
+  %31 = bitcast float %23 to i32
+  %32 = bitcast float %25 to i32
+  %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+  %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+  %35 = insertelement <4 x i32> %34, i32 %30, i32 2
+  %36 = insertelement <4 x i32> %35, i32 undef, i32 3
+  %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
+  %38 = extractelement <4 x i32> %37, i32 0
+  %39 = extractelement <4 x i32> %37, i32 1
+  %40 = extractelement <4 x i32> %37, i32 2
+  %41 = extractelement <4 x i32> %37, i32 3
+  %42 = bitcast i32 %38 to float
+  %43 = bitcast i32 %39 to float
+  %44 = bitcast i32 %40 to float
+  %45 = bitcast i32 %41 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
+  ret void
+}
+
 declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <8 x i32>, i32) readnone
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null, i32 1}
+!1 = metadata !{}
+
-- 
1.8.1.5

-------------- next part --------------
From 7da99726d4ffa151a4f0ea6e073ad98fc545647e Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 6 Aug 2013 14:35:24 -0400
Subject: [PATCH 2/2] R600/SI: Lower BUILD_VECTOR to REG_SEQUENCE

Using REG_SEQUENCE for BUILD_VECTOR rather than a series of INSERT_SUBREG
instructions should make it easier for the register allocator to coalesce
unnecessary copies.

This patch also happens to fix 364 piglit tests, almost all of which
use some form of array indexing.
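
Schematically, for a four-element vector (virtual register names invented for illustration), the old and new lowerings compare as follows:

    // Before: IMPLICIT_DEF plus a chain of INSERT_SUBREGs; the tied
    // operands force a full-width copy of the 128-bit value once the
    // TwoAddressInstruction pass runs.
    //   %vec0 = IMPLICIT_DEF
    //   %vec1 = INSERT_SUBREG %vec0, %x, sub0
    //   %vec2 = INSERT_SUBREG %vec1, %y, sub1
    //   %vec3 = INSERT_SUBREG %vec2, %z, sub2
    //   %vec4 = INSERT_SUBREG %vec3, %w, sub3
    //
    // After: a single REG_SEQUENCE, which the register allocator can
    // usually satisfy by assigning %x..%w straight into the right lanes,
    // with no copy at all.
    //   %vec = REG_SEQUENCE %x, sub0, %y, sub1, %z, sub2, %w, sub3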
---
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 69 ++++++++++++++++++++++++----------
 lib/Target/R600/AMDGPUInstructions.td  | 49 ------------------------
 lib/Target/R600/AMDGPURegisterInfo.cpp | 32 +++++++---------
 lib/Target/R600/AMDGPURegisterInfo.h   |  4 ++
 lib/Target/R600/R600RegisterInfo.cpp   | 10 -----
 lib/Target/R600/R600RegisterInfo.h     |  4 --
 lib/Target/R600/SIInstructions.td      | 10 -----
 test/CodeGen/R600/si-lod-bias.ll       | 50 ++++++++++++++++++++++++
 test/CodeGen/R600/store.ll             |  2 +-
 9 files changed, 118 insertions(+), 112 deletions(-)
 create mode 100644 test/CodeGen/R600/si-lod-bias.ll

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index f222901..bf656f7 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -256,35 +256,66 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     break;
   }
   case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
     const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
-      break;
+    const AMDGPURegisterInfo *TRI =
+                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      switch(NumVectorElts) {
+      // The instruction emitter will try to infer the correct register
+      // class to use for REG_SEQUENCE instructions, so if all the elements
+      // of the BUILD_VECTOR end up being stored in scalar registers, the
+      // destination class of the REG_SEQUENCE instruction will be changed
+      // to the appropriately sized scalar register class.  The register
+      // class we pass to REG_SEQUENCE here is what will be used when the
+      // inputs to REG_SEQUENCE use different register classes.
+      case 1: RegClassID = AMDGPU::VReg_32RegClassID; break;
+      case 2: RegClassID = AMDGPU::VReg_64RegClassID; break;
+      case 4: RegClassID = AMDGPU::VReg_128RegClassID; break;
+      case 8: RegClassID = AMDGPU::VReg_256RegClassID; break;
+      case 16: RegClassID = AMDGPU::VReg_512RegClassID; break;
+      }
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREGs,
+      // which adds a 128-bit reg copy when going through the
+      // TwoAddressInstructions pass.  We want to avoid 128-bit copies as
+      // much as possible because they can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
     }
 
-    unsigned RegClassID;
-    switch(N->getValueType(0).getVectorNumElements()) {
-    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
-    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
-    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+                                  VT.getVectorElementType(),
+                                  N->getOperand(0), RegClass);
     }
-    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
-    // that adds a 128 bits reg copy when going through TwoAddressInstructions
-    // pass. We want to avoid 128 bits copies as much as possible because they
-    // can't be bundled by our scheduler.
-    SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(RegClassID, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
-    };
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SDValue RegSeqArgs[16 * 2 + 1];
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
     bool IsRegSeq = true;
     for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      // XXX: Why is this here?
       if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
         IsRegSeq = false;
         break;
       }
-      RegSeqArgs[2 * i + 1] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
     }
     if (!IsRegSeq)
       break;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d6a7759..ddb655a 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -254,61 +254,12 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
   (INSERT_SUBREG $vec, $elem, sub_reg)
 >;
 
-// Vector Build pattern
-class Vector1_Build <ValueType vecType, ValueType elemType,
-                     RegisterClass rc> : Pat <
-  (vecType (build_vector elemType:$src)),
-  (vecType (COPY_TO_REGCLASS $src, rc))
->;
-
-class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1)),
-  (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
->;
-
 class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
   (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
     (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
 >;
 
-class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                              $sub2, sub2), $sub3, sub3),
-                              $sub4, sub4), $sub5, sub5),
-                              $sub6, sub6), $sub7, sub7)
->;
-
-class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7,
-                         elemType:$sub8, elemType:$sub9,
-                         elemType:$sub10, elemType:$sub11,
-                         elemType:$sub12, elemType:$sub13,
-                         elemType:$sub14, elemType:$sub15)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                            $sub2, sub2), $sub3, sub3),
-                            $sub4, sub4), $sub5, sub5),
-                            $sub6, sub6), $sub7, sub7),
-                            $sub8, sub8), $sub9, sub9),
-                            $sub10, sub10), $sub11, sub11),
-                            $sub12, sub12), $sub13, sub13),
-                            $sub14, sub14), $sub15, sub15)
->;
-
 // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
 // can handle COPY instructions.
 // bitconvert pattern
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3402092..47617a7 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -46,27 +46,21 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return 0;
 }
 
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert (Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
 unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
 
-  switch(IndirectIndex) {
-  case 0: return AMDGPU::sub0;
-  case 1: return AMDGPU::sub1;
-  case 2: return AMDGPU::sub2;
-  case 3: return AMDGPU::sub3;
-  case 4: return AMDGPU::sub4;
-  case 5: return AMDGPU::sub5;
-  case 6: return AMDGPU::sub6;
-  case 7: return AMDGPU::sub7;
-  case 8: return AMDGPU::sub8;
-  case 9: return AMDGPU::sub9;
-  case 10: return AMDGPU::sub10;
-  case 11: return AMDGPU::sub11;
-  case 12: return AMDGPU::sub12;
-  case 13: return AMDGPU::sub13;
-  case 14: return AMDGPU::sub14;
-  case 15: return AMDGPU::sub15;
-  default: llvm_unreachable("indirect index out of range");
-  }
+  return getSubRegFromChannel(IndirectIndex);
 }
 
 #define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 7cbd34b..135d3dd 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -50,6 +50,10 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
     assert(!"Unimplemented"); return NULL;
   }
 
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
   const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index a42043b..4dc63fe 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -86,16 +86,6 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
   }
 }
 
-unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
-  switch (Channel) {
-    default: assert(!"Invalid channel index"); return 0;
-    case 0: return AMDGPU::sub0;
-    case 1: return AMDGPU::sub1;
-    case 2: return AMDGPU::sub2;
-    case 3: return AMDGPU::sub3;
-  }
-}
-
 const RegClassWeight &R600RegisterInfo::getRegClassWeight(
   const TargetRegisterClass *RC) const {
   return RCW;
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 9b286ee..d458e55 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -43,10 +43,6 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
 
-  /// \returns the sub reg enum value for the given \p Channel
-  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
-  unsigned getSubRegFromChannel(unsigned Channel) const;
-
   virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
 
 };
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index c7ba00b..a37344b 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1488,16 +1488,6 @@ foreach Index = 0-15 in {
   >;
 }
 
-def : Vector1_Build <v1i32, i32, VReg_32>;
-def : Vector2_Build <v2i32, i32>;
-def : Vector2_Build <v2f32, f32>;
-def : Vector4_Build <v4i32, i32>;
-def : Vector4_Build <v4f32, f32>;
-def : Vector8_Build <v8i32, i32>;
-def : Vector8_Build <v8f32, f32>;
-def : Vector16_Build <v16i32, i32>;
-def : Vector16_Build <v16f32, f32>;
-
 def : BitConvert <i32, f32, SReg_32>;
 def : BitConvert <i32, f32, VReg_32>;
 
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
new file mode 100644
index 0000000..9b58f2a
--- /dev/null
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -0,0 +1,50 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; This shader has the potential to generate illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
+
+; CHECK: @main
+; CHECK: IMAGE_SAMPLE_B VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 15, 0, 0, 0, 0, 0, 0, 0, VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+  %23 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %24 = load <32 x i8> addrspace(2)* %23, !tbaa !0
+  %25 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+  %26 = load <16 x i8> addrspace(2)* %25, !tbaa !0
+  %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+  %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+  %29 = bitcast float %22 to i32
+  %30 = bitcast float %27 to i32
+  %31 = bitcast float %28 to i32
+  %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+  %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+  %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+  %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+  %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = extractelement <4 x float> %36, i32 1
+  %39 = extractelement <4 x float> %36, i32 2
+  %40 = extractelement <4 x float> %36, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 1bda5e6..506f0b0 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -26,7 +26,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
-  %1 = insertelement <2 x float> %0, float %b, i32 0
+  %1 = insertelement <2 x float> %0, float %b, i32 1
   store <2 x float> %1, <2 x float> addrspace(1)* %out
   ret void
 }
-- 
1.8.1.5


