R600: Various improvements to scheduler and packetizer

Wed May 22 14:55:26 PDT 2013

On Tue, May 21, 2013 at 01:06:09PM -0700, Vincent Lejeune wrote:
> Hi
> 
> Another serie aiming at fixing some light inneficiencies in our scheduler and packetizer.
> 
> Currently packetizer does not pack instruction without a write bit ; however Op3 operand don't have such bit and are perfectly packetizable.
> Second patch relaxes this constraints.
> Third patch properly handle data dependency inside a packet candidate : we check if such dependency is real or if it is between 2 different subregs.
> 4th one remove extra COPY instructions that were appearing when we switched to bottom up scheduling
> Last one implement the change suggested in a review by Tom.
> 
> Vincent

Just a few comments on patches 3 and 4.  The rest are:
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

> From c65d202fd7a8be44d7149f2b6013552cc33c2f21 Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Fri, 17 May 2013 14:22:08 +0200
> Subject: [PATCH 4/5] R600: Schedule copy from phys register at beginning of
>  block
> 
> It allows regalloc pass to remove them by trivially assigning associated reg
> ---
>  lib/Target/R600/R600MachineScheduler.cpp | 29 ++++++++++++++++++++++++++---
>  lib/Target/R600/R600MachineScheduler.h   |  1 +
>  test/CodeGen/R600/fabs.ll                |  2 +-
>  test/CodeGen/R600/fadd.ll                |  2 +-
>  test/CodeGen/R600/floor.ll               |  2 +-
>  test/CodeGen/R600/fmad.ll                |  2 +-
>  test/CodeGen/R600/fmax.ll                |  2 +-
>  test/CodeGen/R600/fmin.ll                |  2 +-
>  test/CodeGen/R600/fmul.ll                |  2 +-
>  test/CodeGen/R600/fsub.ll                |  2 +-
>  test/CodeGen/R600/llvm.AMDGPU.mul.ll     |  2 +-
>  test/CodeGen/R600/llvm.pow.ll            |  2 +-
>  12 files changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index 4c89124..1b83393 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -51,6 +51,14 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>    SUnit *SU = 0;
>    NextInstKind = IDOther;
>  
> +  if (!FakeCopy.empty()) {
> +    IsTopNode = true;
> +    SU = FakeCopy.back();
> +    FakeCopy.resize(FakeCopy.size() - 1);
> +    DEBUG(dbgs() << "Picked fake copy : "; SU->dump(DAG););
> +    return SU;
> +  }
> +
>    IsTopNode = false;
>  
>    // check if we might want to switch current clause type
> @@ -64,6 +72,8 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
>      // try to pick ALU
>      SU = pickAlu();
>      if (SU) {
> +      if (SU->isScheduled)
> +        return pickNode(IsTopNode);
>        if (CurEmitted >= InstKindLimit[IDAlu])
>          CurEmitted = 0;
>        NextInstKind = IDAlu;
> @@ -140,9 +150,22 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
>    }
>  }
>  
> +static bool
> +isFakeCopy(MachineInstr *MI) {
> +  if (MI->getOpcode() != AMDGPU::COPY)
> +    return false;
> +
> +  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
> +}
> +
>  void R600SchedStrategy::releaseTopNode(SUnit *SU) {
>    DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
>  
> +  if (isFakeCopy(SU->getInstr())) {
> +    FakeCopy.push_back(SU);
> +    return;
> +  }
> +
>  }
>  
>  void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
> @@ -278,7 +301,7 @@ void R600SchedStrategy::LoadAlu() {
>  
>  void R600SchedStrategy::PrepareNextSlot() {
>    DEBUG(dbgs() << "New Slot\n");
> -  assert (OccupedSlotsMask && "Slot wasn't filled");
> +//  assert (OccupedSlotsMask && "Slot wasn't filled");

This looks like it was left in by accident.

>    OccupedSlotsMask = 0;
>    InstructionsGroupCandidate.clear();
>    LoadAlu();
> @@ -315,10 +338,10 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
>  SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
>    static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
>    SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
> -  if (SlotedSU)
> +  if (SlotedSU && !SlotedSU->isScheduled)
>      return SlotedSU;
>    SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
> -  if (UnslotedSU)
> +  if (UnslotedSU && !UnslotedSU->isScheduled)
>      AssignSlot(UnslotedSU->getInstr(), Slot);
>    return UnslotedSU;
>  }
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index 1db877d..da99dbd 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -52,6 +52,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
>  
>    std::vector<SUnit *> Available[IDLast], Pending[IDLast];
>    std::vector<SUnit *> AvailableAlus[AluLast];
> +  std::vector<SUnit *> FakeCopy;
>  
>    InstKind CurInstKind;
>    int CurEmitted;
> diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
> index 17ac895..85f2882 100644
> --- a/test/CodeGen/R600/fabs.ll
> +++ b/test/CodeGen/R600/fabs.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: MOV * T{{[0-9]+\.[XYZW], \|PV\.[xyzw]\|}}
> +;CHECK: MOV * T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
> index 821d329..9a67232 100644
> --- a/test/CodeGen/R600/fadd.ll
> +++ b/test/CodeGen/R600/fadd.ll
> @@ -1,7 +1,7 @@
>  ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
>  ; CHECK: @fadd_f32
> -; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @fadd_f32() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
> index 0a807b1..877d69a 100644
> --- a/test/CodeGen/R600/floor.ll
> +++ b/test/CodeGen/R600/floor.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: FLOOR * T{{[0-9]+\.[XYZW], PV\.[xyzw]}}
> +;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll
> index 8614115..75e65d8 100644
> --- a/test/CodeGen/R600/fmad.ll
> +++ b/test/CodeGen/R600/fmad.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV.[xyzw], PV\.[xyzw]}}
> +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
> index ef3daad..8b704e5 100644
> --- a/test/CodeGen/R600/fmax.ll
> +++ b/test/CodeGen/R600/fmax.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: MAX * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
> index 026481c..5e34b7c 100644
> --- a/test/CodeGen/R600/fmin.ll
> +++ b/test/CodeGen/R600/fmin.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: MIN * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
> index dbb6424..a40e818 100644
> --- a/test/CodeGen/R600/fmul.ll
> +++ b/test/CodeGen/R600/fmul.ll
> @@ -1,7 +1,7 @@
>  ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
>  ; CHECK: @fmul_f32
> -; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @fmul_f32() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
> index f88729e..f784cde 100644
> --- a/test/CodeGen/R600/fsub.ll
> +++ b/test/CodeGen/R600/fsub.ll
> @@ -1,7 +1,7 @@
>  ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
>  ; CHECK: @fsub_f32
> -; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], -PV\.[xyzw]}}
> +; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
>  
>  define void @fsub_f32() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> index 69fbe58..cc0732b 100644
> --- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> +++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
>     %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
> index 3800abf..1422083 100644
> --- a/test/CodeGen/R600/llvm.pow.ll
> +++ b/test/CodeGen/R600/llvm.pow.ll
> @@ -1,7 +1,7 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
>  ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW]}}
> +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>  
>  define void @test() {
> -- 
> 1.8.2.1
> 

> From 52d7e124207b8bcc20522e754be697246e55ceaa Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Sat, 4 May 2013 21:13:14 +0200
> Subject: [PATCH 3/5] R600: Packetize data dep
> 
> ---
>  lib/Target/R600/R600Packetizer.cpp |   9 ++
>  test/CodeGen/R600/packet.ll        | 243 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 252 insertions(+)
>  create mode 100644 test/CodeGen/R600/packet.ll
> 
> diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
> index 86dff4e..3537a9f 100644
> --- a/lib/Target/R600/R600Packetizer.cpp
> +++ b/lib/Target/R600/R600Packetizer.cpp
> @@ -181,6 +181,15 @@ public:
>          if (Dep.getKind() == SDep::Output)
>            if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
>              continue;
> +        if (Dep.getKind() == SDep::Data) {
> +          for (MachineInstr::mop_iterator MIIOp = MII->operands_begin(),
> +              MIIE = MII->operands_end(); MIIOp != MIIE; ++MIIOp) {
> +            MachineOperand &MO = *MIIOp;
> +            if (MO.isReg() && MO.getReg() == MIJ->getOperand(0).getReg())
> +              return false;
> +          }
> +          continue;
> +        }
>          return false;
>        }
>      }
> diff --git a/test/CodeGen/R600/packet.ll b/test/CodeGen/R600/packet.ll
> new file mode 100644
> index 0000000..669904a
> --- /dev/null
> +++ b/test/CodeGen/R600/packet.ll
> @@ -0,0 +1,243 @@
> +;RUN: llc < %s -march=r600 | FileCheck %s
> +
> +;CHECK: MULADD_IEEE T{{[0-9].[XYZW]}}
> +

Can you add a comment that explains exactly what this is testing.  Also,
is it possible to reduce this testcase?

> +define void @main() #0 {
> +main_body:
> +  %0 = call float @llvm.R600.load.input(i32 4)
> +  %1 = call float @llvm.R600.load.input(i32 5)
> +  %2 = call float @llvm.R600.load.input(i32 6)
> +  %3 = call float @llvm.R600.load.input(i32 7)
> +  %4 = call float @llvm.R600.load.input(i32 8)
> +  %5 = call float @llvm.R600.load.input(i32 9)
> +  %6 = call float @llvm.R600.load.input(i32 10)
> +  %7 = call float @llvm.R600.load.input(i32 11)
> +  %8 = call float @llvm.R600.load.input(i32 12)
> +  %9 = call float @llvm.R600.load.input(i32 13)
> +  %10 = call float @llvm.R600.load.input(i32 14)
> +  %11 = call float @llvm.R600.load.input(i32 15)
> +  %12 = call float @llvm.R600.load.input(i32 16)
> +  %13 = call float @llvm.R600.load.input(i32 17)
> +  %14 = call float @llvm.R600.load.input(i32 18)
> +  %15 = call float @llvm.R600.load.input(i32 19)
> +  %16 = call float @llvm.R600.load.input(i32 20)
> +  %17 = call float @llvm.R600.load.input(i32 21)
> +  %18 = call float @llvm.R600.load.input(i32 22)
> +  %19 = call float @llvm.R600.load.input(i32 23)
> +  %20 = call float @llvm.R600.load.input(i32 24)
> +  %21 = call float @llvm.R600.load.input(i32 25)
> +  %22 = call float @llvm.R600.load.input(i32 26)
> +  %23 = call float @llvm.R600.load.input(i32 27)
> +  %24 = call float @llvm.R600.load.input(i32 28)
> +  %25 = call float @llvm.R600.load.input(i32 29)
> +  %26 = call float @llvm.R600.load.input(i32 30)
> +  %27 = call float @llvm.R600.load.input(i32 31)
> +  %28 = load <4 x float> addrspace(8)* null
> +  %29 = extractelement <4 x float> %28, i32 0
> +  %30 = fmul float %0, %29
> +  %31 = load <4 x float> addrspace(8)* null
> +  %32 = extractelement <4 x float> %31, i32 1
> +  %33 = fmul float %0, %32
> +  %34 = load <4 x float> addrspace(8)* null
> +  %35 = extractelement <4 x float> %34, i32 2
> +  %36 = fmul float %0, %35
> +  %37 = load <4 x float> addrspace(8)* null
> +  %38 = extractelement <4 x float> %37, i32 3
> +  %39 = fmul float %0, %38
> +  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %41 = extractelement <4 x float> %40, i32 0
> +  %42 = fmul float %1, %41
> +  %43 = fadd float %42, %30
> +  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %45 = extractelement <4 x float> %44, i32 1
> +  %46 = fmul float %1, %45
> +  %47 = fadd float %46, %33
> +  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %49 = extractelement <4 x float> %48, i32 2
> +  %50 = fmul float %1, %49
> +  %51 = fadd float %50, %36
> +  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %53 = extractelement <4 x float> %52, i32 3
> +  %54 = fmul float %1, %53
> +  %55 = fadd float %54, %39
> +  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %57 = extractelement <4 x float> %56, i32 0
> +  %58 = fmul float %2, %57
> +  %59 = fadd float %58, %43
> +  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %61 = extractelement <4 x float> %60, i32 1
> +  %62 = fmul float %2, %61
> +  %63 = fadd float %62, %47
> +  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %65 = extractelement <4 x float> %64, i32 2
> +  %66 = fmul float %2, %65
> +  %67 = fadd float %66, %51
> +  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %69 = extractelement <4 x float> %68, i32 3
> +  %70 = fmul float %2, %69
> +  %71 = fadd float %70, %55
> +  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> +  %73 = extractelement <4 x float> %72, i32 0
> +  %74 = fmul float %3, %73
> +  %75 = fadd float %74, %59
> +  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> +  %77 = extractelement <4 x float> %76, i32 1
> +  %78 = fmul float %3, %77
> +  %79 = fadd float %78, %63
> +  %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> +  %81 = extractelement <4 x float> %80, i32 2
> +  %82 = fmul float %3, %81
> +  %83 = fadd float %82, %67
> +  %84 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> +  %85 = extractelement <4 x float> %84, i32 3
> +  %86 = fmul float %3, %85
> +  %87 = fadd float %86, %71
> +  %88 = insertelement <4 x float> undef, float %4, i32 0
> +  %89 = insertelement <4 x float> %88, float %5, i32 1
> +  %90 = insertelement <4 x float> %89, float %6, i32 2
> +  %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
> +  %92 = insertelement <4 x float> undef, float %4, i32 0
> +  %93 = insertelement <4 x float> %92, float %5, i32 1
> +  %94 = insertelement <4 x float> %93, float %6, i32 2
> +  %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
> +  %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
> +  %97 = call float @fabs(float %96)
> +  %98 = call float @llvm.AMDGPU.rsq(float %97)
> +  %99 = fmul float %4, %98
> +  %100 = fmul float %5, %98
> +  %101 = fmul float %6, %98
> +  %102 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> +  %103 = extractelement <4 x float> %102, i32 0
> +  %104 = fmul float %103, %8
> +  %105 = fadd float %104, %20
> +  %106 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> +  %107 = extractelement <4 x float> %106, i32 1
> +  %108 = fmul float %107, %9
> +  %109 = fadd float %108, %21
> +  %110 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> +  %111 = extractelement <4 x float> %110, i32 2
> +  %112 = fmul float %111, %10
> +  %113 = fadd float %112, %22
> +  %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00)
> +  %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
> +  %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
> +  %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
> +  %118 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> +  %119 = extractelement <4 x float> %118, i32 0
> +  %120 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> +  %121 = extractelement <4 x float> %120, i32 1
> +  %122 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> +  %123 = extractelement <4 x float> %122, i32 2
> +  %124 = insertelement <4 x float> undef, float %99, i32 0
> +  %125 = insertelement <4 x float> %124, float %100, i32 1
> +  %126 = insertelement <4 x float> %125, float %101, i32 2
> +  %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3
> +  %128 = insertelement <4 x float> undef, float %119, i32 0
> +  %129 = insertelement <4 x float> %128, float %121, i32 1
> +  %130 = insertelement <4 x float> %129, float %123, i32 2
> +  %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
> +  %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
> +  %133 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> +  %134 = extractelement <4 x float> %133, i32 0
> +  %135 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> +  %136 = extractelement <4 x float> %135, i32 1
> +  %137 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> +  %138 = extractelement <4 x float> %137, i32 2
> +  %139 = insertelement <4 x float> undef, float %99, i32 0
> +  %140 = insertelement <4 x float> %139, float %100, i32 1
> +  %141 = insertelement <4 x float> %140, float %101, i32 2
> +  %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3
> +  %143 = insertelement <4 x float> undef, float %134, i32 0
> +  %144 = insertelement <4 x float> %143, float %136, i32 1
> +  %145 = insertelement <4 x float> %144, float %138, i32 2
> +  %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
> +  %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
> +  %148 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> +  %149 = extractelement <4 x float> %148, i32 0
> +  %150 = fmul float %149, %8
> +  %151 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> +  %152 = extractelement <4 x float> %151, i32 1
> +  %153 = fmul float %152, %9
> +  %154 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> +  %155 = extractelement <4 x float> %154, i32 2
> +  %156 = fmul float %155, %10
> +  %157 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> +  %158 = extractelement <4 x float> %157, i32 0
> +  %159 = fmul float %158, %12
> +  %160 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> +  %161 = extractelement <4 x float> %160, i32 1
> +  %162 = fmul float %161, %13
> +  %163 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> +  %164 = extractelement <4 x float> %163, i32 2
> +  %165 = fmul float %164, %14
> +  %166 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> +  %167 = extractelement <4 x float> %166, i32 0
> +  %168 = fmul float %167, %16
> +  %169 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> +  %170 = extractelement <4 x float> %169, i32 1
> +  %171 = fmul float %170, %17
> +  %172 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> +  %173 = extractelement <4 x float> %172, i32 2
> +  %174 = fmul float %173, %18
> +  %175 = fcmp uge float %132, 0.000000e+00
> +  %176 = select i1 %175, float %132, float 0.000000e+00
> +  %177 = fcmp uge float %147, 0.000000e+00
> +  %178 = select i1 %177, float %147, float 0.000000e+00
> +  %179 = call float @llvm.pow.f32(float %178, float %24)
> +  %180 = fcmp ult float %132, 0.000000e+00
> +  %181 = select i1 %180, float 0.000000e+00, float %179
> +  %182 = fadd float %150, %105
> +  %183 = fadd float %153, %109
> +  %184 = fadd float %156, %113
> +  %185 = fmul float %176, %159
> +  %186 = fadd float %185, %182
> +  %187 = fmul float %176, %162
> +  %188 = fadd float %187, %183
> +  %189 = fmul float %176, %165
> +  %190 = fadd float %189, %184
> +  %191 = fmul float %181, %168
> +  %192 = fadd float %191, %186
> +  %193 = fmul float %181, %171
> +  %194 = fadd float %193, %188
> +  %195 = fmul float %181, %174
> +  %196 = fadd float %195, %190
> +  %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00)
> +  %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00)
> +  %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00)
> +  %200 = insertelement <4 x float> undef, float %75, i32 0
> +  %201 = insertelement <4 x float> %200, float %79, i32 1
> +  %202 = insertelement <4 x float> %201, float %83, i32 2
> +  %203 = insertelement <4 x float> %202, float %87, i32 3
> +  call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1)
> +  %204 = insertelement <4 x float> undef, float %197, i32 0
> +  %205 = insertelement <4 x float> %204, float %198, i32 1
> +  %206 = insertelement <4 x float> %205, float %199, i32 2
> +  %207 = insertelement <4 x float> %206, float %117, i32 3
> +  call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2)
> +  ret void
> +}
> +
> +; Function Attrs: readnone
> +declare float @llvm.R600.load.input(i32) #1
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
> +
> +; Function Attrs: readonly
> +declare float @fabs(float) #2
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDGPU.rsq(float) #1
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDIL.clamp.(float, float, float) #1
> +
> +; Function Attrs: nounwind readonly
> +declare float @llvm.pow.f32(float, float) #3
> +
> +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
> +
> +attributes #0 = { "ShaderType"="1" }
> +attributes #1 = { readnone }
> +attributes #2 = { readonly }
> +attributes #3 = { nounwind readonly }
> -- 
> 1.8.2.1
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits