R600: Various improvements to scheduler and packetizer
Tom Stellard
tom at stellard.net
Wed May 22 14:55:26 PDT 2013
On Tue, May 21, 2013 at 01:06:09PM -0700, Vincent Lejeune wrote:
> Hi
>
> Another series aiming at fixing some light inefficiencies in our scheduler and packetizer.
>
> Currently the packetizer does not pack instructions without a write bit; however, Op3 operands don't have such a bit and are perfectly packetizable.
> The second patch relaxes this constraint.
> The third patch properly handles data dependencies inside a packet candidate: we check whether such a dependency is real or whether it is between 2 different subregs.
> The 4th one removes extra COPY instructions that were appearing when we switched to bottom-up scheduling.
> The last one implements the change suggested in a review by Tom.
>
> Vincent
Just a few comments on patches 3 and 4. The rest are:
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>
> From c65d202fd7a8be44d7149f2b6013552cc33c2f21 Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Fri, 17 May 2013 14:22:08 +0200
> Subject: [PATCH 4/5] R600: Schedule copy from phys register at beginning of
> block
>
> It allows regalloc pass to remove them by trivially assigning associated reg
> ---
> lib/Target/R600/R600MachineScheduler.cpp | 29 ++++++++++++++++++++++++++---
> lib/Target/R600/R600MachineScheduler.h | 1 +
> test/CodeGen/R600/fabs.ll | 2 +-
> test/CodeGen/R600/fadd.ll | 2 +-
> test/CodeGen/R600/floor.ll | 2 +-
> test/CodeGen/R600/fmad.ll | 2 +-
> test/CodeGen/R600/fmax.ll | 2 +-
> test/CodeGen/R600/fmin.ll | 2 +-
> test/CodeGen/R600/fmul.ll | 2 +-
> test/CodeGen/R600/fsub.ll | 2 +-
> test/CodeGen/R600/llvm.AMDGPU.mul.ll | 2 +-
> test/CodeGen/R600/llvm.pow.ll | 2 +-
> 12 files changed, 37 insertions(+), 13 deletions(-)
>
> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index 4c89124..1b83393 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -51,6 +51,14 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> SUnit *SU = 0;
> NextInstKind = IDOther;
>
> + if (!FakeCopy.empty()) {
> + IsTopNode = true;
> + SU = FakeCopy.back();
> + FakeCopy.resize(FakeCopy.size() - 1);
> + DEBUG(dbgs() << "Picked fake copy : "; SU->dump(DAG););
> + return SU;
> + }
> +
> IsTopNode = false;
>
> // check if we might want to switch current clause type
> @@ -64,6 +72,8 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
> // try to pick ALU
> SU = pickAlu();
> if (SU) {
> + if (SU->isScheduled)
> + return pickNode(IsTopNode);
> if (CurEmitted >= InstKindLimit[IDAlu])
> CurEmitted = 0;
> NextInstKind = IDAlu;
> @@ -140,9 +150,22 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
> }
> }
>
> +static bool
> +isFakeCopy(MachineInstr *MI) {
> + if (MI->getOpcode() != AMDGPU::COPY)
> + return false;
> +
> + return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
> +}
> +
> void R600SchedStrategy::releaseTopNode(SUnit *SU) {
> DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
>
> + if (isFakeCopy(SU->getInstr())) {
> + FakeCopy.push_back(SU);
> + return;
> + }
> +
> }
>
> void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
> @@ -278,7 +301,7 @@ void R600SchedStrategy::LoadAlu() {
>
> void R600SchedStrategy::PrepareNextSlot() {
> DEBUG(dbgs() << "New Slot\n");
> - assert (OccupedSlotsMask && "Slot wasn't filled");
> +// assert (OccupedSlotsMask && "Slot wasn't filled");
This looks like it was left in by accident.
> OccupedSlotsMask = 0;
> InstructionsGroupCandidate.clear();
> LoadAlu();
> @@ -315,10 +338,10 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
> SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
> static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
> SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
> - if (SlotedSU)
> + if (SlotedSU && !SlotedSU->isScheduled)
> return SlotedSU;
> SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
> - if (UnslotedSU)
> + if (UnslotedSU && !UnslotedSU->isScheduled)
> AssignSlot(UnslotedSU->getInstr(), Slot);
> return UnslotedSU;
> }
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index 1db877d..da99dbd 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -52,6 +52,7 @@ class R600SchedStrategy : public MachineSchedStrategy {
>
> std::vector<SUnit *> Available[IDLast], Pending[IDLast];
> std::vector<SUnit *> AvailableAlus[AluLast];
> + std::vector<SUnit *> FakeCopy;
>
> InstKind CurInstKind;
> int CurEmitted;
> diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
> index 17ac895..85f2882 100644
> --- a/test/CodeGen/R600/fabs.ll
> +++ b/test/CodeGen/R600/fabs.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: MOV * T{{[0-9]+\.[XYZW], \|PV\.[xyzw]\|}}
> +;CHECK: MOV * T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
> index 821d329..9a67232 100644
> --- a/test/CodeGen/R600/fadd.ll
> +++ b/test/CodeGen/R600/fadd.ll
> @@ -1,7 +1,7 @@
> ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ; CHECK: @fadd_f32
> -; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @fadd_f32() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
> index 0a807b1..877d69a 100644
> --- a/test/CodeGen/R600/floor.ll
> +++ b/test/CodeGen/R600/floor.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: FLOOR * T{{[0-9]+\.[XYZW], PV\.[xyzw]}}
> +;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll
> index 8614115..75e65d8 100644
> --- a/test/CodeGen/R600/fmad.ll
> +++ b/test/CodeGen/R600/fmad.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV.[xyzw], PV\.[xyzw]}}
> +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
> index ef3daad..8b704e5 100644
> --- a/test/CodeGen/R600/fmax.ll
> +++ b/test/CodeGen/R600/fmax.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: MAX * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
> index 026481c..5e34b7c 100644
> --- a/test/CodeGen/R600/fmin.ll
> +++ b/test/CodeGen/R600/fmin.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: MIN * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
> index dbb6424..a40e818 100644
> --- a/test/CodeGen/R600/fmul.ll
> +++ b/test/CodeGen/R600/fmul.ll
> @@ -1,7 +1,7 @@
> ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ; CHECK: @fmul_f32
> -; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @fmul_f32() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
> index f88729e..f784cde 100644
> --- a/test/CodeGen/R600/fsub.ll
> +++ b/test/CodeGen/R600/fsub.ll
> @@ -1,7 +1,7 @@
> ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ; CHECK: @fsub_f32
> -; CHECK: ADD * T{{[0-9]+\.[XYZW], PV\.[xyzw], -PV\.[xyzw]}}
> +; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
>
> define void @fsub_f32() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> index 69fbe58..cc0732b 100644
> --- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> +++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
> @@ -1,6 +1,6 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], PV\.[xyzw]}}
> +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> %r0 = call float @llvm.R600.load.input(i32 0)
> diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
> index 3800abf..1422083 100644
> --- a/test/CodeGen/R600/llvm.pow.ll
> +++ b/test/CodeGen/R600/llvm.pow.ll
> @@ -1,7 +1,7 @@
> ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>
> ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW]}}
> +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
> ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>
> define void @test() {
> --
> 1.8.2.1
>
> From 52d7e124207b8bcc20522e754be697246e55ceaa Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Sat, 4 May 2013 21:13:14 +0200
> Subject: [PATCH 3/5] R600: Packetize data dep
>
> ---
> lib/Target/R600/R600Packetizer.cpp | 9 ++
> test/CodeGen/R600/packet.ll | 243 +++++++++++++++++++++++++++++++++++++
> 2 files changed, 252 insertions(+)
> create mode 100644 test/CodeGen/R600/packet.ll
>
> diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
> index 86dff4e..3537a9f 100644
> --- a/lib/Target/R600/R600Packetizer.cpp
> +++ b/lib/Target/R600/R600Packetizer.cpp
> @@ -181,6 +181,15 @@ public:
> if (Dep.getKind() == SDep::Output)
> if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
> continue;
> + if (Dep.getKind() == SDep::Data) {
> + for (MachineInstr::mop_iterator MIIOp = MII->operands_begin(),
> + MIIE = MII->operands_end(); MIIOp != MIIE; ++MIIOp) {
> + MachineOperand &MO = *MIIOp;
> + if (MO.isReg() && MO.getReg() == MIJ->getOperand(0).getReg())
> + return false;
> + }
> + continue;
> + }
> return false;
> }
> }
> diff --git a/test/CodeGen/R600/packet.ll b/test/CodeGen/R600/packet.ll
> new file mode 100644
> index 0000000..669904a
> --- /dev/null
> +++ b/test/CodeGen/R600/packet.ll
> @@ -0,0 +1,243 @@
> +;RUN: llc < %s -march=r600 | FileCheck %s
> +
> +;CHECK: MULADD_IEEE T{{[0-9].[XYZW]}}
> +
Can you add a comment that explains exactly what this is testing? Also,
is it possible to reduce this testcase?
> +define void @main() #0 {
> +main_body:
> + %0 = call float @llvm.R600.load.input(i32 4)
> + %1 = call float @llvm.R600.load.input(i32 5)
> + %2 = call float @llvm.R600.load.input(i32 6)
> + %3 = call float @llvm.R600.load.input(i32 7)
> + %4 = call float @llvm.R600.load.input(i32 8)
> + %5 = call float @llvm.R600.load.input(i32 9)
> + %6 = call float @llvm.R600.load.input(i32 10)
> + %7 = call float @llvm.R600.load.input(i32 11)
> + %8 = call float @llvm.R600.load.input(i32 12)
> + %9 = call float @llvm.R600.load.input(i32 13)
> + %10 = call float @llvm.R600.load.input(i32 14)
> + %11 = call float @llvm.R600.load.input(i32 15)
> + %12 = call float @llvm.R600.load.input(i32 16)
> + %13 = call float @llvm.R600.load.input(i32 17)
> + %14 = call float @llvm.R600.load.input(i32 18)
> + %15 = call float @llvm.R600.load.input(i32 19)
> + %16 = call float @llvm.R600.load.input(i32 20)
> + %17 = call float @llvm.R600.load.input(i32 21)
> + %18 = call float @llvm.R600.load.input(i32 22)
> + %19 = call float @llvm.R600.load.input(i32 23)
> + %20 = call float @llvm.R600.load.input(i32 24)
> + %21 = call float @llvm.R600.load.input(i32 25)
> + %22 = call float @llvm.R600.load.input(i32 26)
> + %23 = call float @llvm.R600.load.input(i32 27)
> + %24 = call float @llvm.R600.load.input(i32 28)
> + %25 = call float @llvm.R600.load.input(i32 29)
> + %26 = call float @llvm.R600.load.input(i32 30)
> + %27 = call float @llvm.R600.load.input(i32 31)
> + %28 = load <4 x float> addrspace(8)* null
> + %29 = extractelement <4 x float> %28, i32 0
> + %30 = fmul float %0, %29
> + %31 = load <4 x float> addrspace(8)* null
> + %32 = extractelement <4 x float> %31, i32 1
> + %33 = fmul float %0, %32
> + %34 = load <4 x float> addrspace(8)* null
> + %35 = extractelement <4 x float> %34, i32 2
> + %36 = fmul float %0, %35
> + %37 = load <4 x float> addrspace(8)* null
> + %38 = extractelement <4 x float> %37, i32 3
> + %39 = fmul float %0, %38
> + %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> + %41 = extractelement <4 x float> %40, i32 0
> + %42 = fmul float %1, %41
> + %43 = fadd float %42, %30
> + %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> + %45 = extractelement <4 x float> %44, i32 1
> + %46 = fmul float %1, %45
> + %47 = fadd float %46, %33
> + %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> + %49 = extractelement <4 x float> %48, i32 2
> + %50 = fmul float %1, %49
> + %51 = fadd float %50, %36
> + %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> + %53 = extractelement <4 x float> %52, i32 3
> + %54 = fmul float %1, %53
> + %55 = fadd float %54, %39
> + %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> + %57 = extractelement <4 x float> %56, i32 0
> + %58 = fmul float %2, %57
> + %59 = fadd float %58, %43
> + %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> + %61 = extractelement <4 x float> %60, i32 1
> + %62 = fmul float %2, %61
> + %63 = fadd float %62, %47
> + %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> + %65 = extractelement <4 x float> %64, i32 2
> + %66 = fmul float %2, %65
> + %67 = fadd float %66, %51
> + %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> + %69 = extractelement <4 x float> %68, i32 3
> + %70 = fmul float %2, %69
> + %71 = fadd float %70, %55
> + %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> + %73 = extractelement <4 x float> %72, i32 0
> + %74 = fmul float %3, %73
> + %75 = fadd float %74, %59
> + %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> + %77 = extractelement <4 x float> %76, i32 1
> + %78 = fmul float %3, %77
> + %79 = fadd float %78, %63
> + %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> + %81 = extractelement <4 x float> %80, i32 2
> + %82 = fmul float %3, %81
> + %83 = fadd float %82, %67
> + %84 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
> + %85 = extractelement <4 x float> %84, i32 3
> + %86 = fmul float %3, %85
> + %87 = fadd float %86, %71
> + %88 = insertelement <4 x float> undef, float %4, i32 0
> + %89 = insertelement <4 x float> %88, float %5, i32 1
> + %90 = insertelement <4 x float> %89, float %6, i32 2
> + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
> + %92 = insertelement <4 x float> undef, float %4, i32 0
> + %93 = insertelement <4 x float> %92, float %5, i32 1
> + %94 = insertelement <4 x float> %93, float %6, i32 2
> + %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
> + %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
> + %97 = call float @fabs(float %96)
> + %98 = call float @llvm.AMDGPU.rsq(float %97)
> + %99 = fmul float %4, %98
> + %100 = fmul float %5, %98
> + %101 = fmul float %6, %98
> + %102 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> + %103 = extractelement <4 x float> %102, i32 0
> + %104 = fmul float %103, %8
> + %105 = fadd float %104, %20
> + %106 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> + %107 = extractelement <4 x float> %106, i32 1
> + %108 = fmul float %107, %9
> + %109 = fadd float %108, %21
> + %110 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
> + %111 = extractelement <4 x float> %110, i32 2
> + %112 = fmul float %111, %10
> + %113 = fadd float %112, %22
> + %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00)
> + %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00)
> + %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, float 1.000000e+00)
> + %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
> + %118 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> + %119 = extractelement <4 x float> %118, i32 0
> + %120 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> + %121 = extractelement <4 x float> %120, i32 1
> + %122 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
> + %123 = extractelement <4 x float> %122, i32 2
> + %124 = insertelement <4 x float> undef, float %99, i32 0
> + %125 = insertelement <4 x float> %124, float %100, i32 1
> + %126 = insertelement <4 x float> %125, float %101, i32 2
> + %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3
> + %128 = insertelement <4 x float> undef, float %119, i32 0
> + %129 = insertelement <4 x float> %128, float %121, i32 1
> + %130 = insertelement <4 x float> %129, float %123, i32 2
> + %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
> + %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131)
> + %133 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> + %134 = extractelement <4 x float> %133, i32 0
> + %135 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> + %136 = extractelement <4 x float> %135, i32 1
> + %137 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
> + %138 = extractelement <4 x float> %137, i32 2
> + %139 = insertelement <4 x float> undef, float %99, i32 0
> + %140 = insertelement <4 x float> %139, float %100, i32 1
> + %141 = insertelement <4 x float> %140, float %101, i32 2
> + %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3
> + %143 = insertelement <4 x float> undef, float %134, i32 0
> + %144 = insertelement <4 x float> %143, float %136, i32 1
> + %145 = insertelement <4 x float> %144, float %138, i32 2
> + %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
> + %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146)
> + %148 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> + %149 = extractelement <4 x float> %148, i32 0
> + %150 = fmul float %149, %8
> + %151 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> + %152 = extractelement <4 x float> %151, i32 1
> + %153 = fmul float %152, %9
> + %154 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
> + %155 = extractelement <4 x float> %154, i32 2
> + %156 = fmul float %155, %10
> + %157 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> + %158 = extractelement <4 x float> %157, i32 0
> + %159 = fmul float %158, %12
> + %160 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> + %161 = extractelement <4 x float> %160, i32 1
> + %162 = fmul float %161, %13
> + %163 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
> + %164 = extractelement <4 x float> %163, i32 2
> + %165 = fmul float %164, %14
> + %166 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> + %167 = extractelement <4 x float> %166, i32 0
> + %168 = fmul float %167, %16
> + %169 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> + %170 = extractelement <4 x float> %169, i32 1
> + %171 = fmul float %170, %17
> + %172 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
> + %173 = extractelement <4 x float> %172, i32 2
> + %174 = fmul float %173, %18
> + %175 = fcmp uge float %132, 0.000000e+00
> + %176 = select i1 %175, float %132, float 0.000000e+00
> + %177 = fcmp uge float %147, 0.000000e+00
> + %178 = select i1 %177, float %147, float 0.000000e+00
> + %179 = call float @llvm.pow.f32(float %178, float %24)
> + %180 = fcmp ult float %132, 0.000000e+00
> + %181 = select i1 %180, float 0.000000e+00, float %179
> + %182 = fadd float %150, %105
> + %183 = fadd float %153, %109
> + %184 = fadd float %156, %113
> + %185 = fmul float %176, %159
> + %186 = fadd float %185, %182
> + %187 = fmul float %176, %162
> + %188 = fadd float %187, %183
> + %189 = fmul float %176, %165
> + %190 = fadd float %189, %184
> + %191 = fmul float %181, %168
> + %192 = fadd float %191, %186
> + %193 = fmul float %181, %171
> + %194 = fadd float %193, %188
> + %195 = fmul float %181, %174
> + %196 = fadd float %195, %190
> + %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00)
> + %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00)
> + %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00)
> + %200 = insertelement <4 x float> undef, float %75, i32 0
> + %201 = insertelement <4 x float> %200, float %79, i32 1
> + %202 = insertelement <4 x float> %201, float %83, i32 2
> + %203 = insertelement <4 x float> %202, float %87, i32 3
> + call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1)
> + %204 = insertelement <4 x float> undef, float %197, i32 0
> + %205 = insertelement <4 x float> %204, float %198, i32 1
> + %206 = insertelement <4 x float> %205, float %199, i32 2
> + %207 = insertelement <4 x float> %206, float %117, i32 3
> + call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2)
> + ret void
> +}
> +
> +; Function Attrs: readnone
> +declare float @llvm.R600.load.input(i32) #1
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
> +
> +; Function Attrs: readonly
> +declare float @fabs(float) #2
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDGPU.rsq(float) #1
> +
> +; Function Attrs: readnone
> +declare float @llvm.AMDIL.clamp.(float, float, float) #1
> +
> +; Function Attrs: nounwind readonly
> +declare float @llvm.pow.f32(float, float) #3
> +
> +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
> +
> +attributes #0 = { "ShaderType"="1" }
> +attributes #1 = { readnone }
> +attributes #2 = { readonly }
> +attributes #3 = { nounwind readonly }
> --
> 1.8.2.1
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list