[PATCH] R600/SI: add pass to mark CF live ranges as non-spillable

Fri Apr 24 16:24:31 PDT 2015

Spilling can insert instructions almost anywhere, and this can mess
up control flow lowering in a multitude of ways, due to instruction
reordering. Let's sort this out the easy way: never spill registers
involved with control flow, i.e. saved EXEC masks.

The test was reduced from the "stacks" shader of [1]. Some issues
trigger the machine verifier while another one is checked manually.

[1] http://madebyevan.com/webgl-path-tracing/
---
 lib/Target/R600/AMDGPU.h                          |   4 +
 lib/Target/R600/AMDGPUTargetMachine.cpp           |   6 +
 lib/Target/R600/CMakeLists.txt                    |   1 +
 lib/Target/R600/SIFixControlFlowLiveIntervals.cpp |  96 +++++
 test/CodeGen/R600/si-spill-cf.ll                  | 503 ++++++++++++++++++++++
 5 files changed, 610 insertions(+)
 create mode 100644 lib/Target/R600/SIFixControlFlowLiveIntervals.cpp
 create mode 100644 test/CodeGen/R600/si-spill-cf.ll

diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index fb87cc5..9b36063 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -43,6 +43,7 @@ FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
@@ -64,6 +65,9 @@ Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
 
+void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
+extern char &SIFixControlFlowLiveIntervalsID;
+
 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
 extern char &SIFixSGPRLiveRangesID;
 
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index cb95835..b8de121 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -252,6 +252,12 @@ bool GCNPassConfig::addInstSelector() {
 
 void GCNPassConfig::addPreRegAlloc() {
   const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+
+  // This needs to be run directly before register allocation because
+  // earlier passes might recompute live intervals.
+  initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
   if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
     // Don't do this with no optimizations since it throws away debug info by
     // merging nonadjacent loads.
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 5a4bae2..3c1bc49 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_target(R600CodeGen
   R600RegisterInfo.cpp
   R600TextureIntrinsicsReplacer.cpp
   SIAnnotateControlFlow.cpp
+  SIFixControlFlowLiveIntervals.cpp
   SIFixSGPRCopies.cpp
   SIFixSGPRLiveRanges.cpp
   SIFoldOperands.cpp
diff --git a/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp b/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp
new file mode 100644
index 0000000..5fe8d19
--- /dev/null
+++ b/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp
@@ -0,0 +1,96 @@
+//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Spilling of EXEC masks used for control flow messes up control flow
+/// lowering, so mark all live intervals associated with CF instructions as
+/// non-spillable.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-cf-live-intervals"
+
+namespace {
+
+class SIFixControlFlowLiveIntervals : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) {
+    initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Fix CF Live Intervals";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+                      "SI Fix CF Live Intervals", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
+                    "SI Fix CF Live Intervals", false, false)
+
+char SIFixControlFlowLiveIntervals::ID = 0;
+
+char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID;
+
+FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() {
+  return new SIFixControlFlowLiveIntervals();
+}
+
+bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
+  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      switch (MI.getOpcode()) {
+        case AMDGPU::SI_IF:
+        case AMDGPU::SI_ELSE:
+        case AMDGPU::SI_BREAK:
+        case AMDGPU::SI_IF_BREAK:
+        case AMDGPU::SI_ELSE_BREAK:
+        case AMDGPU::SI_END_CF: {
+          unsigned Reg = MI.getOperand(0).getReg();
+          LIS->getInterval(Reg).markNotSpillable();
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+
+  return false;
+}
diff --git a/test/CodeGen/R600/si-spill-cf.ll b/test/CodeGen/R600/si-spill-cf.ll
new file mode 100644
index 0000000..66f8ed0
--- /dev/null
+++ b/test/CodeGen/R600/si-spill-cf.ll
@@ -0,0 +1,503 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+
+; If this occurs it is likely due to reordering and the restore was
+; originally supposed to happen before SI_END_CF.
+; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
+; SI-NOT: v_readlane_b32 [[SAVED]]
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+  %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+  %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+  %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+  %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+  %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+  %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+  %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+  %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+  %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+  %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+  %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+  %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+  %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+  %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+  %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+  %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+  %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+  %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+  %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+  %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+  %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+  %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+  %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+  %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+  %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+  %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+  %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+  %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+  %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+  %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+  %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+  %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+  %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+  %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+  %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+  %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+  %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+  %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+  %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+  %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+  %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+  %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+  %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+  %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+  %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+  %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+  %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+  %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+  %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+  %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+  %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+  %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+  %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+  %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+  %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+  %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+  %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+  %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+  %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+  %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+  %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+  %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+  %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+  br label %LOOP
+
+LOOP:                                             ; preds = %ENDIF2795, %main_body
+  %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
+  %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
+  %67 = icmp sgt i32 undef, 4
+  br i1 %67, label %ENDLOOP, label %ENDIF
+
+ENDLOOP:                                          ; preds = %ELSE2566, %LOOP
+  %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef)
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00)
+  ret void
+
+ENDIF:                                            ; preds = %LOOP
+  %69 = fsub float %2, undef
+  %70 = fsub float %3, undef
+  %71 = fsub float %4, undef
+  %72 = fmul float %69, 0.000000e+00
+  %73 = fmul float %70, undef
+  %74 = fmul float %71, undef
+  %75 = fsub float %6, undef
+  %76 = fsub float %7, undef
+  %77 = fmul float %75, undef
+  %78 = fmul float %76, 0.000000e+00
+  %79 = call float @llvm.minnum.f32(float %74, float %78)
+  %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00)
+  %81 = call float @llvm.maxnum.f32(float %73, float %77)
+  %82 = call float @llvm.maxnum.f32(float undef, float %79)
+  %83 = call float @llvm.minnum.f32(float %80, float %81)
+  %84 = call float @llvm.minnum.f32(float %83, float undef)
+  %85 = fsub float %14, undef
+  %86 = fsub float %15, undef
+  %87 = fsub float %16, undef
+  %88 = fmul float %85, undef
+  %89 = fmul float %86, undef
+  %90 = fmul float %87, undef
+  %91 = fsub float %17, undef
+  %92 = fsub float %18, undef
+  %93 = fsub float %19, undef
+  %94 = fmul float %91, 0.000000e+00
+  %95 = fmul float %92, undef
+  %96 = fmul float %93, undef
+  %97 = call float @llvm.minnum.f32(float %89, float %95)
+  %98 = call float @llvm.maxnum.f32(float %88, float %94)
+  %99 = call float @llvm.maxnum.f32(float %90, float %96)
+  %100 = call float @llvm.maxnum.f32(float undef, float %97)
+  %101 = call float @llvm.maxnum.f32(float %100, float undef)
+  %102 = call float @llvm.minnum.f32(float %98, float undef)
+  %103 = call float @llvm.minnum.f32(float %102, float %99)
+  %104 = fsub float %30, undef
+  %105 = fsub float %31, undef
+  %106 = fmul float %104, 0.000000e+00
+  %107 = fmul float %105, 0.000000e+00
+  %108 = call float @llvm.minnum.f32(float undef, float %106)
+  %109 = call float @llvm.maxnum.f32(float undef, float %107)
+  %110 = call float @llvm.maxnum.f32(float undef, float %108)
+  %111 = call float @llvm.maxnum.f32(float %110, float undef)
+  %112 = call float @llvm.minnum.f32(float undef, float %109)
+  %113 = fsub float %32, undef
+  %114 = fsub float %33, undef
+  %115 = fsub float %34, undef
+  %116 = fmul float %113, 0.000000e+00
+  %117 = fmul float %114, undef
+  %118 = fmul float %115, undef
+  %119 = fsub float %35, undef
+  %120 = fsub float %36, undef
+  %121 = fsub float %37, undef
+  %122 = fmul float %119, undef
+  %123 = fmul float %120, undef
+  %124 = fmul float %121, undef
+  %125 = call float @llvm.minnum.f32(float %116, float %122)
+  %126 = call float @llvm.minnum.f32(float %117, float %123)
+  %127 = call float @llvm.minnum.f32(float %118, float %124)
+  %128 = call float @llvm.maxnum.f32(float %125, float %126)
+  %129 = call float @llvm.maxnum.f32(float %128, float %127)
+  %130 = fsub float %38, undef
+  %131 = fsub float %39, undef
+  %132 = fsub float %40, undef
+  %133 = fmul float %130, 0.000000e+00
+  %134 = fmul float %131, undef
+  %135 = fmul float %132, undef
+  %136 = fsub float %41, undef
+  %137 = fsub float %42, undef
+  %138 = fsub float %43, undef
+  %139 = fmul float %136, undef
+  %140 = fmul float %137, undef
+  %141 = fmul float %138, undef
+  %142 = call float @llvm.minnum.f32(float %133, float %139)
+  %143 = call float @llvm.minnum.f32(float %134, float %140)
+  %144 = call float @llvm.minnum.f32(float %135, float %141)
+  %145 = call float @llvm.maxnum.f32(float %142, float %143)
+  %146 = call float @llvm.maxnum.f32(float %145, float %144)
+  %147 = fsub float %44, undef
+  %148 = fsub float %45, undef
+  %149 = fsub float %46, undef
+  %150 = fmul float %147, 0.000000e+00
+  %151 = fmul float %148, 0.000000e+00
+  %152 = fmul float %149, undef
+  %153 = fsub float %47, undef
+  %154 = fsub float %48, undef
+  %155 = fsub float %49, undef
+  %156 = fmul float %153, undef
+  %157 = fmul float %154, 0.000000e+00
+  %158 = fmul float %155, undef
+  %159 = call float @llvm.minnum.f32(float %150, float %156)
+  %160 = call float @llvm.minnum.f32(float %151, float %157)
+  %161 = call float @llvm.minnum.f32(float %152, float %158)
+  %162 = call float @llvm.maxnum.f32(float %159, float %160)
+  %163 = call float @llvm.maxnum.f32(float %162, float %161)
+  %164 = fsub float %50, undef
+  %165 = fsub float %51, undef
+  %166 = fsub float %52, undef
+  %167 = fmul float %164, undef
+  %168 = fmul float %165, 0.000000e+00
+  %169 = fmul float %166, 0.000000e+00
+  %170 = fsub float %53, undef
+  %171 = fsub float %54, undef
+  %172 = fsub float %55, undef
+  %173 = fdiv float 1.000000e+00, %temp18.0
+  %174 = fmul float %170, undef
+  %175 = fmul float %171, undef
+  %176 = fmul float %172, %173
+  %177 = call float @llvm.minnum.f32(float %167, float %174)
+  %178 = call float @llvm.minnum.f32(float %168, float %175)
+  %179 = call float @llvm.minnum.f32(float %169, float %176)
+  %180 = call float @llvm.maxnum.f32(float %177, float %178)
+  %181 = call float @llvm.maxnum.f32(float %180, float %179)
+  %182 = fsub float %62, undef
+  %183 = fsub float %63, undef
+  %184 = fsub float %64, undef
+  %185 = fmul float %182, 0.000000e+00
+  %186 = fmul float %183, undef
+  %187 = fmul float %184, undef
+  %188 = fsub float %65, undef
+  %189 = fsub float %66, undef
+  %190 = fmul float %188, undef
+  %191 = fmul float %189, undef
+  %192 = call float @llvm.maxnum.f32(float %185, float %190)
+  %193 = call float @llvm.maxnum.f32(float %186, float %191)
+  %194 = call float @llvm.maxnum.f32(float %187, float undef)
+  %195 = call float @llvm.minnum.f32(float %192, float %193)
+  %196 = call float @llvm.minnum.f32(float %195, float %194)
+  %.temp292.7 = select i1 undef, float %163, float undef
+  %temp292.9 = select i1 false, float %181, float %.temp292.7
+  %.temp292.9 = select i1 undef, float undef, float %temp292.9
+  %197 = fcmp ogt float undef, 0.000000e+00
+  %198 = fcmp olt float undef, %196
+  %199 = and i1 %197, %198
+  %200 = fcmp olt float undef, %.temp292.9
+  %201 = and i1 %199, %200
+  %temp292.11 = select i1 %201, float undef, float %.temp292.9
+  br i1 undef, label %IF2565, label %ELSE2566
+
+IF2565:                                           ; preds = %ENDIF
+  br i1 false, label %ENDIF2582, label %ELSE2584
+
+ELSE2566:                                         ; preds = %ENDIF
+  %202 = fcmp oeq float %temp292.11, 1.000000e+04
+  br i1 %202, label %ENDLOOP, label %ELSE2593
+
+ENDIF2564:                                        ; preds = %ENDIF2594, %ENDIF2588
+  %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
+  %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+  %203 = fsub float %5, undef
+  %204 = fmul float %203, undef
+  %205 = call float @llvm.maxnum.f32(float undef, float %204)
+  %206 = call float @llvm.minnum.f32(float %205, float undef)
+  %207 = call float @llvm.minnum.f32(float %206, float undef)
+  %208 = fcmp ogt float undef, 0.000000e+00
+  %209 = fcmp olt float undef, 1.000000e+00
+  %210 = and i1 %208, %209
+  %211 = fcmp olt float undef, %207
+  %212 = and i1 %210, %211
+  br i1 %212, label %ENDIF2795, label %ELSE2797
+
+ELSE2584:                                         ; preds = %IF2565
+  br label %ENDIF2582
+
+ENDIF2582:                                        ; preds = %ELSE2584, %IF2565
+  %213 = fadd float %1, undef
+  %214 = fadd float 0.000000e+00, %213
+  %215 = call float @llvm.AMDIL.fraction.(float %214)
+  br i1 undef, label %IF2589, label %ELSE2590
+
+IF2589:                                           ; preds = %ENDIF2582
+  br label %ENDIF2588
+
+ELSE2590:                                         ; preds = %ENDIF2582
+  br label %ENDIF2588
+
+ENDIF2588:                                        ; preds = %ELSE2590, %IF2589
+  %216 = fsub float 1.000000e+00, %215
+  %217 = call float @llvm.sqrt.f32(float %216)
+  %218 = fmul float %217, undef
+  %219 = fadd float %218, undef
+  br label %ENDIF2564
+
+ELSE2593:                                         ; preds = %ELSE2566
+  %220 = fcmp oeq float %temp292.11, %82
+  %221 = fcmp olt float %82, %84
+  %222 = and i1 %220, %221
+  br i1 %222, label %ENDIF2594, label %ELSE2596
+
+ELSE2596:                                         ; preds = %ELSE2593
+  %223 = fcmp oeq float %temp292.11, %101
+  %224 = fcmp olt float %101, %103
+  %225 = and i1 %223, %224
+  br i1 %225, label %ENDIF2594, label %ELSE2632
+
+ENDIF2594:                                        ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
+  %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
+  %226 = fmul float %temp894.2, undef
+  br label %ENDIF2564
+
+ELSE2632:                                         ; preds = %ELSE2596
+  br i1 undef, label %ENDIF2594, label %ELSE2650
+
+ELSE2650:                                         ; preds = %ELSE2632
+  %227 = fcmp oeq float %temp292.11, %111
+  %228 = fcmp olt float %111, %112
+  %229 = and i1 %227, %228
+  br i1 %229, label %IF2667, label %ELSE2668
+
+IF2667:                                           ; preds = %ELSE2650
+  br i1 undef, label %ENDIF2594, label %ELSE2671
+
+ELSE2668:                                         ; preds = %ELSE2650
+  %230 = fcmp oeq float %temp292.11, %129
+  %231 = fcmp olt float %129, undef
+  %232 = and i1 %230, %231
+  br i1 %232, label %ENDIF2594, label %ELSE2686
+
+ELSE2671:                                         ; preds = %IF2667
+  br label %ENDIF2594
+
+ELSE2686:                                         ; preds = %ELSE2668
+  %233 = fcmp oeq float %temp292.11, %146
+  %234 = fcmp olt float %146, undef
+  %235 = and i1 %233, %234
+  br i1 %235, label %ENDIF2594, label %ELSE2704
+
+ELSE2704:                                         ; preds = %ELSE2686
+  %236 = fcmp oeq float %temp292.11, %181
+  %237 = fcmp olt float %181, undef
+  %238 = and i1 %236, %237
+  br i1 %238, label %ENDIF2594, label %ELSE2740
+
+ELSE2740:                                         ; preds = %ELSE2704
+  br i1 undef, label %IF2757, label %ELSE2758
+
+IF2757:                                           ; preds = %ELSE2740
+  br i1 undef, label %ENDIF2594, label %ELSE2761
+
+ELSE2758:                                         ; preds = %ELSE2740
+  br i1 undef, label %IF2775, label %ENDIF2594
+
+ELSE2761:                                         ; preds = %IF2757
+  br label %ENDIF2594
+
+IF2775:                                           ; preds = %ELSE2758
+  %239 = fcmp olt float undef, undef
+  br i1 %239, label %ENDIF2594, label %ELSE2779
+
+ELSE2779:                                         ; preds = %IF2775
+  br i1 undef, label %ENDIF2594, label %ELSE2782
+
+ELSE2782:                                         ; preds = %ELSE2779
+  br i1 undef, label %ENDIF2594, label %ELSE2785
+
+ELSE2785:                                         ; preds = %ELSE2782
+  %240 = fcmp olt float undef, 0.000000e+00
+  br i1 %240, label %ENDIF2594, label %ELSE2788
+
+ELSE2788:                                         ; preds = %ELSE2785
+  %241 = fcmp olt float 0.000000e+00, undef
+  %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00
+  br label %ENDIF2594
+
+ELSE2797:                                         ; preds = %ENDIF2564
+  %242 = fsub float %8, undef
+  %243 = fsub float %9, undef
+  %244 = fsub float %10, undef
+  %245 = fmul float %242, undef
+  %246 = fmul float %243, undef
+  %247 = fmul float %244, undef
+  %248 = fsub float %11, undef
+  %249 = fsub float %12, undef
+  %250 = fsub float %13, undef
+  %251 = fmul float %248, undef
+  %252 = fmul float %249, undef
+  %253 = fmul float %250, undef
+  %254 = call float @llvm.minnum.f32(float %245, float %251)
+  %255 = call float @llvm.minnum.f32(float %246, float %252)
+  %256 = call float @llvm.maxnum.f32(float %247, float %253)
+  %257 = call float @llvm.maxnum.f32(float %254, float %255)
+  %258 = call float @llvm.maxnum.f32(float %257, float undef)
+  %259 = call float @llvm.minnum.f32(float undef, float %256)
+  %260 = fcmp ogt float %258, 0.000000e+00
+  %261 = fcmp olt float %258, 1.000000e+00
+  %262 = and i1 %260, %261
+  %263 = fcmp olt float %258, %259
+  %264 = and i1 %262, %263
+  br i1 %264, label %ENDIF2795, label %ELSE2800
+
+ENDIF2795:                                        ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
+  br label %LOOP
+
+ELSE2800:                                         ; preds = %ELSE2797
+  br i1 undef, label %ENDIF2795, label %ELSE2803
+
+ELSE2803:                                         ; preds = %ELSE2800
+  %265 = fsub float %20, undef
+  %266 = fsub float %21, undef
+  %267 = fsub float %22, undef
+  %268 = fmul float %265, undef
+  %269 = fmul float %266, undef
+  %270 = fmul float %267, 0.000000e+00
+  %271 = fsub float %23, undef
+  %272 = fsub float %24, undef
+  %273 = fsub float %25, undef
+  %274 = fmul float %271, undef
+  %275 = fmul float %272, undef
+  %276 = fmul float %273, undef
+  %277 = call float @llvm.minnum.f32(float %268, float %274)
+  %278 = call float @llvm.maxnum.f32(float %269, float %275)
+  %279 = call float @llvm.maxnum.f32(float %270, float %276)
+  %280 = call float @llvm.maxnum.f32(float %277, float undef)
+  %281 = call float @llvm.maxnum.f32(float %280, float undef)
+  %282 = call float @llvm.minnum.f32(float undef, float %278)
+  %283 = call float @llvm.minnum.f32(float %282, float %279)
+  %284 = fcmp ogt float %281, 0.000000e+00
+  %285 = fcmp olt float %281, 1.000000e+00
+  %286 = and i1 %284, %285
+  %287 = fcmp olt float %281, %283
+  %288 = and i1 %286, %287
+  br i1 %288, label %ENDIF2795, label %ELSE2806
+
+ELSE2806:                                         ; preds = %ELSE2803
+  %289 = fsub float %26, undef
+  %290 = fsub float %27, undef
+  %291 = fsub float %28, undef
+  %292 = fmul float %289, undef
+  %293 = fmul float %290, 0.000000e+00
+  %294 = fmul float %291, undef
+  %295 = fsub float %29, undef
+  %296 = fmul float %295, undef
+  %297 = call float @llvm.minnum.f32(float %292, float %296)
+  %298 = call float @llvm.minnum.f32(float %293, float undef)
+  %299 = call float @llvm.maxnum.f32(float %294, float undef)
+  %300 = call float @llvm.maxnum.f32(float %297, float %298)
+  %301 = call float @llvm.maxnum.f32(float %300, float undef)
+  %302 = call float @llvm.minnum.f32(float undef, float %299)
+  %303 = fcmp ogt float %301, 0.000000e+00
+  %304 = fcmp olt float %301, 1.000000e+00
+  %305 = and i1 %303, %304
+  %306 = fcmp olt float %301, %302
+  %307 = and i1 %305, %306
+  br i1 %307, label %ENDIF2795, label %ELSE2809
+
+ELSE2809:                                         ; preds = %ELSE2806
+  br i1 undef, label %ENDIF2795, label %ELSE2812
+
+ELSE2812:                                         ; preds = %ELSE2809
+  br i1 undef, label %ENDIF2795, label %ELSE2815
+
+ELSE2815:                                         ; preds = %ELSE2812
+  br i1 undef, label %ENDIF2795, label %ELSE2818
+
+ELSE2818:                                         ; preds = %ELSE2815
+  br i1 undef, label %ENDIF2795, label %ELSE2821
+
+ELSE2821:                                         ; preds = %ELSE2818
+  %308 = fsub float %56, undef
+  %309 = fsub float %57, undef
+  %310 = fsub float %58, undef
+  %311 = fmul float %308, undef
+  %312 = fmul float %309, 0.000000e+00
+  %313 = fmul float %310, undef
+  %314 = fsub float %59, undef
+  %315 = fsub float %60, undef
+  %316 = fsub float %61, undef
+  %317 = fmul float %314, undef
+  %318 = fmul float %315, undef
+  %319 = fmul float %316, undef
+  %320 = call float @llvm.maxnum.f32(float %311, float %317)
+  %321 = call float @llvm.maxnum.f32(float %312, float %318)
+  %322 = call float @llvm.maxnum.f32(float %313, float %319)
+  %323 = call float @llvm.minnum.f32(float %320, float %321)
+  %324 = call float @llvm.minnum.f32(float %323, float %322)
+  %325 = fcmp ogt float undef, 0.000000e+00
+  %326 = fcmp olt float undef, 1.000000e+00
+  %327 = and i1 %325, %326
+  %328 = fcmp olt float undef, %324
+  %329 = and i1 %327, %328
+  br i1 %329, label %ENDIF2795, label %ELSE2824
+
+ELSE2824:                                         ; preds = %ELSE2821
+  %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
+  br label %ENDIF2795
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.fraction.(float) #2
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.minnum.f32(float, float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.maxnum.f32(float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
-- 
1.9.1