[llvm] Swedev 414443 (PR #65947)

via llvm-commits llvm-commits@lists.llvm.org
Mon Sep 11 03:58:07 PDT 2023


https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/65947:


From 2dda14f6e55bac312b44f45da42a79c111498491 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 7 Sep 2023 15:25:34 +0200
Subject: [PATCH 1/2] MachineSink/AMDGPU: Add test for SWEDEV-414443

---
 .../AMDGPU/machine-sink-swdev414443.mir       | 4102 +++++++++++++++++
 1 file changed, 4102 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir

diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir
new file mode 100644
index 00000000000000..84fc2a619a5c70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir
@@ -0,0 +1,4102 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-sink -o - %s | FileCheck %s
+
+--- |
+  source_filename = "/work/mselehov/rocBLAS/library/src/blas2/rocblas_gemv_kernels.cpp"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+  target triple = "amdgcn-amd-amdhsa"
+
+  %llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds.t = type { [4096 x float] }
+  %llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EmffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds.t = type { [4096 x float] }
+
+  $_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil = comdat any
+
+  $_ZL20rocblas_gemvn_kernelILi64ELi16EmffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil = comdat any
+
+  @llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds = internal addrspace(3) global %llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds.t undef, align 16, !absolute_symbol !0
+  @llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EmffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds = internal addrspace(3) global %llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EmffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds.t undef, align 16, !absolute_symbol !0
+
+  define amdgpu_kernel void @_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil(i32 noundef %0, i32 noundef %1, float noundef %2, i64 noundef %3, ptr addrspace(1) nocapture noundef readonly %4, i64 noundef %5, i32 noundef %6, i64 noundef %7, ptr addrspace(1) nocapture noundef readonly %8, i64 noundef %9, i32 noundef %10, i64 noundef %11, float noundef %12, i64 noundef %13, ptr addrspace(1) nocapture noundef %14, i64 noundef %15, i32 noundef %16, i64 noundef %17) local_unnamed_addr #0 comdat {
+    %19 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+    %20 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr(), !amdgpu.uniform !5
+    %21 = load <3 x i32>, ptr addrspace(4) %20, align 4, !tbaa !6
+    %22 = extractelement <3 x i32> %21, i32 0
+    %23 = extractelement <3 x i32> %21, i32 1
+    %24 = extractelement <3 x i32> %21, i32 2
+    %25 = icmp ult i32 %19, %22
+    %26 = select i1 %25, i64 6, i64 9
+    %27 = getelementptr inbounds i16, ptr addrspace(4) %20, i64 %26, !amdgpu.uniform !5
+    %28 = load i16, ptr addrspace(4) %27, align 2, !tbaa !10
+    %29 = zext i16 %28 to i32
+    %30 = tail call i32 @llvm.amdgcn.workgroup.id.y()
+    %31 = icmp ult i32 %30, %23
+    %32 = select i1 %31, i64 7, i64 10
+    %33 = getelementptr inbounds i16, ptr addrspace(4) %20, i64 %32, !amdgpu.uniform !5
+    %34 = load i16, ptr addrspace(4) %33, align 2, !tbaa !10
+    %35 = zext i16 %34 to i32
+    %36 = mul nuw i32 %35, %29
+    %37 = tail call i32 @llvm.amdgcn.workgroup.id.z()
+    %38 = icmp ult i32 %37, %24
+    %39 = select i1 %38, i64 8, i64 11
+    %40 = getelementptr inbounds i16, ptr addrspace(4) %20, i64 %39, !amdgpu.uniform !5
+    %41 = load i16, ptr addrspace(4) %40, align 2, !tbaa !10
+    %42 = zext i16 %41 to i32
+    %43 = mul i32 %36, %42
+    %.not = icmp eq i32 %43, 1024
+    br i1 %.not, label %44, label %_Z25rocblas_gemvn_kernel_calcILi64ELi16EiffLi0EEviiT3_PKT2_T1_S3_iS0_PS1_i.exit, !amdgpu.uniform !5
+
+  44:                                               ; preds = %18
+    %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment = call nonnull align 16 dereferenceable(392) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %.kernarg.offset65 = bitcast ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment to ptr addrspace(4), !amdgpu.uniform !5
+    %45 = load <3 x i32>, ptr addrspace(4) %.kernarg.offset65, align 16, !invariant.load !5
+    %.load3868 = extractelement <3 x i32> %45, i32 2
+    %46 = bitcast i32 %.load3868 to float
+    %.kernarg.offset55 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 88, !amdgpu.uniform !5
+    %.load56 = load float, ptr addrspace(4) %.kernarg.offset55, align 8, !invariant.load !5
+    %47 = fcmp contract oeq float %46, 0.000000e+00
+    %48 = fcmp contract oeq float %.load56, 1.000000e+00
+    %or.cond = and i1 %47, %48
+    %or.cond.inv = xor i1 %or.cond, true
+    br i1 %or.cond.inv, label %49, label %Flow97, !amdgpu.uniform !5
+
+  49:                                               ; preds = %44
+    %.load66 = extractelement <3 x i32> %45, i32 0
+    %.kernarg.offset57 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 104, !amdgpu.uniform !5
+    %50 = load <2 x i64>, ptr addrspace(4) %.kernarg.offset57, align 8, !invariant.load !5
+    %.load5874 = extractelement <2 x i64> %50, i32 0
+    %51 = inttoptr i64 %.load5874 to ptr addrspace(1)
+    %.load6075 = extractelement <2 x i64> %50, i32 1
+    %.kernarg.offset61 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 120, !amdgpu.uniform !5
+    %.load62 = load i32, ptr addrspace(4) %.kernarg.offset61, align 8, !invariant.load !5
+    %.kernarg.offset63 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 128, !amdgpu.uniform !5
+    %.load64 = load i64, ptr addrspace(4) %.kernarg.offset63, align 16, !invariant.load !5
+    %52 = sext i32 %30 to i64
+    %53 = mul nsw i64 %52, %.load64
+    %54 = getelementptr inbounds float, ptr addrspace(1) %51, i64 %53
+    %55 = getelementptr inbounds float, ptr addrspace(1) %54, i64 %.load6075
+    %56 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !12, !noundef !5
+    %57 = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !12, !noundef !5
+    %58 = call i32 @llvm.amdgcn.mul.u24(i32 %57, i32 %29)
+    %59 = add nuw nsw i32 %58, %56
+    %60 = fcmp contract oeq float %46, 0.000000e+00
+    br i1 %60, label %61, label %Flow90, !amdgpu.uniform !5
+
+  61:                                               ; preds = %49
+    %62 = icmp ult i32 %59, 256
+    %63 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %62)
+    %64 = extractvalue { i1, i64 } %63, 0
+    %65 = extractvalue { i1, i64 } %63, 1
+    br i1 %64, label %70, label %Flow91
+
+  Flow90:                                           ; preds = %Flow91, %49
+    %66 = phi float [ %77, %Flow91 ], [ undef, %49 ]
+    %67 = phi i64 [ %78, %Flow91 ], [ undef, %49 ]
+    %68 = phi i1 [ %79, %Flow91 ], [ false, %49 ]
+    %69 = phi i1 [ false, %Flow91 ], [ true, %49 ]
+    br i1 %69, label %91, label %Flow93, !amdgpu.uniform !5
+
+  70:                                               ; preds = %61
+    %71 = shl i32 %19, 8
+    %72 = add nuw i32 %59, %71
+    %73 = icmp slt i32 %72, %.load66
+    %74 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %73)
+    %75 = extractvalue { i1, i64 } %74, 0
+    %76 = extractvalue { i1, i64 } %74, 1
+    br i1 %75, label %80, label %Flow92
+
+  Flow91:                                           ; preds = %Flow92, %61
+    %77 = phi float [ %88, %Flow92 ], [ undef, %61 ]
+    %78 = phi i64 [ %89, %Flow92 ], [ undef, %61 ]
+    %79 = phi i1 [ %90, %Flow92 ], [ false, %61 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %65)
+    br label %Flow90, !amdgpu.uniform !5
+
+  80:                                               ; preds = %70
+    %81 = fcmp contract une float %.load56, 0.000000e+00
+    %82 = mul nsw i32 %72, %.load62
+    %83 = sext i32 %82 to i64
+    br i1 %81, label %84, label %Flow, !amdgpu.uniform !5
+
+  84:                                               ; preds = %80
+    %85 = getelementptr inbounds float, ptr addrspace(1) %55, i64 %83
+    %86 = load float, ptr addrspace(1) %85, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %87 = fmul contract float %86, %.load56
+    br label %Flow, !amdgpu.uniform !5
+
+  Flow92:                                           ; preds = %Flow, %70
+    %88 = phi float [ %511, %Flow ], [ undef, %70 ]
+    %89 = phi i64 [ %83, %Flow ], [ undef, %70 ]
+    %90 = phi i1 [ true, %Flow ], [ false, %70 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %76)
+    br label %Flow91, !amdgpu.uniform !5
+
+  91:                                               ; preds = %Flow90
+    %.kernarg.offset39 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 24, !amdgpu.uniform !5
+    %92 = load <2 x i64>, ptr addrspace(4) %.kernarg.offset39, align 8, !invariant.load !5
+    %.load4069 = extractelement <2 x i64> %92, i32 0
+    %93 = inttoptr i64 %.load4069 to ptr addrspace(1)
+    %.load4270 = extractelement <2 x i64> %92, i32 1
+    %.kernarg.offset45 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 48, !amdgpu.uniform !5
+    %94 = load <3 x i64>, ptr addrspace(4) %.kernarg.offset45, align 16, !invariant.load !5
+    %.load4671 = extractelement <3 x i64> %94, i32 0
+    %.load4872 = extractelement <3 x i64> %94, i32 1
+    %95 = inttoptr i64 %.load4872 to ptr addrspace(1)
+    %.load5073 = extractelement <3 x i64> %94, i32 2
+    %.kernarg.offset53 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 80, !amdgpu.uniform !5
+    %.load54 = load i64, ptr addrspace(4) %.kernarg.offset53, align 16, !invariant.load !5
+    %96 = mul nsw i64 %52, %.load4671
+    %97 = getelementptr inbounds float, ptr addrspace(1) %93, i64 %96
+    %98 = getelementptr inbounds float, ptr addrspace(1) %97, i64 %.load4270
+    %99 = mul i64 %52, %.load54
+    %100 = getelementptr inbounds float, ptr addrspace(1) %95, i64 %99
+    %101 = getelementptr inbounds float, ptr addrspace(1) %100, i64 %.load5073
+    %.load3667 = extractelement <3 x i32> %45, i32 1
+    %.kernarg.offset43 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 40, !amdgpu.uniform !5
+    %.load44 = load i32, ptr addrspace(4) %.kernarg.offset43, align 8, !invariant.load !5
+    %.kernarg.offset51 = getelementptr inbounds i8, ptr addrspace(4) %_ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.kernarg.segment, i64 72, !amdgpu.uniform !5
+    %.load52 = load i32, ptr addrspace(4) %.kernarg.offset51, align 8, !invariant.load !5
+    %102 = shl i32 %19, 8
+    %103 = add i32 %102, %56
+    %104 = srem i32 %.load3667, 64
+    %105 = shl nuw nsw i32 %57, 2
+    %106 = sub nsw i32 %.load3667, %104
+    %107 = icmp slt i32 %105, %106
+    %108 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %107)
+    %109 = extractvalue { i1, i64 } %108, 0
+    %110 = extractvalue { i1, i64 } %108, 1
+    br i1 %109, label %.lr.ph.i, label %Flow89
+
+  .lr.ph.i:                                         ; preds = %91
+    %111 = icmp slt i32 %103, %.load66
+    %112 = add nsw i32 %103, 64
+    %113 = icmp slt i32 %112, %.load66
+    %114 = add nsw i32 %103, 128
+    %115 = icmp slt i32 %114, %.load66
+    %116 = add nsw i32 %103, 192
+    %117 = icmp slt i32 %116, %.load66
+    %118 = add nuw nsw i32 %105, 1
+    %119 = mul i32 %.load44, %118
+    %120 = shl i32 %.load44, 6
+    %121 = add nuw nsw i32 %105, 2
+    %122 = mul i32 %.load44, %121
+    %123 = add nuw nsw i32 %105, 3
+    %124 = mul i32 %.load44, %123
+    %125 = mul i32 %57, %.load44
+    %126 = shl i32 %125, 2
+    %127 = mul i32 %.load52, %118
+    %128 = shl i32 %.load52, 6
+    %129 = mul i32 %.load52, %121
+    %130 = mul i32 %.load52, %123
+    %131 = shl i64 %99, 2
+    %132 = shl i64 %.load5073, 2
+    %133 = add i64 %131, %132
+    %134 = mul i32 %57, %.load52
+    %135 = shl i32 %134, 2
+    %136 = sext i32 %135 to i64
+    %137 = shl nsw i64 %136, 2
+    %138 = add i64 %133, %137
+    %scevgep = getelementptr i8, ptr addrspace(1) %95, i64 %138
+    %139 = sext i32 %128 to i64
+    %140 = shl nsw i64 %139, 2
+    br label %146, !amdgpu.uniform !5
+
+  Flow89:                                           ; preds = %Flow88, %91
+    %141 = phi i32 [ %.lcssa98, %Flow88 ], [ %105, %91 ]
+    %142 = phi float [ %.sroa.095.1.i.lcssa, %Flow88 ], [ 0.000000e+00, %91 ]
+    %143 = phi float [ %.sroa.20.1.i.lcssa, %Flow88 ], [ 0.000000e+00, %91 ]
+    %144 = phi float [ %.sroa.38.1.i.lcssa, %Flow88 ], [ 0.000000e+00, %91 ]
+    %145 = phi float [ %.sroa.56.1.i.lcssa, %Flow88 ], [ 0.000000e+00, %91 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %110)
+    br label %._crit_edge.i, !amdgpu.uniform !5
+
+  146:                                              ; preds = %.lr.ph.i, %254
+    %phi.broken = phi i64 [ 0, %.lr.ph.i ], [ %257, %254 ]
+    %lsr.iv33 = phi ptr addrspace(1) [ %scevgep34, %254 ], [ %scevgep, %.lr.ph.i ]
+    %lsr.iv31 = phi i32 [ %lsr.iv.next32, %254 ], [ 0, %.lr.ph.i ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %254 ], [ %103, %.lr.ph.i ]
+    %.0318342.i = phi i32 [ %255, %254 ], [ %105, %.lr.ph.i ]
+    %.sroa.095.0341.i = phi float [ %.sroa.095.1.i, %254 ], [ 0.000000e+00, %.lr.ph.i ]
+    %.sroa.20.0340.i = phi float [ %.sroa.20.1.i, %254 ], [ 0.000000e+00, %.lr.ph.i ]
+    %.sroa.38.0339.i = phi float [ %.sroa.38.1.i, %254 ], [ 0.000000e+00, %.lr.ph.i ]
+    %.sroa.56.0338.i = phi float [ %.sroa.56.1.i, %254 ], [ 0.000000e+00, %.lr.ph.i ]
+    %147 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %111)
+    %148 = extractvalue { i1, i64 } %147, 0
+    %149 = extractvalue { i1, i64 } %147, 1
+    br i1 %148, label %150, label %254
+
+  150:                                              ; preds = %146
+    %151 = load float, ptr addrspace(1) %lsr.iv33, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %152 = add i32 %127, %lsr.iv31
+    %153 = sext i32 %152 to i64
+    %154 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %153
+    %155 = load float, ptr addrspace(1) %154, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %156 = add i32 %129, %lsr.iv31
+    %157 = sext i32 %156 to i64
+    %158 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %157
+    %159 = load float, ptr addrspace(1) %158, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %160 = add i32 %130, %lsr.iv31
+    %161 = sext i32 %160 to i64
+    %162 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %161
+    %163 = load float, ptr addrspace(1) %162, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %164 = add i32 %126, %lsr.iv
+    %165 = sext i32 %164 to i64
+    %166 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %165
+    %167 = load float, ptr addrspace(1) %166, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %168 = fmul contract float %151, %167
+    %169 = fadd contract float %.sroa.095.0341.i, %168
+    %170 = add i32 %119, %lsr.iv
+    %171 = sext i32 %170 to i64
+    %172 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %171
+    %173 = load float, ptr addrspace(1) %172, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %174 = fmul contract float %155, %173
+    %175 = fadd contract float %169, %174
+    %176 = add i32 %122, %lsr.iv
+    %177 = sext i32 %176 to i64
+    %178 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %177
+    %179 = load float, ptr addrspace(1) %178, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %180 = fmul contract float %159, %179
+    %181 = fadd contract float %175, %180
+    %182 = add i32 %124, %lsr.iv
+    %183 = sext i32 %182 to i64
+    %184 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %183
+    %185 = load float, ptr addrspace(1) %184, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %186 = fmul contract float %163, %185
+    %187 = fadd contract float %181, %186
+    %188 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %113)
+    %189 = extractvalue { i1, i64 } %188, 0
+    %190 = extractvalue { i1, i64 } %188, 1
+    br i1 %189, label %191, label %Flow87
+
+  191:                                              ; preds = %150
+    %192 = getelementptr inbounds float, ptr addrspace(1) %166, i64 64
+    %193 = load float, ptr addrspace(1) %192, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %194 = fmul contract float %151, %193
+    %195 = fadd contract float %.sroa.20.0340.i, %194
+    %196 = getelementptr inbounds float, ptr addrspace(1) %172, i64 64
+    %197 = load float, ptr addrspace(1) %196, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %198 = fmul contract float %155, %197
+    %199 = fadd contract float %195, %198
+    %200 = getelementptr inbounds float, ptr addrspace(1) %178, i64 64
+    %201 = load float, ptr addrspace(1) %200, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %202 = fmul contract float %159, %201
+    %203 = fadd contract float %199, %202
+    %204 = getelementptr inbounds float, ptr addrspace(1) %184, i64 64
+    %205 = load float, ptr addrspace(1) %204, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %206 = fmul contract float %163, %205
+    %207 = fadd contract float %203, %206
+    %208 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %115)
+    %209 = extractvalue { i1, i64 } %208, 0
+    %210 = extractvalue { i1, i64 } %208, 1
+    br i1 %209, label %211, label %Flow86
+
+  211:                                              ; preds = %191
+    %212 = getelementptr inbounds float, ptr addrspace(1) %166, i64 128
+    %213 = load float, ptr addrspace(1) %212, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %214 = fmul contract float %151, %213
+    %215 = fadd contract float %.sroa.38.0339.i, %214
+    %216 = getelementptr inbounds float, ptr addrspace(1) %172, i64 128
+    %217 = load float, ptr addrspace(1) %216, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %218 = fmul contract float %155, %217
+    %219 = fadd contract float %215, %218
+    %220 = getelementptr inbounds float, ptr addrspace(1) %178, i64 128
+    %221 = load float, ptr addrspace(1) %220, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %222 = fmul contract float %159, %221
+    %223 = fadd contract float %219, %222
+    %224 = getelementptr inbounds float, ptr addrspace(1) %184, i64 128
+    %225 = load float, ptr addrspace(1) %224, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %226 = fmul contract float %163, %225
+    %227 = fadd contract float %223, %226
+    %228 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %117)
+    %229 = extractvalue { i1, i64 } %228, 0
+    %230 = extractvalue { i1, i64 } %228, 1
+    br i1 %229, label %231, label %Flow85
+
+  231:                                              ; preds = %211
+    %232 = getelementptr inbounds float, ptr addrspace(1) %166, i64 192
+    %233 = load float, ptr addrspace(1) %232, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %234 = fmul contract float %151, %233
+    %235 = fadd contract float %.sroa.56.0338.i, %234
+    %236 = getelementptr inbounds float, ptr addrspace(1) %172, i64 192
+    %237 = load float, ptr addrspace(1) %236, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %238 = fmul contract float %155, %237
+    %239 = fadd contract float %235, %238
+    %240 = getelementptr inbounds float, ptr addrspace(1) %178, i64 192
+    %241 = load float, ptr addrspace(1) %240, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %242 = fmul contract float %159, %241
+    %243 = fadd contract float %239, %242
+    %244 = getelementptr inbounds float, ptr addrspace(1) %184, i64 192
+    %245 = load float, ptr addrspace(1) %244, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %246 = fmul contract float %163, %245
+    %247 = fadd contract float %243, %246
+    br label %Flow85, !amdgpu.uniform !5
+
+  Flow85:                                           ; preds = %231, %211
+    %248 = phi float [ %247, %231 ], [ %.sroa.56.0338.i, %211 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %230)
+    br label %Flow86, !amdgpu.uniform !5
+
+  Flow86:                                           ; preds = %Flow85, %191
+    %249 = phi float [ %227, %Flow85 ], [ %.sroa.38.0339.i, %191 ]
+    %250 = phi float [ %248, %Flow85 ], [ %.sroa.56.0338.i, %191 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %210)
+    br label %Flow87, !amdgpu.uniform !5
+
+  Flow87:                                           ; preds = %Flow86, %150
+    %251 = phi float [ %207, %Flow86 ], [ %.sroa.20.0340.i, %150 ]
+    %252 = phi float [ %249, %Flow86 ], [ %.sroa.38.0339.i, %150 ]
+    %253 = phi float [ %250, %Flow86 ], [ %.sroa.56.0338.i, %150 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %190)
+    br label %254, !amdgpu.uniform !5
+
+  254:                                              ; preds = %146, %Flow87
+    %.sroa.56.1.i = phi float [ %.sroa.56.0338.i, %146 ], [ %253, %Flow87 ]
+    %.sroa.38.1.i = phi float [ %.sroa.38.0339.i, %146 ], [ %252, %Flow87 ]
+    %.sroa.20.1.i = phi float [ %.sroa.20.0340.i, %146 ], [ %251, %Flow87 ]
+    %.sroa.095.1.i = phi float [ %.sroa.095.0341.i, %146 ], [ %187, %Flow87 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %149)
+    %255 = add nuw nsw i32 %.0318342.i, 64
+    %lsr.iv.next = add i32 %lsr.iv, %120
+    %lsr.iv.next32 = add i32 %lsr.iv31, %128
+    %scevgep34 = getelementptr i8, ptr addrspace(1) %lsr.iv33, i64 %140
+    %256 = icmp sge i32 %255, %106
+    %257 = call i64 @llvm.amdgcn.if.break.i64(i1 %256, i64 %phi.broken)
+    %258 = call i1 @llvm.amdgcn.loop.i64(i64 %257)
+    br i1 %258, label %Flow88, label %146
+
+  Flow88:                                           ; preds = %254
+    %.sroa.56.1.i.lcssa = phi float [ %.sroa.56.1.i, %254 ]
+    %.sroa.38.1.i.lcssa = phi float [ %.sroa.38.1.i, %254 ]
+    %.sroa.20.1.i.lcssa = phi float [ %.sroa.20.1.i, %254 ]
+    %.sroa.095.1.i.lcssa = phi float [ %.sroa.095.1.i, %254 ]
+    %.lcssa98 = phi i32 [ %255, %254 ]
+    %.lcssa = phi i64 [ %257, %254 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %.lcssa)
+    br label %Flow89, !amdgpu.uniform !5
+
+  Flow93:                                           ; preds = %Flow94, %Flow90
+    %259 = phi float [ %498, %Flow94 ], [ %66, %Flow90 ]
+    %260 = phi i64 [ %499, %Flow94 ], [ %67, %Flow90 ]
+    %261 = phi i1 [ %500, %Flow94 ], [ %68, %Flow90 ]
+    %262 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %261)
+    %263 = extractvalue { i1, i64 } %262, 0
+    %264 = extractvalue { i1, i64 } %262, 1
+    br i1 %263, label %.sink.split.i, label %Flow96
+
+  ._crit_edge.i:                                    ; preds = %Flow89
+    %265 = icmp sgt i32 %104, 0
+    br i1 %265, label %266, label %Flow84, !amdgpu.uniform !5
+
+  266:                                              ; preds = %._crit_edge.i
+    %267 = icmp slt i32 %141, %.load3667
+    %268 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %267)
+    %269 = extractvalue { i1, i64 } %268, 0
+    %270 = extractvalue { i1, i64 } %268, 1
+    br i1 %269, label %271, label %316
+
+  271:                                              ; preds = %266
+    %272 = mul nsw i32 %141, %.load52
+    %273 = sext i32 %272 to i64
+    %274 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %273
+    %275 = load float, ptr addrspace(1) %274, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %276 = or i32 %141, 1
+    %277 = icmp slt i32 %276, %.load3667
+    %278 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %277)
+    %279 = extractvalue { i1, i64 } %278, 0
+    %280 = extractvalue { i1, i64 } %278, 1
+    br i1 %279, label %281, label %Flow83
+
+  281:                                              ; preds = %271
+    %282 = mul nsw i32 %276, %.load52
+    %283 = sext i32 %282 to i64
+    %284 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %283
+    %285 = load float, ptr addrspace(1) %284, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %286 = or i32 %141, 2
+    %287 = icmp slt i32 %286, %.load3667
+    %288 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %287)
+    %289 = extractvalue { i1, i64 } %288, 0
+    %290 = extractvalue { i1, i64 } %288, 1
+    br i1 %289, label %291, label %Flow82
+
+  291:                                              ; preds = %281
+    %292 = mul nsw i32 %286, %.load52
+    %293 = sext i32 %292 to i64
+    %294 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %293
+    %295 = load float, ptr addrspace(1) %294, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %296 = or i32 %141, 3
+    %297 = icmp slt i32 %296, %.load3667
+    %298 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %297)
+    %299 = extractvalue { i1, i64 } %298, 0
+    %300 = extractvalue { i1, i64 } %298, 1
+    br i1 %299, label %301, label %Flow81
+
+  301:                                              ; preds = %291
+    %302 = mul nsw i32 %296, %.load52
+    %303 = sext i32 %302 to i64
+    %304 = getelementptr inbounds float, ptr addrspace(1) %101, i64 %303
+    %305 = load float, ptr addrspace(1) %304, align 4, !tbaa !13, !amdgpu.noclobber !5
+    br label %Flow81, !amdgpu.uniform !5
+
+  Flow81:                                           ; preds = %301, %291
+    %306 = phi float [ %305, %301 ], [ 0.000000e+00, %291 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %300)
+    br label %Flow82, !amdgpu.uniform !5
+
+  Flow82:                                           ; preds = %Flow81, %281
+    %307 = phi float [ %295, %Flow81 ], [ 0.000000e+00, %281 ]
+    %308 = phi float [ %306, %Flow81 ], [ 0.000000e+00, %281 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %290)
+    br label %Flow83, !amdgpu.uniform !5
+
+  Flow83:                                           ; preds = %Flow82, %271
+    %309 = phi float [ %285, %Flow82 ], [ 0.000000e+00, %271 ]
+    %310 = phi float [ %307, %Flow82 ], [ 0.000000e+00, %271 ]
+    %311 = phi float [ %308, %Flow82 ], [ 0.000000e+00, %271 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %280)
+    br label %316, !amdgpu.uniform !5
+
+  Flow84:                                           ; preds = %Flow80, %._crit_edge.i
+    %312 = phi float [ %432, %Flow80 ], [ %142, %._crit_edge.i ]
+    %313 = phi float [ %433, %Flow80 ], [ %143, %._crit_edge.i ]
+    %314 = phi float [ %434, %Flow80 ], [ %144, %._crit_edge.i ]
+    %315 = phi float [ %435, %Flow80 ], [ %145, %._crit_edge.i ]
+    br label %436, !amdgpu.uniform !5
+
+  316:                                              ; preds = %266, %Flow83
+    %.sroa.35.0.i = phi float [ 0.000000e+00, %266 ], [ %311, %Flow83 ]
+    %.sroa.24.0.i = phi float [ 0.000000e+00, %266 ], [ %310, %Flow83 ]
+    %.sroa.13.0.i = phi float [ 0.000000e+00, %266 ], [ %309, %Flow83 ]
+    %.sroa.0.0.i = phi float [ 0.000000e+00, %266 ], [ %275, %Flow83 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %270)
+    %317 = icmp slt i32 %103, %.load66
+    %318 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %317)
+    %319 = extractvalue { i1, i64 } %318, 0
+    %320 = extractvalue { i1, i64 } %318, 1
+    br i1 %319, label %321, label %Flow80
+
+  321:                                              ; preds = %316
+    %322 = mul nsw i32 %141, %.load44
+    %323 = select i1 %267, i32 %322, i32 0
+    %324 = add nsw i32 %323, %103
+    %325 = sext i32 %324 to i64
+    %326 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %325
+    %327 = load float, ptr addrspace(1) %326, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %328 = fmul contract float %.sroa.0.0.i, %327
+    %329 = fadd contract float %142, %328
+    %330 = or i32 %141, 1
+    %331 = mul nsw i32 %330, %.load44
+    %332 = icmp slt i32 %330, %.load3667
+    %333 = select i1 %332, i32 %331, i32 0
+    %334 = add nsw i32 %333, %103
+    %335 = sext i32 %334 to i64
+    %336 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %335
+    %337 = load float, ptr addrspace(1) %336, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %338 = fmul contract float %.sroa.13.0.i, %337
+    %339 = fadd contract float %329, %338
+    %340 = or i32 %141, 2
+    %341 = mul nsw i32 %340, %.load44
+    %342 = icmp slt i32 %340, %.load3667
+    %343 = select i1 %342, i32 %341, i32 0
+    %344 = add nsw i32 %343, %103
+    %345 = sext i32 %344 to i64
+    %346 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %345
+    %347 = load float, ptr addrspace(1) %346, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %348 = fmul contract float %.sroa.24.0.i, %347
+    %349 = fadd contract float %339, %348
+    %350 = or i32 %141, 3
+    %351 = mul nsw i32 %350, %.load44
+    %352 = icmp slt i32 %350, %.load3667
+    %353 = select i1 %352, i32 %351, i32 0
+    %354 = add nsw i32 %353, %103
+    %355 = sext i32 %354 to i64
+    %356 = getelementptr inbounds float, ptr addrspace(1) %98, i64 %355
+    %357 = load float, ptr addrspace(1) %356, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %358 = fmul contract float %.sroa.35.0.i, %357
+    %359 = fadd contract float %349, %358
+    %360 = add nsw i32 %103, 64
+    %361 = icmp slt i32 %360, %.load66
+    %362 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %361)
+    %363 = extractvalue { i1, i64 } %362, 0
+    %364 = extractvalue { i1, i64 } %362, 1
+    br i1 %363, label %365, label %Flow79
+
+  365:                                              ; preds = %321
+    %366 = getelementptr inbounds float, ptr addrspace(1) %326, i64 64
+    %367 = load float, ptr addrspace(1) %366, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %368 = fmul contract float %.sroa.0.0.i, %367
+    %369 = fadd contract float %143, %368
+    %370 = getelementptr inbounds float, ptr addrspace(1) %336, i64 64
+    %371 = load float, ptr addrspace(1) %370, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %372 = fmul contract float %.sroa.13.0.i, %371
+    %373 = fadd contract float %369, %372
+    %374 = getelementptr inbounds float, ptr addrspace(1) %346, i64 64
+    %375 = load float, ptr addrspace(1) %374, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %376 = fmul contract float %.sroa.24.0.i, %375
+    %377 = fadd contract float %373, %376
+    %378 = getelementptr inbounds float, ptr addrspace(1) %356, i64 64
+    %379 = load float, ptr addrspace(1) %378, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %380 = fmul contract float %.sroa.35.0.i, %379
+    %381 = fadd contract float %377, %380
+    %382 = add nsw i32 %103, 128
+    %383 = icmp slt i32 %382, %.load66
+    %384 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %383)
+    %385 = extractvalue { i1, i64 } %384, 0
+    %386 = extractvalue { i1, i64 } %384, 1
+    br i1 %385, label %387, label %Flow78
+
+  387:                                              ; preds = %365
+    %388 = getelementptr inbounds float, ptr addrspace(1) %326, i64 128
+    %389 = load float, ptr addrspace(1) %388, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %390 = fmul contract float %.sroa.0.0.i, %389
+    %391 = fadd contract float %144, %390
+    %392 = getelementptr inbounds float, ptr addrspace(1) %336, i64 128
+    %393 = load float, ptr addrspace(1) %392, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %394 = fmul contract float %.sroa.13.0.i, %393
+    %395 = fadd contract float %391, %394
+    %396 = getelementptr inbounds float, ptr addrspace(1) %346, i64 128
+    %397 = load float, ptr addrspace(1) %396, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %398 = fmul contract float %.sroa.24.0.i, %397
+    %399 = fadd contract float %395, %398
+    %400 = getelementptr inbounds float, ptr addrspace(1) %356, i64 128
+    %401 = load float, ptr addrspace(1) %400, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %402 = fmul contract float %.sroa.35.0.i, %401
+    %403 = fadd contract float %399, %402
+    %404 = add nsw i32 %103, 192
+    %405 = icmp slt i32 %404, %.load66
+    %406 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %405)
+    %407 = extractvalue { i1, i64 } %406, 0
+    %408 = extractvalue { i1, i64 } %406, 1
+    br i1 %407, label %409, label %Flow77
+
+  409:                                              ; preds = %387
+    %410 = getelementptr inbounds float, ptr addrspace(1) %326, i64 192
+    %411 = load float, ptr addrspace(1) %410, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %412 = fmul contract float %.sroa.0.0.i, %411
+    %413 = fadd contract float %145, %412
+    %414 = getelementptr inbounds float, ptr addrspace(1) %336, i64 192
+    %415 = load float, ptr addrspace(1) %414, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %416 = fmul contract float %.sroa.13.0.i, %415
+    %417 = fadd contract float %413, %416
+    %418 = getelementptr inbounds float, ptr addrspace(1) %346, i64 192
+    %419 = load float, ptr addrspace(1) %418, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %420 = fmul contract float %.sroa.24.0.i, %419
+    %421 = fadd contract float %417, %420
+    %422 = getelementptr inbounds float, ptr addrspace(1) %356, i64 192
+    %423 = load float, ptr addrspace(1) %422, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %424 = fmul contract float %.sroa.35.0.i, %423
+    %425 = fadd contract float %421, %424
+    br label %Flow77, !amdgpu.uniform !5
+
+  Flow77:                                           ; preds = %409, %387
+    %426 = phi float [ %425, %409 ], [ %145, %387 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %408)
+    br label %Flow78, !amdgpu.uniform !5
+
+  Flow78:                                           ; preds = %Flow77, %365
+    %427 = phi float [ %403, %Flow77 ], [ %144, %365 ]
+    %428 = phi float [ %426, %Flow77 ], [ %145, %365 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %386)
+    br label %Flow79, !amdgpu.uniform !5
+
+  Flow79:                                           ; preds = %Flow78, %321
+    %429 = phi float [ %381, %Flow78 ], [ %143, %321 ]
+    %430 = phi float [ %427, %Flow78 ], [ %144, %321 ]
+    %431 = phi float [ %428, %Flow78 ], [ %145, %321 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %364)
+    br label %Flow80, !amdgpu.uniform !5
+
+  Flow80:                                           ; preds = %Flow79, %316
+    %432 = phi float [ %359, %Flow79 ], [ %142, %316 ]
+    %433 = phi float [ %429, %Flow79 ], [ %143, %316 ]
+    %434 = phi float [ %430, %Flow79 ], [ %144, %316 ]
+    %435 = phi float [ %431, %Flow79 ], [ %145, %316 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %320)
+    br label %Flow84, !amdgpu.uniform !5
+
+  436:                                              ; preds = %Flow84
+    %437 = shl nuw nsw i32 %57, 8
+    %438 = add nuw nsw i32 %437, %56
+    %439 = getelementptr inbounds [4096 x float], ptr addrspace(3) @llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds, i32 0, i32 %438
+    store float %312, ptr addrspace(3) %439, align 4, !tbaa !13
+    %440 = getelementptr inbounds float, ptr addrspace(3) %439, i32 64
+    store float %313, ptr addrspace(3) %440, align 4, !tbaa !13
+    %441 = getelementptr inbounds float, ptr addrspace(3) %439, i32 128
+    store float %314, ptr addrspace(3) %441, align 4, !tbaa !13
+    %442 = getelementptr inbounds float, ptr addrspace(3) %439, i32 192
+    store float %315, ptr addrspace(3) %442, align 4, !tbaa !13
+    fence syncscope("workgroup") release
+    tail call void @llvm.amdgcn.s.barrier()
+    fence syncscope("workgroup") acquire
+    %443 = icmp ult i32 %59, 256
+    %444 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %443)
+    %445 = extractvalue { i1, i64 } %444, 0
+    %446 = extractvalue { i1, i64 } %444, 1
+    br i1 %445, label %.preheader.i, label %Flow94
+
+  .preheader.i:                                     ; preds = %436
+    %447 = getelementptr inbounds [4096 x float], ptr addrspace(3) @llvm.amdgcn.kernel._ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil.lds, i32 0, i32 %59
+    %.promoted.i = load float, ptr addrspace(3) %447, align 4, !tbaa !13
+    %448 = getelementptr inbounds float, ptr addrspace(3) %447, i32 256
+    %449 = load float, ptr addrspace(3) %448, align 4, !tbaa !13
+    %450 = fadd contract float %.promoted.i, %449
+    %451 = getelementptr inbounds float, ptr addrspace(3) %447, i32 512
+    %452 = load float, ptr addrspace(3) %451, align 4, !tbaa !13
+    %453 = fadd contract float %452, %450
+    %454 = getelementptr inbounds float, ptr addrspace(3) %447, i32 768
+    %455 = load float, ptr addrspace(3) %454, align 4, !tbaa !13
+    %456 = fadd contract float %455, %453
+    %457 = getelementptr inbounds float, ptr addrspace(3) %447, i32 1024
+    %458 = load float, ptr addrspace(3) %457, align 4, !tbaa !13
+    %459 = fadd contract float %458, %456
+    %460 = getelementptr inbounds float, ptr addrspace(3) %447, i32 1280
+    %461 = load float, ptr addrspace(3) %460, align 4, !tbaa !13
+    %462 = fadd contract float %461, %459
+    %463 = getelementptr inbounds float, ptr addrspace(3) %447, i32 1536
+    %464 = load float, ptr addrspace(3) %463, align 4, !tbaa !13
+    %465 = fadd contract float %464, %462
+    %466 = getelementptr inbounds float, ptr addrspace(3) %447, i32 1792
+    %467 = load float, ptr addrspace(3) %466, align 4, !tbaa !13
+    %468 = fadd contract float %467, %465
+    %469 = getelementptr inbounds float, ptr addrspace(3) %447, i32 2048
+    %470 = load float, ptr addrspace(3) %469, align 4, !tbaa !13
+    %471 = fadd contract float %470, %468
+    %472 = getelementptr inbounds float, ptr addrspace(3) %447, i32 2304
+    %473 = load float, ptr addrspace(3) %472, align 4, !tbaa !13
+    %474 = fadd contract float %473, %471
+    %475 = getelementptr inbounds float, ptr addrspace(3) %447, i32 2560
+    %476 = load float, ptr addrspace(3) %475, align 4, !tbaa !13
+    %477 = fadd contract float %476, %474
+    %478 = getelementptr inbounds float, ptr addrspace(3) %447, i32 2816
+    %479 = load float, ptr addrspace(3) %478, align 4, !tbaa !13
+    %480 = fadd contract float %479, %477
+    %481 = getelementptr inbounds float, ptr addrspace(3) %447, i32 3072
+    %482 = load float, ptr addrspace(3) %481, align 4, !tbaa !13
+    %483 = fadd contract float %482, %480
+    %484 = getelementptr inbounds float, ptr addrspace(3) %447, i32 3328
+    %485 = load float, ptr addrspace(3) %484, align 4, !tbaa !13
+    %486 = fadd contract float %485, %483
+    %487 = getelementptr inbounds float, ptr addrspace(3) %447, i32 3584
+    %488 = load float, ptr addrspace(3) %487, align 4, !tbaa !13
+    %489 = fadd contract float %488, %486
+    %490 = getelementptr inbounds float, ptr addrspace(3) %447, i32 3840
+    %491 = load float, ptr addrspace(3) %490, align 4, !tbaa !13
+    %492 = fadd contract float %491, %489
+    store float %492, ptr addrspace(3) %447, align 4, !tbaa !13
+    %493 = add i32 %59, %102
+    %494 = icmp slt i32 %493, %.load66
+    %495 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %494)
+    %496 = extractvalue { i1, i64 } %495, 0
+    %497 = extractvalue { i1, i64 } %495, 1
+    br i1 %496, label %501, label %Flow95
+
+  Flow94:                                           ; preds = %Flow95, %436
+    %498 = phi float [ %513, %Flow95 ], [ undef, %436 ]
+    %499 = phi i64 [ %514, %Flow95 ], [ undef, %436 ]
+    %500 = phi i1 [ %515, %Flow95 ], [ %68, %436 ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %446)
+    br label %Flow93, !amdgpu.uniform !5
+
+  501:                                              ; preds = %.preheader.i
+    %502 = fcmp contract une float %.load56, 0.000000e+00
+    %503 = fmul contract float %492, %46
+    %504 = mul nsw i32 %493, %.load62
+    %505 = sext i32 %504 to i64
+    br i1 %502, label %506, label %Flow76, !amdgpu.uniform !5
+
+  506:                                              ; preds = %501
+    %507 = getelementptr inbounds float, ptr addrspace(1) %55, i64 %505
+    %508 = load float, ptr addrspace(1) %507, align 4, !tbaa !13, !amdgpu.noclobber !5
+    %509 = fmul contract float %508, %.load56
+    %510 = fadd contract float %503, %509
+    br label %Flow76, !amdgpu.uniform !5
+
+  Flow:                                             ; preds = %84, %80
+    %511 = phi float [ %87, %84 ], [ 0.000000e+00, %80 ]
+    br label %Flow92, !amdgpu.uniform !5
+
+  Flow76:                                           ; preds = %506, %501
+    %512 = phi float [ %510, %506 ], [ %503, %501 ]
+    br label %Flow95, !amdgpu.uniform !5
+
+  Flow95:                                           ; preds = %Flow76, %.preheader.i
+    %513 = phi float [ %512, %Flow76 ], [ undef, %.preheader.i ]
+    %514 = phi i64 [ %505, %Flow76 ], [ undef, %.preheader.i ]
+    %515 = phi i1 [ true, %Flow76 ], [ %68, %.preheader.i ]
+    call void @llvm.amdgcn.end.cf.i64(i64 %497)
+    br label %Flow94, !amdgpu.uniform !5
+
+  .sink.split.i:                                    ; preds = %Flow93
+    %516 = getelementptr inbounds float, ptr addrspace(1) %55, i64 %260
+    store float %259, ptr addrspace(1) %516, align 4, !tbaa !13
+    br label %Flow96, !amdgpu.uniform !5
+
+  Flow96:                                           ; preds = %.sink.split.i, %Flow93
+    call void @llvm.amdgcn.end.cf.i64(i64 %264)
+    br label %Flow97, !amdgpu.uniform !5
+
+  Flow97:                                           ; preds = %Flow96, %44
+    br label %_Z25rocblas_gemvn_kernel_calcILi64ELi16EiffLi0EEviiT3_PKT2_T1_S3_iS0_PS1_i.exit, !amdgpu.uniform !5
+
+  _Z25rocblas_gemvn_kernel_calcILi64ELi16EiffLi0EEviiT3_PKT2_T1_S3_iS0_PS1_i.exit: ; preds = %18, %Flow97
+    ret void
+  }
+
+  declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
+
+  declare void @llvm.amdgcn.s.barrier() #2
+
+  declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+  declare i32 @llvm.amdgcn.workgroup.id.y() #1
+
+  declare i32 @llvm.amdgcn.workgroup.id.z() #1
+
+  declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+  declare i32 @llvm.amdgcn.workitem.id.y() #1
+
+  declare i32 @llvm.amdgcn.mul.u24(i32, i32) #3
+
+  declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #3
+
+  declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #4
+
+  declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #4
+
+  declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #5
+
+  declare i1 @llvm.amdgcn.loop.i64(i64) #4
+
+  declare void @llvm.amdgcn.end.cf.i64(i64) #4
+
+  attributes #0 = { nofree nounwind "amdgpu-lds-size"="16384" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-wave-limiter"="true" "uniform-work-group-size"="false" }
+  attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #2 = { convergent mustprogress nocallback nofree nounwind willreturn }
+  attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+  attributes #4 = { convergent nocallback nofree nounwind willreturn }
+  attributes #5 = { convergent nocallback nofree nounwind willreturn memory(none) }
+
+  !llvm.module.flags = !{!1, !2}
+  !opencl.ocl.version = !{!3}
+  !llvm.ident = !{!4}
+
+  !0 = !{i32 0, i32 1}
+  !1 = !{i32 1, !"wchar_size", i32 4}
+  !2 = !{i32 8, !"PIC Level", i32 1}
+  !3 = !{i32 2, i32 0}
+  !4 = !{!"AMD clang version 16.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.6.0 23223 3403c07804e79cd94b1efdf9f8d6cd45ac127382)"}
+  !5 = !{}
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"int", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+  !10 = !{!11, !11, i64 0}
+  !11 = !{!"short", !8, i64 0}
+  !12 = !{i32 0, i32 1024}
+  !13 = !{!14, !14, i64 0}
+  !14 = !{!"float", !15, i64 0}
+  !15 = !{!"omnipotent char", !16, i64 0}
+  !16 = !{!"Simple C++ TBAA"}
+
+...
+---
+name:            _ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+  - { id: 0, class: sreg_32, preferred-register: '' }
+  - { id: 1, class: sreg_32, preferred-register: '' }
+  - { id: 2, class: sreg_32, preferred-register: '' }
+  - { id: 3, class: sreg_64, preferred-register: '' }
+  - { id: 4, class: sgpr_96, preferred-register: '' }
+  - { id: 5, class: sgpr_32, preferred-register: '' }
+  - { id: 6, class: sgpr_32, preferred-register: '' }
+  - { id: 7, class: sreg_32, preferred-register: '' }
+  - { id: 8, class: sreg_32, preferred-register: '' }
+  - { id: 9, class: sreg_64, preferred-register: '' }
+  - { id: 10, class: sreg_64, preferred-register: '' }
+  - { id: 11, class: vgpr_32, preferred-register: '' }
+  - { id: 12, class: vgpr_32, preferred-register: '' }
+  - { id: 13, class: vgpr_32, preferred-register: '' }
+  - { id: 14, class: sreg_64, preferred-register: '' }
+  - { id: 15, class: vgpr_32, preferred-register: '' }
+  - { id: 16, class: vreg_64_align2, preferred-register: '' }
+  - { id: 17, class: sreg_64, preferred-register: '' }
+  - { id: 18, class: sreg_64, preferred-register: '' }
+  - { id: 19, class: vgpr_32, preferred-register: '' }
+  - { id: 20, class: sreg_64, preferred-register: '' }
+  - { id: 21, class: vgpr_32, preferred-register: '' }
+  - { id: 22, class: vreg_64_align2, preferred-register: '' }
+  - { id: 23, class: sreg_64, preferred-register: '' }
+  - { id: 24, class: vreg_64_align2, preferred-register: '' }
+  - { id: 25, class: vgpr_32, preferred-register: '' }
+  - { id: 26, class: vgpr_32, preferred-register: '' }
+  - { id: 27, class: vreg_64_align2, preferred-register: '' }
+  - { id: 28, class: sreg_64, preferred-register: '' }
+  - { id: 29, class: sreg_64, preferred-register: '' }
+  - { id: 30, class: sreg_64, preferred-register: '' }
+  - { id: 31, class: sreg_64, preferred-register: '' }
+  - { id: 32, class: sreg_64, preferred-register: '' }
+  - { id: 33, class: sreg_64, preferred-register: '' }
+  - { id: 34, class: sreg_32, preferred-register: '' }
+  - { id: 35, class: sreg_32, preferred-register: '' }
+  - { id: 36, class: sreg_32, preferred-register: '' }
+  - { id: 37, class: sreg_32, preferred-register: '' }
+  - { id: 38, class: vgpr_32, preferred-register: '' }
+  - { id: 39, class: sreg_32, preferred-register: '' }
+  - { id: 40, class: vgpr_32, preferred-register: '' }
+  - { id: 41, class: sreg_32, preferred-register: '' }
+  - { id: 42, class: sreg_64, preferred-register: '' }
+  - { id: 43, class: sreg_64, preferred-register: '' }
+  - { id: 44, class: sreg_64, preferred-register: '' }
+  - { id: 45, class: sreg_64, preferred-register: '' }
+  - { id: 46, class: sreg_64, preferred-register: '' }
+  - { id: 47, class: vgpr_32, preferred-register: '' }
+  - { id: 48, class: sreg_32, preferred-register: '' }
+  - { id: 49, class: vgpr_32, preferred-register: '' }
+  - { id: 50, class: vgpr_32, preferred-register: '' }
+  - { id: 51, class: vgpr_32, preferred-register: '' }
+  - { id: 52, class: vgpr_32, preferred-register: '' }
+  - { id: 53, class: sreg_32, preferred-register: '' }
+  - { id: 54, class: vgpr_32, preferred-register: '' }
+  - { id: 55, class: vgpr_32, preferred-register: '' }
+  - { id: 56, class: vreg_64_align2, preferred-register: '' }
+  - { id: 57, class: sreg_64, preferred-register: '' }
+  - { id: 58, class: vgpr_32, preferred-register: '' }
+  - { id: 59, class: vgpr_32, preferred-register: '' }
+  - { id: 60, class: vgpr_32, preferred-register: '' }
+  - { id: 61, class: vgpr_32, preferred-register: '' }
+  - { id: 62, class: vgpr_32, preferred-register: '' }
+  - { id: 63, class: sreg_64, preferred-register: '' }
+  - { id: 64, class: vreg_64_align2, preferred-register: '' }
+  - { id: 65, class: sreg_32, preferred-register: '' }
+  - { id: 66, class: vgpr_32, preferred-register: '' }
+  - { id: 67, class: vgpr_32, preferred-register: '' }
+  - { id: 68, class: vgpr_32, preferred-register: '' }
+  - { id: 69, class: vgpr_32, preferred-register: '' }
+  - { id: 70, class: vgpr_32, preferred-register: '' }
+  - { id: 71, class: vgpr_32, preferred-register: '' }
+  - { id: 72, class: sreg_64, preferred-register: '' }
+  - { id: 73, class: vgpr_32, preferred-register: '' }
+  - { id: 74, class: vgpr_32, preferred-register: '' }
+  - { id: 75, class: vgpr_32, preferred-register: '' }
+  - { id: 76, class: vgpr_32, preferred-register: '' }
+  - { id: 77, class: vreg_64_align2, preferred-register: '' }
+  - { id: 78, class: vreg_64_align2, preferred-register: '' }
+  - { id: 79, class: vreg_64_align2, preferred-register: '' }
+  - { id: 80, class: vreg_64_align2, preferred-register: '' }
+  - { id: 81, class: vgpr_32, preferred-register: '' }
+  - { id: 82, class: sreg_64, preferred-register: '' }
+  - { id: 83, class: vgpr_32, preferred-register: '' }
+  - { id: 84, class: sreg_64, preferred-register: '' }
+  - { id: 85, class: vgpr_32, preferred-register: '' }
+  - { id: 86, class: sreg_64, preferred-register: '' }
+  - { id: 87, class: vgpr_32, preferred-register: '' }
+  - { id: 88, class: vgpr_32, preferred-register: '' }
+  - { id: 89, class: vgpr_32, preferred-register: '' }
+  - { id: 90, class: vgpr_32, preferred-register: '' }
+  - { id: 91, class: vgpr_32, preferred-register: '' }
+  - { id: 92, class: vgpr_32, preferred-register: '' }
+  - { id: 93, class: vgpr_32, preferred-register: '' }
+  - { id: 94, class: vgpr_32, preferred-register: '' }
+  - { id: 95, class: vgpr_32, preferred-register: '' }
+  - { id: 96, class: vgpr_32, preferred-register: '' }
+  - { id: 97, class: vgpr_32, preferred-register: '' }
+  - { id: 98, class: vgpr_32, preferred-register: '' }
+  - { id: 99, class: vgpr_32, preferred-register: '' }
+  - { id: 100, class: sreg_32, preferred-register: '' }
+  - { id: 101, class: vreg_64_align2, preferred-register: '' }
+  - { id: 102, class: sreg_64, preferred-register: '' }
+  - { id: 103, class: vgpr_32, preferred-register: '' }
+  - { id: 104, class: vgpr_32, preferred-register: '' }
+  - { id: 105, class: vgpr_32, preferred-register: '' }
+  - { id: 106, class: vgpr_32, preferred-register: '' }
+  - { id: 107, class: vgpr_32, preferred-register: '' }
+  - { id: 108, class: sreg_64, preferred-register: '' }
+  - { id: 109, class: vgpr_32, preferred-register: '' }
+  - { id: 110, class: vreg_64_align2, preferred-register: '' }
+  - { id: 111, class: sreg_64, preferred-register: '' }
+  - { id: 112, class: sreg_64, preferred-register: '' }
+  - { id: 113, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 114, class: sreg_64, preferred-register: '' }
+  - { id: 115, class: vgpr_32, preferred-register: '' }
+  - { id: 116, class: vgpr_32, preferred-register: '' }
+  - { id: 117, class: sreg_64, preferred-register: '' }
+  - { id: 118, class: vgpr_32, preferred-register: '' }
+  - { id: 119, class: vgpr_32, preferred-register: '' }
+  - { id: 120, class: sreg_64, preferred-register: '' }
+  - { id: 121, class: vgpr_32, preferred-register: '' }
+  - { id: 122, class: vgpr_32, preferred-register: '' }
+  - { id: 123, class: sreg_64, preferred-register: '' }
+  - { id: 124, class: vgpr_32, preferred-register: '' }
+  - { id: 125, class: vgpr_32, preferred-register: '' }
+  - { id: 126, class: vgpr_32, preferred-register: '' }
+  - { id: 127, class: vgpr_32, preferred-register: '' }
+  - { id: 128, class: vgpr_32, preferred-register: '' }
+  - { id: 129, class: vgpr_32, preferred-register: '' }
+  - { id: 130, class: vgpr_32, preferred-register: '' }
+  - { id: 131, class: vgpr_32, preferred-register: '' }
+  - { id: 132, class: vgpr_32, preferred-register: '' }
+  - { id: 133, class: vgpr_32, preferred-register: '' }
+  - { id: 134, class: vgpr_32, preferred-register: '' }
+  - { id: 135, class: vgpr_32, preferred-register: '' }
+  - { id: 136, class: vgpr_32, preferred-register: '' }
+  - { id: 137, class: vgpr_32, preferred-register: '' }
+  - { id: 138, class: vgpr_32, preferred-register: '' }
+  - { id: 139, class: sreg_64, preferred-register: '' }
+  - { id: 140, class: vreg_64_align2, preferred-register: '' }
+  - { id: 141, class: vreg_64_align2, preferred-register: '' }
+  - { id: 142, class: vreg_64_align2, preferred-register: '' }
+  - { id: 143, class: vreg_64_align2, preferred-register: '' }
+  - { id: 144, class: vgpr_32, preferred-register: '' }
+  - { id: 145, class: sreg_64, preferred-register: '' }
+  - { id: 146, class: vgpr_32, preferred-register: '' }
+  - { id: 147, class: sreg_64, preferred-register: '' }
+  - { id: 148, class: vgpr_32, preferred-register: '' }
+  - { id: 149, class: sreg_64, preferred-register: '' }
+  - { id: 150, class: vgpr_32, preferred-register: '' }
+  - { id: 151, class: vgpr_32, preferred-register: '' }
+  - { id: 152, class: vgpr_32, preferred-register: '' }
+  - { id: 153, class: vgpr_32, preferred-register: '' }
+  - { id: 154, class: vgpr_32, preferred-register: '' }
+  - { id: 155, class: vgpr_32, preferred-register: '' }
+  - { id: 156, class: vgpr_32, preferred-register: '' }
+  - { id: 157, class: vgpr_32, preferred-register: '' }
+  - { id: 158, class: vgpr_32, preferred-register: '' }
+  - { id: 159, class: vgpr_32, preferred-register: '' }
+  - { id: 160, class: vgpr_32, preferred-register: '' }
+  - { id: 161, class: sreg_64, preferred-register: '' }
+  - { id: 162, class: vgpr_32, preferred-register: '' }
+  - { id: 163, class: vgpr_32, preferred-register: '' }
+  - { id: 164, class: sreg_64, preferred-register: '' }
+  - { id: 165, class: vgpr_32, preferred-register: '' }
+  - { id: 166, class: vreg_64_align2, preferred-register: '' }
+  - { id: 167, class: sreg_64, preferred-register: '' }
+  - { id: 168, class: vgpr_32, preferred-register: '' }
+  - { id: 169, class: vreg_64_align2, preferred-register: '' }
+  - { id: 170, class: vgpr_32, preferred-register: '' }
+  - { id: 171, class: vgpr_32, preferred-register: '' }
+  - { id: 172, class: vgpr_32, preferred-register: '' }
+  - { id: 173, class: vgpr_32, preferred-register: '' }
+  - { id: 174, class: vreg_64_align2, preferred-register: '' }
+  - { id: 175, class: sreg_64, preferred-register: '' }
+  - { id: 176, class: vgpr_32, preferred-register: '' }
+  - { id: 177, class: sgpr_128, preferred-register: '' }
+  - { id: 178, class: sgpr_64, preferred-register: '' }
+  - { id: 179, class: sgpr_32, preferred-register: '' }
+  - { id: 180, class: sgpr_32, preferred-register: '' }
+  - { id: 181, class: sgpr_32, preferred-register: '' }
+  - { id: 182, class: sgpr_32, preferred-register: '' }
+  - { id: 183, class: sreg_32, preferred-register: '' }
+  - { id: 184, class: sreg_32, preferred-register: '' }
+  - { id: 185, class: sreg_64, preferred-register: '' }
+  - { id: 186, class: sreg_64, preferred-register: '' }
+  - { id: 187, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 188, class: sreg_32_xm0_xexec, preferred-register: '' }
+  - { id: 189, class: sreg_32, preferred-register: '' }
+  - { id: 190, class: sreg_32, preferred-register: '' }
+  - { id: 191, class: sreg_32, preferred-register: '' }
+  - { id: 192, class: sreg_32, preferred-register: '' }
+  - { id: 193, class: sreg_32, preferred-register: '' }
+  - { id: 194, class: sreg_64, preferred-register: '' }
+  - { id: 195, class: sreg_64, preferred-register: '' }
+  - { id: 196, class: vgpr_32, preferred-register: '' }
+  - { id: 197, class: vgpr_32, preferred-register: '' }
+  - { id: 198, class: sreg_32, preferred-register: '' }
+  - { id: 199, class: sreg_32, preferred-register: '' }
+  - { id: 200, class: sreg_32, preferred-register: '' }
+  - { id: 201, class: sreg_64, preferred-register: '' }
+  - { id: 202, class: sreg_64, preferred-register: '' }
+  - { id: 203, class: vgpr_32, preferred-register: '' }
+  - { id: 204, class: sreg_32, preferred-register: '' }
+  - { id: 205, class: sreg_32, preferred-register: '' }
+  - { id: 206, class: sreg_32, preferred-register: '' }
+  - { id: 207, class: sreg_32, preferred-register: '' }
+  - { id: 208, class: sreg_32, preferred-register: '' }
+  - { id: 209, class: sreg_32, preferred-register: '' }
+  - { id: 210, class: sreg_64, preferred-register: '' }
+  - { id: 211, class: sreg_64, preferred-register: '' }
+  - { id: 212, class: vgpr_32, preferred-register: '' }
+  - { id: 213, class: sreg_32, preferred-register: '' }
+  - { id: 214, class: sreg_32, preferred-register: '' }
+  - { id: 215, class: sreg_32, preferred-register: '' }
+  - { id: 216, class: sgpr_128, preferred-register: '' }
+  - { id: 217, class: sreg_32, preferred-register: '' }
+  - { id: 218, class: sreg_32, preferred-register: '' }
+  - { id: 219, class: sgpr_96, preferred-register: '' }
+  - { id: 220, class: sgpr_32, preferred-register: '' }
+  - { id: 221, class: sgpr_32, preferred-register: '' }
+  - { id: 222, class: sreg_64, preferred-register: '' }
+  - { id: 223, class: vgpr_32, preferred-register: '' }
+  - { id: 224, class: sgpr_32, preferred-register: '' }
+  - { id: 225, class: sreg_64, preferred-register: '' }
+  - { id: 226, class: vgpr_32, preferred-register: '' }
+  - { id: 227, class: sreg_64, preferred-register: '' }
+  - { id: 228, class: sreg_64, preferred-register: '' }
+  - { id: 229, class: sgpr_32, preferred-register: '' }
+  - { id: 230, class: sreg_64, preferred-register: '' }
+  - { id: 231, class: sreg_64, preferred-register: '' }
+  - { id: 232, class: sreg_64, preferred-register: '' }
+  - { id: 233, class: sgpr_128, preferred-register: '' }
+  - { id: 234, class: sreg_32_xm0_xexec, preferred-register: '' }
+  - { id: 235, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 236, class: sreg_32, preferred-register: '' }
+  - { id: 237, class: sreg_32, preferred-register: '' }
+  - { id: 238, class: sreg_64, preferred-register: '' }
+  - { id: 239, class: sreg_32, preferred-register: '' }
+  - { id: 240, class: sreg_32, preferred-register: '' }
+  - { id: 241, class: sreg_64, preferred-register: '' }
+  - { id: 242, class: sreg_32, preferred-register: '' }
+  - { id: 243, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 244, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 245, class: sreg_64, preferred-register: '' }
+  - { id: 246, class: sreg_32, preferred-register: '' }
+  - { id: 247, class: sreg_32, preferred-register: '' }
+  - { id: 248, class: sreg_32, preferred-register: '' }
+  - { id: 249, class: sreg_32, preferred-register: '' }
+  - { id: 250, class: sreg_32, preferred-register: '' }
+  - { id: 251, class: sreg_32, preferred-register: '' }
+  - { id: 252, class: sreg_32, preferred-register: '' }
+  - { id: 253, class: sreg_32, preferred-register: '' }
+  - { id: 254, class: sreg_64, preferred-register: '' }
+  - { id: 255, class: sreg_32, preferred-register: '' }
+  - { id: 256, class: sreg_64, preferred-register: '' }
+  - { id: 257, class: sreg_64, preferred-register: '' }
+  - { id: 258, class: sreg_64, preferred-register: '' }
+  - { id: 259, class: sreg_32, preferred-register: '' }
+  - { id: 260, class: sgpr_32, preferred-register: '' }
+  - { id: 261, class: sreg_64, preferred-register: '' }
+  - { id: 262, class: vgpr_32, preferred-register: '' }
+  - { id: 263, class: sreg_64, preferred-register: '' }
+  - { id: 264, class: sgpr_32, preferred-register: '' }
+  - { id: 265, class: sreg_64, preferred-register: '' }
+  - { id: 266, class: sreg_64, preferred-register: '' }
+  - { id: 267, class: sreg_32, preferred-register: '' }
+  - { id: 268, class: sreg_64, preferred-register: '' }
+  - { id: 269, class: sgpr_32, preferred-register: '' }
+  - { id: 270, class: sreg_64, preferred-register: '' }
+  - { id: 271, class: sreg_64, preferred-register: '' }
+  - { id: 272, class: sreg_32, preferred-register: '' }
+  - { id: 273, class: vgpr_32, preferred-register: '' }
+  - { id: 274, class: sreg_64, preferred-register: '' }
+  - { id: 275, class: sgpr_32, preferred-register: '' }
+  - { id: 276, class: sreg_64, preferred-register: '' }
+  - { id: 277, class: vgpr_32, preferred-register: '' }
+  - { id: 278, class: vgpr_32, preferred-register: '' }
+  - { id: 279, class: vgpr_32, preferred-register: '' }
+  - { id: 280, class: vgpr_32, preferred-register: '' }
+  - { id: 281, class: vreg_64_align2, preferred-register: '' }
+  - { id: 282, class: sreg_64, preferred-register: '' }
+  - { id: 283, class: sreg_32, preferred-register: '' }
+  - { id: 284, class: vreg_64_align2, preferred-register: '' }
+  - { id: 285, class: vreg_64_align2, preferred-register: '' }
+  - { id: 286, class: vgpr_32, preferred-register: '' }
+  - { id: 287, class: sreg_64, preferred-register: '' }
+  - { id: 288, class: sreg_64, preferred-register: '' }
+  - { id: 289, class: sreg_64, preferred-register: '' }
+  - { id: 290, class: sgpr_32, preferred-register: '' }
+  - { id: 291, class: sgpr_128, preferred-register: '' }
+  - { id: 292, class: sreg_32_xm0_xexec, preferred-register: '' }
+  - { id: 293, class: sgpr_128, preferred-register: '' }
+  - { id: 294, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 295, class: sreg_32_xm0_xexec, preferred-register: '' }
+  - { id: 296, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 297, class: sreg_32, preferred-register: '' }
+  - { id: 298, class: sreg_32, preferred-register: '' }
+  - { id: 299, class: sreg_64, preferred-register: '' }
+  - { id: 300, class: sreg_32, preferred-register: '' }
+  - { id: 301, class: sreg_32, preferred-register: '' }
+  - { id: 302, class: sreg_64, preferred-register: '' }
+  - { id: 303, class: sreg_32, preferred-register: '' }
+  - { id: 304, class: sreg_32, preferred-register: '' }
+  - { id: 305, class: sreg_32, preferred-register: '' }
+  - { id: 306, class: sreg_32, preferred-register: '' }
+  - { id: 307, class: sreg_32, preferred-register: '' }
+  - { id: 308, class: sreg_32, preferred-register: '' }
+  - { id: 309, class: sreg_64, preferred-register: '' }
+  - { id: 310, class: sreg_64, preferred-register: '' }
+  - { id: 311, class: sreg_32, preferred-register: '' }
+  - { id: 312, class: sreg_32, preferred-register: '' }
+  - { id: 313, class: sreg_32, preferred-register: '' }
+  - { id: 314, class: sreg_32, preferred-register: '' }
+  - { id: 315, class: sreg_32, preferred-register: '' }
+  - { id: 316, class: sreg_32, preferred-register: '' }
+  - { id: 317, class: sreg_32, preferred-register: '' }
+  - { id: 318, class: sreg_32, preferred-register: '' }
+  - { id: 319, class: sreg_32, preferred-register: '' }
+  - { id: 320, class: sreg_64, preferred-register: '' }
+  - { id: 321, class: sreg_32, preferred-register: '' }
+  - { id: 322, class: sreg_64, preferred-register: '' }
+  - { id: 323, class: sreg_64, preferred-register: '' }
+  - { id: 324, class: sreg_64, preferred-register: '' }
+  - { id: 325, class: sreg_32, preferred-register: '' }
+  - { id: 326, class: sreg_32, preferred-register: '' }
+  - { id: 327, class: sreg_32, preferred-register: '' }
+  - { id: 328, class: sreg_32, preferred-register: '' }
+  - { id: 329, class: sreg_32, preferred-register: '' }
+  - { id: 330, class: sreg_32, preferred-register: '' }
+  - { id: 331, class: sreg_32, preferred-register: '' }
+  - { id: 332, class: sreg_64, preferred-register: '' }
+  - { id: 333, class: sreg_64, preferred-register: '' }
+  - { id: 334, class: sreg_64, preferred-register: '' }
+  - { id: 335, class: sreg_64, preferred-register: '' }
+  - { id: 336, class: sreg_32, preferred-register: '' }
+  - { id: 337, class: sreg_32, preferred-register: '' }
+  - { id: 338, class: sreg_32, preferred-register: '' }
+  - { id: 339, class: sreg_32, preferred-register: '' }
+  - { id: 340, class: sreg_32, preferred-register: '' }
+  - { id: 341, class: sreg_32, preferred-register: '' }
+  - { id: 342, class: sreg_32, preferred-register: '' }
+  - { id: 343, class: sreg_64, preferred-register: '' }
+  - { id: 344, class: sreg_64, preferred-register: '' }
+  - { id: 345, class: sreg_32, preferred-register: '' }
+  - { id: 346, class: sgpr_32, preferred-register: '' }
+  - { id: 347, class: sreg_64, preferred-register: '' }
+  - { id: 348, class: sreg_32, preferred-register: '' }
+  - { id: 349, class: vgpr_32, preferred-register: '' }
+  - { id: 350, class: sreg_64, preferred-register: '' }
+  - { id: 351, class: sreg_32, preferred-register: '' }
+  - { id: 352, class: vgpr_32, preferred-register: '' }
+  - { id: 353, class: sreg_64, preferred-register: '' }
+  - { id: 354, class: sreg_32, preferred-register: '' }
+  - { id: 355, class: vgpr_32, preferred-register: '' }
+  - { id: 356, class: sreg_64, preferred-register: '' }
+  - { id: 357, class: vgpr_32, preferred-register: '' }
+  - { id: 358, class: sreg_32, preferred-register: '' }
+  - { id: 359, class: sreg_32, preferred-register: '' }
+  - { id: 360, class: vgpr_32, preferred-register: '' }
+  - { id: 361, class: sreg_32, preferred-register: '' }
+  - { id: 362, class: vgpr_32, preferred-register: '' }
+  - { id: 363, class: vgpr_32, preferred-register: '' }
+  - { id: 364, class: vgpr_32, preferred-register: '' }
+  - { id: 365, class: sreg_64, preferred-register: '' }
+  - { id: 366, class: sreg_64, preferred-register: '' }
+  - { id: 367, class: sreg_64, preferred-register: '' }
+  - { id: 368, class: vgpr_32, preferred-register: '' }
+  - { id: 369, class: vgpr_32, preferred-register: '' }
+  - { id: 370, class: vgpr_32, preferred-register: '' }
+  - { id: 371, class: vgpr_32, preferred-register: '' }
+  - { id: 372, class: vreg_64_align2, preferred-register: '' }
+  - { id: 373, class: vreg_64_align2, preferred-register: '' }
+  - { id: 374, class: sreg_64, preferred-register: '' }
+  - { id: 375, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 376, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 377, class: sreg_64, preferred-register: '' }
+  - { id: 378, class: sreg_64, preferred-register: '' }
+  - { id: 379, class: vgpr_32, preferred-register: '' }
+  - { id: 380, class: vgpr_32, preferred-register: '' }
+  - { id: 381, class: vgpr_32, preferred-register: '' }
+  - { id: 382, class: vreg_64_align2, preferred-register: '' }
+  - { id: 383, class: sreg_32, preferred-register: '' }
+  - { id: 384, class: vreg_64_align2, preferred-register: '' }
+  - { id: 385, class: vreg_64_align2, preferred-register: '' }
+  - { id: 386, class: vgpr_32, preferred-register: '' }
+  - { id: 387, class: vgpr_32, preferred-register: '' }
+  - { id: 388, class: vgpr_32, preferred-register: '' }
+  - { id: 389, class: vreg_64_align2, preferred-register: '' }
+  - { id: 390, class: vreg_64_align2, preferred-register: '' }
+  - { id: 391, class: vreg_64_align2, preferred-register: '' }
+  - { id: 392, class: vgpr_32, preferred-register: '' }
+  - { id: 393, class: vgpr_32, preferred-register: '' }
+  - { id: 394, class: vgpr_32, preferred-register: '' }
+  - { id: 395, class: vreg_64_align2, preferred-register: '' }
+  - { id: 396, class: vreg_64_align2, preferred-register: '' }
+  - { id: 397, class: vreg_64_align2, preferred-register: '' }
+  - { id: 398, class: vgpr_32, preferred-register: '' }
+  - { id: 399, class: vgpr_32, preferred-register: '' }
+  - { id: 400, class: vgpr_32, preferred-register: '' }
+  - { id: 401, class: vreg_64_align2, preferred-register: '' }
+  - { id: 402, class: vreg_64_align2, preferred-register: '' }
+  - { id: 403, class: vgpr_32, preferred-register: '' }
+  - { id: 404, class: vgpr_32, preferred-register: '' }
+  - { id: 405, class: vgpr_32, preferred-register: '' }
+  - { id: 406, class: vgpr_32, preferred-register: '' }
+  - { id: 407, class: vgpr_32, preferred-register: '' }
+  - { id: 408, class: vreg_64_align2, preferred-register: '' }
+  - { id: 409, class: vreg_64_align2, preferred-register: '' }
+  - { id: 410, class: vgpr_32, preferred-register: '' }
+  - { id: 411, class: vgpr_32, preferred-register: '' }
+  - { id: 412, class: vgpr_32, preferred-register: '' }
+  - { id: 413, class: vgpr_32, preferred-register: '' }
+  - { id: 414, class: vgpr_32, preferred-register: '' }
+  - { id: 415, class: vreg_64_align2, preferred-register: '' }
+  - { id: 416, class: vreg_64_align2, preferred-register: '' }
+  - { id: 417, class: vgpr_32, preferred-register: '' }
+  - { id: 418, class: vgpr_32, preferred-register: '' }
+  - { id: 419, class: vgpr_32, preferred-register: '' }
+  - { id: 420, class: vgpr_32, preferred-register: '' }
+  - { id: 421, class: vgpr_32, preferred-register: '' }
+  - { id: 422, class: vreg_64_align2, preferred-register: '' }
+  - { id: 423, class: vreg_64_align2, preferred-register: '' }
+  - { id: 424, class: vgpr_32, preferred-register: '' }
+  - { id: 425, class: sreg_64, preferred-register: '' }
+  - { id: 426, class: vgpr_32, preferred-register: '' }
+  - { id: 427, class: vgpr_32, preferred-register: '' }
+  - { id: 428, class: vgpr_32, preferred-register: '' }
+  - { id: 429, class: vgpr_32, preferred-register: '' }
+  - { id: 430, class: vgpr_32, preferred-register: '' }
+  - { id: 431, class: vgpr_32, preferred-register: '' }
+  - { id: 432, class: vgpr_32, preferred-register: '' }
+  - { id: 433, class: sreg_64, preferred-register: '' }
+  - { id: 434, class: vgpr_32, preferred-register: '' }
+  - { id: 435, class: vgpr_32, preferred-register: '' }
+  - { id: 436, class: vgpr_32, preferred-register: '' }
+  - { id: 437, class: vgpr_32, preferred-register: '' }
+  - { id: 438, class: vgpr_32, preferred-register: '' }
+  - { id: 439, class: vgpr_32, preferred-register: '' }
+  - { id: 440, class: vgpr_32, preferred-register: '' }
+  - { id: 441, class: sreg_64, preferred-register: '' }
+  - { id: 442, class: vgpr_32, preferred-register: '' }
+  - { id: 443, class: vgpr_32, preferred-register: '' }
+  - { id: 444, class: vgpr_32, preferred-register: '' }
+  - { id: 445, class: vgpr_32, preferred-register: '' }
+  - { id: 446, class: vgpr_32, preferred-register: '' }
+  - { id: 447, class: vgpr_32, preferred-register: '' }
+  - { id: 448, class: vgpr_32, preferred-register: '' }
+  - { id: 449, class: sreg_32, preferred-register: '' }
+  - { id: 450, class: sreg_64, preferred-register: '' }
+  - { id: 451, class: sreg_32, preferred-register: '' }
+  - { id: 452, class: sgpr_32, preferred-register: '' }
+  - { id: 453, class: sreg_64, preferred-register: '' }
+  - { id: 454, class: sgpr_32, preferred-register: '' }
+  - { id: 455, class: vgpr_32, preferred-register: '' }
+  - { id: 456, class: vgpr_32, preferred-register: '' }
+  - { id: 457, class: vgpr_32, preferred-register: '' }
+  - { id: 458, class: vreg_64_align2, preferred-register: '' }
+  - { id: 459, class: sreg_32, preferred-register: '' }
+  - { id: 460, class: vreg_64_align2, preferred-register: '' }
+  - { id: 461, class: vreg_64_align2, preferred-register: '' }
+  - { id: 462, class: sreg_32, preferred-register: '' }
+  - { id: 463, class: sreg_64, preferred-register: '' }
+  - { id: 464, class: sgpr_32, preferred-register: '' }
+  - { id: 465, class: vgpr_32, preferred-register: '' }
+  - { id: 466, class: vgpr_32, preferred-register: '' }
+  - { id: 467, class: vgpr_32, preferred-register: '' }
+  - { id: 468, class: vreg_64_align2, preferred-register: '' }
+  - { id: 469, class: sreg_32, preferred-register: '' }
+  - { id: 470, class: vreg_64_align2, preferred-register: '' }
+  - { id: 471, class: vreg_64_align2, preferred-register: '' }
+  - { id: 472, class: sreg_64, preferred-register: '' }
+  - { id: 473, class: sgpr_32, preferred-register: '' }
+  - { id: 474, class: vgpr_32, preferred-register: '' }
+  - { id: 475, class: vgpr_32, preferred-register: '' }
+  - { id: 476, class: vgpr_32, preferred-register: '' }
+  - { id: 477, class: vreg_64_align2, preferred-register: '' }
+  - { id: 478, class: sreg_32, preferred-register: '' }
+  - { id: 479, class: vreg_64_align2, preferred-register: '' }
+  - { id: 480, class: vreg_64_align2, preferred-register: '' }
+  - { id: 481, class: sreg_32, preferred-register: '' }
+  - { id: 482, class: sreg_64, preferred-register: '' }
+  - { id: 483, class: vgpr_32, preferred-register: '' }
+  - { id: 484, class: vgpr_32, preferred-register: '' }
+  - { id: 485, class: vgpr_32, preferred-register: '' }
+  - { id: 486, class: vreg_64_align2, preferred-register: '' }
+  - { id: 487, class: sreg_32, preferred-register: '' }
+  - { id: 488, class: vreg_64_align2, preferred-register: '' }
+  - { id: 489, class: vreg_64_align2, preferred-register: '' }
+  - { id: 490, class: sreg_64, preferred-register: '' }
+  - { id: 491, class: vgpr_32, preferred-register: '' }
+  - { id: 492, class: sreg_32, preferred-register: '' }
+  - { id: 493, class: vgpr_32, preferred-register: '' }
+  - { id: 494, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 495, class: vgpr_32, preferred-register: '' }
+  - { id: 496, class: vgpr_32, preferred-register: '' }
+  - { id: 497, class: vgpr_32, preferred-register: '' }
+  - { id: 498, class: vgpr_32, preferred-register: '' }
+  - { id: 499, class: vreg_64_align2, preferred-register: '' }
+  - { id: 500, class: sreg_32, preferred-register: '' }
+  - { id: 501, class: vreg_64_align2, preferred-register: '' }
+  - { id: 502, class: vgpr_32, preferred-register: '' }
+  - { id: 503, class: vgpr_32, preferred-register: '' }
+  - { id: 504, class: sreg_32, preferred-register: '' }
+  - { id: 505, class: vgpr_32, preferred-register: '' }
+  - { id: 506, class: vgpr_32, preferred-register: '' }
+  - { id: 507, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 508, class: vgpr_32, preferred-register: '' }
+  - { id: 509, class: vgpr_32, preferred-register: '' }
+  - { id: 510, class: vgpr_32, preferred-register: '' }
+  - { id: 511, class: vgpr_32, preferred-register: '' }
+  - { id: 512, class: vgpr_32, preferred-register: '' }
+  - { id: 513, class: vreg_64_align2, preferred-register: '' }
+  - { id: 514, class: vreg_64_align2, preferred-register: '' }
+  - { id: 515, class: vgpr_32, preferred-register: '' }
+  - { id: 516, class: vgpr_32, preferred-register: '' }
+  - { id: 517, class: vgpr_32, preferred-register: '' }
+  - { id: 518, class: vgpr_32, preferred-register: '' }
+  - { id: 519, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 520, class: vgpr_32, preferred-register: '' }
+  - { id: 521, class: vgpr_32, preferred-register: '' }
+  - { id: 522, class: vgpr_32, preferred-register: '' }
+  - { id: 523, class: vgpr_32, preferred-register: '' }
+  - { id: 524, class: vgpr_32, preferred-register: '' }
+  - { id: 525, class: vreg_64_align2, preferred-register: '' }
+  - { id: 526, class: vreg_64_align2, preferred-register: '' }
+  - { id: 527, class: vgpr_32, preferred-register: '' }
+  - { id: 528, class: vgpr_32, preferred-register: '' }
+  - { id: 529, class: sreg_32, preferred-register: '' }
+  - { id: 530, class: vgpr_32, preferred-register: '' }
+  - { id: 531, class: vgpr_32, preferred-register: '' }
+  - { id: 532, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 533, class: vgpr_32, preferred-register: '' }
+  - { id: 534, class: vgpr_32, preferred-register: '' }
+  - { id: 535, class: vgpr_32, preferred-register: '' }
+  - { id: 536, class: vgpr_32, preferred-register: '' }
+  - { id: 537, class: vgpr_32, preferred-register: '' }
+  - { id: 538, class: vreg_64_align2, preferred-register: '' }
+  - { id: 539, class: vreg_64_align2, preferred-register: '' }
+  - { id: 540, class: vgpr_32, preferred-register: '' }
+  - { id: 541, class: sreg_32, preferred-register: '' }
+  - { id: 542, class: vgpr_32, preferred-register: '' }
+  - { id: 543, class: sreg_64, preferred-register: '' }
+  - { id: 544, class: vgpr_32, preferred-register: '' }
+  - { id: 545, class: vgpr_32, preferred-register: '' }
+  - { id: 546, class: vgpr_32, preferred-register: '' }
+  - { id: 547, class: vgpr_32, preferred-register: '' }
+  - { id: 548, class: vgpr_32, preferred-register: '' }
+  - { id: 549, class: vgpr_32, preferred-register: '' }
+  - { id: 550, class: vgpr_32, preferred-register: '' }
+  - { id: 551, class: sreg_32, preferred-register: '' }
+  - { id: 552, class: vgpr_32, preferred-register: '' }
+  - { id: 553, class: sreg_64, preferred-register: '' }
+  - { id: 554, class: vgpr_32, preferred-register: '' }
+  - { id: 555, class: vgpr_32, preferred-register: '' }
+  - { id: 556, class: vgpr_32, preferred-register: '' }
+  - { id: 557, class: vgpr_32, preferred-register: '' }
+  - { id: 558, class: vgpr_32, preferred-register: '' }
+  - { id: 559, class: vgpr_32, preferred-register: '' }
+  - { id: 560, class: vgpr_32, preferred-register: '' }
+  - { id: 561, class: sreg_32, preferred-register: '' }
+  - { id: 562, class: vgpr_32, preferred-register: '' }
+  - { id: 563, class: sreg_64, preferred-register: '' }
+  - { id: 564, class: vgpr_32, preferred-register: '' }
+  - { id: 565, class: vgpr_32, preferred-register: '' }
+  - { id: 566, class: vgpr_32, preferred-register: '' }
+  - { id: 567, class: vgpr_32, preferred-register: '' }
+  - { id: 568, class: vgpr_32, preferred-register: '' }
+  - { id: 569, class: vgpr_32, preferred-register: '' }
+  - { id: 570, class: vgpr_32, preferred-register: '' }
+  - { id: 571, class: sgpr_32, preferred-register: '' }
+  - { id: 572, class: sreg_64, preferred-register: '' }
+  - { id: 573, class: sreg_32, preferred-register: '' }
+  - { id: 574, class: vgpr_32, preferred-register: '' }
+  - { id: 575, class: sreg_32, preferred-register: '' }
+  - { id: 576, class: vgpr_32, preferred-register: '' }
+  - { id: 577, class: sreg_32, preferred-register: '' }
+  - { id: 578, class: sreg_64, preferred-register: '' }
+  - { id: 579, class: sgpr_32, preferred-register: '' }
+  - { id: 580, class: sreg_64, preferred-register: '' }
+  - { id: 581, class: sreg_32, preferred-register: '' }
+  - { id: 582, class: vgpr_32, preferred-register: '' }
+  - { id: 583, class: vgpr_32, preferred-register: '' }
+  - { id: 584, class: vgpr_32, preferred-register: '' }
+  - { id: 585, class: vgpr_32, preferred-register: '' }
+  - { id: 586, class: vgpr_32, preferred-register: '' }
+  - { id: 587, class: vgpr_32, preferred-register: '' }
+  - { id: 588, class: vgpr_32, preferred-register: '' }
+  - { id: 589, class: vgpr_32, preferred-register: '' }
+  - { id: 590, class: vgpr_32, preferred-register: '' }
+  - { id: 591, class: vgpr_32, preferred-register: '' }
+  - { id: 592, class: vgpr_32, preferred-register: '' }
+  - { id: 593, class: vgpr_32, preferred-register: '' }
+  - { id: 594, class: vgpr_32, preferred-register: '' }
+  - { id: 595, class: vgpr_32, preferred-register: '' }
+  - { id: 596, class: vgpr_32, preferred-register: '' }
+  - { id: 597, class: vgpr_32, preferred-register: '' }
+  - { id: 598, class: vgpr_32, preferred-register: '' }
+  - { id: 599, class: vgpr_32, preferred-register: '' }
+  - { id: 600, class: vgpr_32, preferred-register: '' }
+  - { id: 601, class: vgpr_32, preferred-register: '' }
+  - { id: 602, class: vgpr_32, preferred-register: '' }
+  - { id: 603, class: vgpr_32, preferred-register: '' }
+  - { id: 604, class: vgpr_32, preferred-register: '' }
+  - { id: 605, class: vgpr_32, preferred-register: '' }
+  - { id: 606, class: vgpr_32, preferred-register: '' }
+  - { id: 607, class: vgpr_32, preferred-register: '' }
+  - { id: 608, class: vgpr_32, preferred-register: '' }
+  - { id: 609, class: vgpr_32, preferred-register: '' }
+  - { id: 610, class: vgpr_32, preferred-register: '' }
+  - { id: 611, class: vgpr_32, preferred-register: '' }
+  - { id: 612, class: vgpr_32, preferred-register: '' }
+  - { id: 613, class: sreg_64, preferred-register: '' }
+  - { id: 614, class: sgpr_32, preferred-register: '' }
+  - { id: 615, class: sreg_64, preferred-register: '' }
+  - { id: 616, class: vgpr_32, preferred-register: '' }
+  - { id: 617, class: vgpr_32, preferred-register: '' }
+  - { id: 618, class: vgpr_32, preferred-register: '' }
+  - { id: 619, class: vgpr_32, preferred-register: '' }
+  - { id: 620, class: vreg_64_align2, preferred-register: '' }
+  - { id: 621, class: sreg_64, preferred-register: '' }
+  - { id: 622, class: sreg_32, preferred-register: '' }
+  - { id: 623, class: vreg_64_align2, preferred-register: '' }
+  - { id: 624, class: vreg_64_align2, preferred-register: '' }
+  - { id: 625, class: vgpr_32, preferred-register: '' }
+  - { id: 626, class: sreg_64, preferred-register: '' }
+  - { id: 627, class: sreg_64, preferred-register: '' }
+  - { id: 628, class: sreg_32, preferred-register: '' }
+  - { id: 629, class: vreg_64_align2, preferred-register: '' }
+  - { id: 630, class: vreg_64_align2, preferred-register: '' }
+  - { id: 631, class: vgpr_32, preferred-register: '' }
+  - { id: 632, class: vgpr_32, preferred-register: '' }
+  - { id: 633, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 634, class: vgpr_32, preferred-register: '' }
+  - { id: 635, class: vreg_64_align2, preferred-register: '' }
+  - { id: 636, class: vreg_1, preferred-register: '' }
+  - { id: 637, class: vgpr_32, preferred-register: '' }
+  - { id: 638, class: vreg_64_align2, preferred-register: '' }
+  - { id: 639, class: vreg_1, preferred-register: '' }
+  - { id: 640, class: vgpr_32, preferred-register: '' }
+  - { id: 641, class: vreg_64_align2, preferred-register: '' }
+  - { id: 642, class: vreg_1, preferred-register: '' }
+  - { id: 643, class: vreg_1, preferred-register: '' }
+  - { id: 644, class: vgpr_32, preferred-register: '' }
+  - { id: 645, class: vgpr_32, preferred-register: '' }
+  - { id: 646, class: vgpr_32, preferred-register: '' }
+  - { id: 647, class: vgpr_32, preferred-register: '' }
+  - { id: 648, class: vgpr_32, preferred-register: '' }
+  - { id: 649, class: vgpr_32, preferred-register: '' }
+  - { id: 650, class: vgpr_32, preferred-register: '' }
+  - { id: 651, class: vgpr_32, preferred-register: '' }
+  - { id: 652, class: vgpr_32, preferred-register: '' }
+  - { id: 653, class: vgpr_32, preferred-register: '' }
+  - { id: 654, class: vgpr_32, preferred-register: '' }
+  - { id: 655, class: vgpr_32, preferred-register: '' }
+  - { id: 656, class: vgpr_32, preferred-register: '' }
+  - { id: 657, class: vgpr_32, preferred-register: '' }
+  - { id: 658, class: vgpr_32, preferred-register: '' }
+  - { id: 659, class: vgpr_32, preferred-register: '' }
+  - { id: 660, class: vgpr_32, preferred-register: '' }
+  - { id: 661, class: vgpr_32, preferred-register: '' }
+  - { id: 662, class: vgpr_32, preferred-register: '' }
+  - { id: 663, class: vreg_64_align2, preferred-register: '' }
+  - { id: 664, class: vgpr_32, preferred-register: '' }
+  - { id: 665, class: vgpr_32, preferred-register: '' }
+  - { id: 666, class: vreg_64_align2, preferred-register: '' }
+  - { id: 667, class: vreg_1, preferred-register: '' }
+  - { id: 668, class: sreg_64, preferred-register: '' }
+  - { id: 669, class: sreg_64, preferred-register: '' }
+  - { id: 670, class: sreg_64, preferred-register: '' }
+  - { id: 671, class: sreg_64, preferred-register: '' }
+  - { id: 672, class: sreg_64, preferred-register: '' }
+  - { id: 673, class: sreg_64, preferred-register: '' }
+  - { id: 674, class: sreg_64, preferred-register: '' }
+  - { id: 675, class: sreg_64, preferred-register: '' }
+  - { id: 676, class: sreg_64, preferred-register: '' }
+  - { id: 677, class: sreg_64, preferred-register: '' }
+  - { id: 678, class: sreg_64, preferred-register: '' }
+  - { id: 679, class: sreg_64, preferred-register: '' }
+  - { id: 680, class: sreg_32, preferred-register: '' }
+  - { id: 681, class: sreg_32, preferred-register: '' }
+  - { id: 682, class: sreg_32, preferred-register: '' }
+  - { id: 683, class: sreg_32, preferred-register: '' }
+  - { id: 684, class: sreg_32, preferred-register: '' }
+  - { id: 685, class: sreg_32, preferred-register: '' }
+  - { id: 686, class: sreg_32, preferred-register: '' }
+  - { id: 687, class: sreg_32, preferred-register: '' }
+  - { id: 688, class: sreg_32, preferred-register: '' }
+  - { id: 689, class: sreg_32, preferred-register: '' }
+  - { id: 690, class: sreg_32, preferred-register: '' }
+  - { id: 691, class: sreg_32, preferred-register: '' }
+  - { id: 692, class: sreg_32, preferred-register: '' }
+  - { id: 693, class: sreg_32, preferred-register: '' }
+  - { id: 694, class: sreg_32, preferred-register: '' }
+  - { id: 695, class: sreg_32, preferred-register: '' }
+  - { id: 696, class: sreg_32, preferred-register: '' }
+  - { id: 697, class: sreg_32, preferred-register: '' }
+  - { id: 698, class: sreg_32, preferred-register: '' }
+  - { id: 699, class: sreg_32, preferred-register: '' }
+  - { id: 700, class: sreg_32, preferred-register: '' }
+  - { id: 701, class: sreg_32, preferred-register: '' }
+  - { id: 702, class: sreg_32, preferred-register: '' }
+  - { id: 703, class: sreg_32, preferred-register: '' }
+  - { id: 704, class: sreg_32, preferred-register: '' }
+  - { id: 705, class: sreg_32, preferred-register: '' }
+  - { id: 706, class: sreg_32, preferred-register: '' }
+  - { id: 707, class: sreg_32, preferred-register: '' }
+  - { id: 708, class: sreg_32, preferred-register: '' }
+  - { id: 709, class: sreg_32, preferred-register: '' }
+  - { id: 710, class: sreg_32, preferred-register: '' }
+  - { id: 711, class: sreg_32, preferred-register: '' }
+  - { id: 712, class: sreg_32, preferred-register: '' }
+  - { id: 713, class: sreg_32, preferred-register: '' }
+  - { id: 714, class: sreg_32, preferred-register: '' }
+  - { id: 715, class: sreg_32, preferred-register: '' }
+  - { id: 716, class: vgpr_32, preferred-register: '' }
+  - { id: 717, class: vgpr_32, preferred-register: '' }
+  - { id: 718, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 719, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 720, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 721, class: vgpr_32, preferred-register: '' }
+  - { id: 722, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 723, class: vgpr_32, preferred-register: '' }
+  - { id: 724, class: vgpr_32, preferred-register: '' }
+  - { id: 725, class: sreg_32, preferred-register: '' }
+  - { id: 726, class: sreg_32, preferred-register: '' }
+  - { id: 727, class: sreg_32, preferred-register: '' }
+  - { id: 728, class: sreg_32, preferred-register: '' }
+  - { id: 729, class: sreg_32, preferred-register: '' }
+  - { id: 730, class: sreg_32, preferred-register: '' }
+  - { id: 731, class: sreg_32, preferred-register: '' }
+  - { id: 732, class: sreg_32, preferred-register: '' }
+  - { id: 733, class: sreg_32, preferred-register: '' }
+  - { id: 734, class: sreg_32, preferred-register: '' }
+  - { id: 735, class: sreg_32, preferred-register: '' }
+  - { id: 736, class: sreg_32, preferred-register: '' }
+  - { id: 737, class: sreg_32, preferred-register: '' }
+  - { id: 738, class: sreg_32, preferred-register: '' }
+  - { id: 739, class: sreg_32, preferred-register: '' }
+  - { id: 740, class: sreg_32, preferred-register: '' }
+  - { id: 741, class: sreg_32, preferred-register: '' }
+  - { id: 742, class: sreg_32, preferred-register: '' }
+  - { id: 743, class: sreg_32, preferred-register: '' }
+  - { id: 744, class: sreg_32, preferred-register: '' }
+  - { id: 745, class: sreg_32, preferred-register: '' }
+  - { id: 746, class: sreg_32, preferred-register: '' }
+  - { id: 747, class: sreg_32, preferred-register: '' }
+  - { id: 748, class: sreg_32, preferred-register: '' }
+  - { id: 749, class: sreg_32, preferred-register: '' }
+  - { id: 750, class: sreg_32, preferred-register: '' }
+  - { id: 751, class: sreg_32, preferred-register: '' }
+  - { id: 752, class: sreg_32, preferred-register: '' }
+  - { id: 753, class: sreg_32, preferred-register: '' }
+  - { id: 754, class: sreg_32, preferred-register: '' }
+  - { id: 755, class: sreg_32, preferred-register: '' }
+  - { id: 756, class: sreg_32, preferred-register: '' }
+  - { id: 757, class: sreg_32, preferred-register: '' }
+  - { id: 758, class: sreg_32, preferred-register: '' }
+  - { id: 759, class: sreg_32, preferred-register: '' }
+  - { id: 760, class: sreg_32, preferred-register: '' }
+  - { id: 761, class: vgpr_32, preferred-register: '' }
+  - { id: 762, class: vgpr_32, preferred-register: '' }
+  - { id: 763, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 764, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 765, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 766, class: vgpr_32, preferred-register: '' }
+  - { id: 767, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 768, class: vgpr_32, preferred-register: '' }
+  - { id: 769, class: vgpr_32, preferred-register: '' }
+  - { id: 770, class: vgpr_32, preferred-register: '' }
+  - { id: 771, class: vgpr_32, preferred-register: '' }
+  - { id: 772, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 773, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 774, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 775, class: vgpr_32, preferred-register: '' }
+  - { id: 776, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 777, class: vgpr_32, preferred-register: '' }
+  - { id: 778, class: vgpr_32, preferred-register: '' }
+  - { id: 779, class: vgpr_32, preferred-register: '' }
+  - { id: 780, class: vgpr_32, preferred-register: '' }
+  - { id: 781, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 782, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 783, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 784, class: vgpr_32, preferred-register: '' }
+  - { id: 785, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 786, class: vgpr_32, preferred-register: '' }
+  - { id: 787, class: vgpr_32, preferred-register: '' }
+  - { id: 788, class: vgpr_32, preferred-register: '' }
+  - { id: 789, class: vgpr_32, preferred-register: '' }
+  - { id: 790, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 791, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 792, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 793, class: vgpr_32, preferred-register: '' }
+  - { id: 794, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 795, class: vgpr_32, preferred-register: '' }
+  - { id: 796, class: vgpr_32, preferred-register: '' }
+  - { id: 797, class: vgpr_32, preferred-register: '' }
+  - { id: 798, class: vgpr_32, preferred-register: '' }
+  - { id: 799, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 800, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 801, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 802, class: vgpr_32, preferred-register: '' }
+  - { id: 803, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 804, class: vgpr_32, preferred-register: '' }
+  - { id: 805, class: vgpr_32, preferred-register: '' }
+  - { id: 806, class: vgpr_32, preferred-register: '' }
+  - { id: 807, class: vgpr_32, preferred-register: '' }
+  - { id: 808, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 809, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 810, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 811, class: vgpr_32, preferred-register: '' }
+  - { id: 812, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 813, class: vgpr_32, preferred-register: '' }
+  - { id: 814, class: vgpr_32, preferred-register: '' }
+  - { id: 815, class: vgpr_32, preferred-register: '' }
+  - { id: 816, class: vgpr_32, preferred-register: '' }
+  - { id: 817, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 818, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 819, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 820, class: vgpr_32, preferred-register: '' }
+  - { id: 821, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 822, class: vgpr_32, preferred-register: '' }
+  - { id: 823, class: vgpr_32, preferred-register: '' }
+  - { id: 824, class: vgpr_32, preferred-register: '' }
+  - { id: 825, class: vgpr_32, preferred-register: '' }
+  - { id: 826, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 827, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 828, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 829, class: vgpr_32, preferred-register: '' }
+  - { id: 830, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 831, class: vgpr_32, preferred-register: '' }
+  - { id: 832, class: vgpr_32, preferred-register: '' }
+  - { id: 833, class: vgpr_32, preferred-register: '' }
+  - { id: 834, class: vgpr_32, preferred-register: '' }
+  - { id: 835, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 836, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 837, class: vgpr_32, preferred-register: '' }
+  - { id: 838, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 839, class: vgpr_32, preferred-register: '' }
+  - { id: 840, class: sreg_32_xm0, preferred-register: '' }
+  - { id: 841, class: vgpr_32, preferred-register: '' }
+  - { id: 842, class: vgpr_32, preferred-register: '' }
+  - { id: 843, class: vgpr_32, preferred-register: '' }
+  - { id: 844, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 845, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 846, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 847, class: vgpr_32, preferred-register: '' }
+  - { id: 848, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 849, class: vgpr_32, preferred-register: '' }
+  - { id: 850, class: vgpr_32, preferred-register: '' }
+  - { id: 851, class: vgpr_32, preferred-register: '' }
+  - { id: 852, class: vgpr_32, preferred-register: '' }
+  - { id: 853, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 854, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 855, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 856, class: vgpr_32, preferred-register: '' }
+  - { id: 857, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 858, class: vgpr_32, preferred-register: '' }
+  - { id: 859, class: vgpr_32, preferred-register: '' }
+  - { id: 860, class: vgpr_32, preferred-register: '' }
+  - { id: 861, class: vgpr_32, preferred-register: '' }
+  - { id: 862, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 863, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 864, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 865, class: vgpr_32, preferred-register: '' }
+  - { id: 866, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 867, class: vgpr_32, preferred-register: '' }
+  - { id: 868, class: vgpr_32, preferred-register: '' }
+  - { id: 869, class: vgpr_32, preferred-register: '' }
+  - { id: 870, class: vgpr_32, preferred-register: '' }
+  - { id: 871, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 872, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 873, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 874, class: vgpr_32, preferred-register: '' }
+  - { id: 875, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 876, class: vgpr_32, preferred-register: '' }
+  - { id: 877, class: vgpr_32, preferred-register: '' }
+  - { id: 878, class: vgpr_32, preferred-register: '' }
+  - { id: 879, class: vgpr_32, preferred-register: '' }
+  - { id: 880, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 881, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 882, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 883, class: vgpr_32, preferred-register: '' }
+  - { id: 884, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 885, class: vgpr_32, preferred-register: '' }
+  - { id: 886, class: vgpr_32, preferred-register: '' }
+  - { id: 887, class: vgpr_32, preferred-register: '' }
+  - { id: 888, class: vgpr_32, preferred-register: '' }
+  - { id: 889, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 890, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 891, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 892, class: vgpr_32, preferred-register: '' }
+  - { id: 893, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 894, class: vgpr_32, preferred-register: '' }
+  - { id: 895, class: vgpr_32, preferred-register: '' }
+  - { id: 896, class: vgpr_32, preferred-register: '' }
+  - { id: 897, class: vgpr_32, preferred-register: '' }
+  - { id: 898, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 899, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 900, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 901, class: vgpr_32, preferred-register: '' }
+  - { id: 902, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 903, class: vgpr_32, preferred-register: '' }
+  - { id: 904, class: vgpr_32, preferred-register: '' }
+  - { id: 905, class: vgpr_32, preferred-register: '' }
+  - { id: 906, class: vgpr_32, preferred-register: '' }
+  - { id: 907, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 908, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 909, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 910, class: vgpr_32, preferred-register: '' }
+  - { id: 911, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 912, class: vgpr_32, preferred-register: '' }
+  - { id: 913, class: vgpr_32, preferred-register: '' }
+  - { id: 914, class: vgpr_32, preferred-register: '' }
+  - { id: 915, class: vgpr_32, preferred-register: '' }
+  - { id: 916, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 917, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 918, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 919, class: vgpr_32, preferred-register: '' }
+  - { id: 920, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 921, class: vgpr_32, preferred-register: '' }
+  - { id: 922, class: vgpr_32, preferred-register: '' }
+  - { id: 923, class: vgpr_32, preferred-register: '' }
+  - { id: 924, class: vgpr_32, preferred-register: '' }
+  - { id: 925, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 926, class: sreg_64_xexec, preferred-register: '' }
+  - { id: 927, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 928, class: vgpr_32, preferred-register: '' }
+  - { id: 929, class: sreg_32_xexec_hi_and_sreg_32_xm0, preferred-register: '' }
+  - { id: 930, class: vgpr_32, preferred-register: '' }
+  - { id: 931, class: vgpr_32, preferred-register: '' }
+  - { id: 932, class: sreg_64, preferred-register: '' }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '%176' }
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%178' }
+  - { reg: '$sgpr6', virtual-reg: '%179' }
+  - { reg: '$sgpr7', virtual-reg: '%180' }
+  - { reg: '$sgpr8', virtual-reg: '%181' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 136
+  maxKernArgAlign: 8
+  ldsSize:         16384
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     true
+  hasSpilledSGPRs: false
+  hasSpilledVGPRs: false
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sgpr32'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    workGroupIDY:    { reg: '$sgpr7' }
+    workGroupIDZ:    { reg: '$sgpr8' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+    workItemIDX:     { reg: '$vgpr0', mask: 1023 }
+    workItemIDY:     { reg: '$vgpr0', mask: 1047552 }
+  psInputAddr:     0
+  psInputEnable:   0
+  mode:
+    ieee:            true
+    dx10-clamp:      true
+    fp32-input-denormals: true
+    fp32-output-denormals: true
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       8
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+  longBranchReservedReg: ''
+body:             |
+  ; CHECK-LABEL: name: _ZL20rocblas_gemvn_kernelILi64ELi16EiffffEviiT3_lPKT4_lT1_lS3_lilS0_lPT5_lil
+  ; CHECK: bb.0 (%ir-block.18):
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.54(0x40000000)
+  ; CHECK-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr8
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 136
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub0(p4)
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]].sub1(p4)
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY5]], [[COPY7]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY6]], [[COPY8]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]](p4), 136, 0 :: (invariant load (s64) from %ir.20, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY3]](p4), 144, 0 :: (invariant load (s32) from %ir.20 + 8, addrspace 4)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; CHECK-NEXT:   S_CMP_LT_U32 [[COPY2]], killed [[COPY10]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 18
+  ; CHECK-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12
+  ; CHECK-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_MOV_B32_3]], killed [[S_MOV_B32_2]], implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub0
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE2]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY11]], [[COPY13]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY12]], [[COPY14]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_1]], %subreg.sub0, [[S_ADDC_U32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (invariant load (s16) from %ir.27, !tbaa !10, addrspace 4)
+  ; CHECK-NEXT:   S_CMP_LT_U32 [[COPY1]], killed [[COPY9]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 20
+  ; CHECK-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 14
+  ; CHECK-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_MOV_B32_5]], killed [[S_MOV_B32_4]], implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub0
+  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE4]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_2:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY15]], [[COPY17]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_2:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY16]], [[COPY18]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_2]], %subreg.sub0, [[S_ADDC_U32_2]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE5]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (invariant load (s16) from %ir.33, !tbaa !10, addrspace 4)
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_USHORT_SADDR1]], [[GLOBAL_LOAD_USHORT_SADDR]], implicit $exec
+  ; CHECK-NEXT:   S_CMP_LT_U32 [[COPY]], killed [[S_LOAD_DWORD_IMM]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 22
+  ; CHECK-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; CHECK-NEXT:   [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_MOV_B32_7]], killed [[S_MOV_B32_6]], implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_CSELECT_B32_2]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+  ; CHECK-NEXT:   [[COPY20:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+  ; CHECK-NEXT:   [[COPY21:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub0
+  ; CHECK-NEXT:   [[COPY22:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE6]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_3:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY19]], [[COPY21]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_3:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY20]], [[COPY22]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_3]], %subreg.sub0, [[S_ADDC_U32_3]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE7]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (invariant load (s16) from %ir.40, !tbaa !10, addrspace 4)
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[V_MUL_LO_U32_e64_]], [[GLOBAL_LOAD_USHORT_SADDR2]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
+  ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 killed [[V_MUL_LO_U32_e64_1]], killed [[S_MOV_B32_8]], implicit $exec
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1 (%ir-block.44):
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.53(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY3]](p4), 0, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset65, addrspace 4)
+  ; CHECK-NEXT:   [[COPY23:%[0-9]+]]:sgpr_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[COPY3]](p4), 88, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset55, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY24:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_9]]
+  ; CHECK-NEXT:   [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64 = contract nofpexcept V_CMP_EQ_F32_e64 0, [[COPY23]], 0, [[COPY24]], 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+  ; CHECK-NEXT:   [[COPY25:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_10]]
+  ; CHECK-NEXT:   [[V_CMP_EQ_F32_e64_1:%[0-9]+]]:sreg_64 = contract nofpexcept V_CMP_EQ_F32_e64 0, [[S_LOAD_DWORD_IMM1]], 0, [[COPY25]], 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 killed [[V_CMP_EQ_F32_e64_]], killed [[V_CMP_EQ_F32_e64_1]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[S_AND_B64_]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = COPY [[S_AND_B64_1]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.53, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2 (%ir-block.49):
+  ; CHECK-NEXT:   successors: %bb.3(0x50000000), %bb.55(0x30000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY26:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
+  ; CHECK-NEXT:   [[COPY27:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
+  ; CHECK-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sgpr_96 = REG_SEQUENCE [[COPY27]], %subreg.sub0, [[COPY26]], %subreg.sub1, [[COPY23]], %subreg.sub2
+  ; CHECK-NEXT:   [[COPY28:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE8]].sub0
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY3]](p4), 104, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset57, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY3]](p4), 120, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset61, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]](p4), 128, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset63, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[COPY29:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM1]].sub1
+  ; CHECK-NEXT:   [[COPY30:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM1]].sub0
+  ; CHECK-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY30]], %subreg.sub0, killed [[COPY29]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY31:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM1]].sub3
+  ; CHECK-NEXT:   [[COPY32:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM1]].sub2
+  ; CHECK-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY32]], %subreg.sub0, killed [[COPY31]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY33:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+  ; CHECK-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[COPY1]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE11:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_ASHR_I32_]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY34:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE11]].sub1
+  ; CHECK-NEXT:   [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY1]], killed [[COPY33]]
+  ; CHECK-NEXT:   [[COPY35:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
+  ; CHECK-NEXT:   [[S_MUL_HI_U32_:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY1]], [[COPY35]]
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_MUL_HI_U32_]], killed [[S_MUL_I32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MUL_I32_1:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[COPY34]], [[COPY35]]
+  ; CHECK-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_]], killed [[S_MUL_I32_1]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MUL_I32_2:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY1]], [[COPY35]]
+  ; CHECK-NEXT:   [[REG_SEQUENCE12:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_2]], %subreg.sub0, killed [[S_ADD_I32_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[REG_SEQUENCE12]], [[S_MOV_B32_11]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY36:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub0
+  ; CHECK-NEXT:   [[COPY37:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE9]].sub1
+  ; CHECK-NEXT:   [[COPY38:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_]].sub0
+  ; CHECK-NEXT:   [[COPY39:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY36]], [[COPY38]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY37]], [[COPY39]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE13:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_4]], %subreg.sub0, [[S_ADDC_U32_4]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LSHL_B64_1:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[REG_SEQUENCE10]], [[S_MOV_B32_11]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY40:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE13]].sub0
+  ; CHECK-NEXT:   [[COPY41:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE13]].sub1
+  ; CHECK-NEXT:   [[COPY42:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_1]].sub0
+  ; CHECK-NEXT:   [[COPY43:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_1]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_5:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY40]], [[COPY42]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY41]], [[COPY43]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE14:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_5]], %subreg.sub0, [[S_ADDC_U32_5]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+  ; CHECK-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY4]](s32), killed [[S_MOV_B32_12]], implicit $exec
+  ; CHECK-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[COPY4]](s32), 10, 10, implicit $exec
+  ; CHECK-NEXT:   [[V_MAD_U32_U24_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_MAD_U32_U24_e64 [[V_BFE_U32_e64_]], [[GLOBAL_LOAD_USHORT_SADDR]], [[V_AND_B32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY44:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_13]]
+  ; CHECK-NEXT:   [[V_CMP_NEQ_F32_e64_:%[0-9]+]]:sreg_64 = nofpexcept V_CMP_NEQ_F32_e64 0, [[COPY23]], 0, [[COPY44]], 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_AND_B64_2:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_NEQ_F32_e64_]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = COPY [[S_AND_B64_2]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.3, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.55:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+  ; CHECK-NEXT:   [[COPY46:%[0-9]+]]:vreg_64_align2 = COPY [[DEF]]
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3 (%ir-block.61):
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_MAD_U32_U24_e64_]], killed [[S_MOV_B32_14]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+  ; CHECK-NEXT:   [[COPY48:%[0-9]+]]:vreg_64_align2 = COPY [[DEF2]]
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.Flow90:
+  ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.23(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.55, %23, %bb.6
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY45]], %bb.55, %21, %bb.6
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:vreg_64_align2 = PHI [[COPY46]], %bb.55, %22, %bb.6
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.55, [[S_MOV_B64_2]], %bb.6
+  ; CHECK-NEXT:   [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[PHI3]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = COPY [[S_AND_B64_3]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.23
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5 (%ir-block.70):
+  ; CHECK-NEXT:   successors: %bb.7(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+  ; CHECK-NEXT:   [[COPY49:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_15]]
+  ; CHECK-NEXT:   [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_LSHL_ADD_U32_e64 [[COPY2]], [[COPY49]], [[V_MAD_U32_U24_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_LSHL_ADD_U32_e64_]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT:   [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[DEF5]]
+  ; CHECK-NEXT:   [[COPY51:%[0-9]+]]:vreg_64_align2 = COPY [[DEF4]]
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.7
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6.Flow91:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_2]], %bb.3, %669, %bb.9
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[COPY47]], %bb.3, %26, %bb.9
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:vreg_64_align2 = PHI [[COPY48]], %bb.3, %27, %bb.9
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7 (%ir-block.80):
+  ; CHECK-NEXT:   successors: %bb.8(0x50000000), %bb.48(0x30000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_16]]
+  ; CHECK-NEXT:   [[V_CMP_EQ_F32_e64_2:%[0-9]+]]:sreg_64 = nofpexcept V_CMP_EQ_F32_e64 0, [[S_LOAD_DWORD_IMM1]], 0, [[COPY52]], 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_LSHL_ADD_U32_e64_]], [[S_LOAD_DWORD_IMM2]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_2]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE15:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_2]], %subreg.sub0, [[V_ASHRREV_I32_e64_]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_AND_B64_4:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_EQ_F32_e64_2]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = COPY [[S_AND_B64_4]]
+  ; CHECK-NEXT:   [[COPY53:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_16]], implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8 (%ir-block.84):
+  ; CHECK-NEXT:   successors: %bb.48(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 killed [[S_MOV_B32_17]], [[REG_SEQUENCE15]], implicit $exec
+  ; CHECK-NEXT:   [[COPY54:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub0
+  ; CHECK-NEXT:   [[COPY55:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub0
+  ; CHECK-NEXT:   [[COPY56:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub1
+  ; CHECK-NEXT:   [[COPY57:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY54]], [[COPY55]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[COPY56]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY58]], [[COPY57]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE16:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE16]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.85, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_MUL_F32_e64 0, killed [[GLOBAL_LOAD_DWORD]], 0, [[S_LOAD_DWORD_IMM1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.48
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9.Flow92:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_3]], %bb.5, %672, %bb.48
+  ; CHECK-NEXT:   [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[COPY50]], %bb.5, %171, %bb.48
+  ; CHECK-NEXT:   [[PHI9:%[0-9]+]]:vreg_64_align2 = PHI [[COPY51]], %bb.5, %24, %bb.48
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 [[PHI7]], $exec, implicit-def $scc
+  ; CHECK-NEXT:   [[COPY59:%[0-9]+]]:sreg_64 = COPY [[S_AND_B64_5]]
+  ; CHECK-NEXT:   S_BRANCH %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10 (%ir-block.91):
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY60:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE11]]
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY3]](p4), 24, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset39, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY3]](p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset43, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY3]](p4), 48, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset45, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]](p4), 64, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset45 + 16, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY3]](p4), 72, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset51, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM3:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY3]](p4), 80, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset53, align 16, addrspace 4)
+  ; CHECK-NEXT:   [[COPY61:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM2]].sub1
+  ; CHECK-NEXT:   [[COPY62:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM2]].sub0
+  ; CHECK-NEXT:   [[REG_SEQUENCE17:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY62]], %subreg.sub0, killed [[COPY61]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY63:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM2]].sub3
+  ; CHECK-NEXT:   [[COPY64:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM2]].sub2
+  ; CHECK-NEXT:   [[REG_SEQUENCE18:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY64]], %subreg.sub0, killed [[COPY63]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY65:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM3]].sub0
+  ; CHECK-NEXT:   [[COPY66:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM3]].sub1
+  ; CHECK-NEXT:   [[COPY67:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM2]].sub1
+  ; CHECK-NEXT:   [[COPY68:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
+  ; CHECK-NEXT:   [[COPY69:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM3]].sub3
+  ; CHECK-NEXT:   [[COPY70:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM3]].sub2
+  ; CHECK-NEXT:   [[REG_SEQUENCE19:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY70]], %subreg.sub0, killed [[COPY69]], %subreg.sub1
+  ; CHECK-NEXT:   [[REG_SEQUENCE20:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY68]], %subreg.sub0, killed [[COPY67]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY71:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM3]].sub1
+  ; CHECK-NEXT:   [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY60]].sub0
+  ; CHECK-NEXT:   [[S_MUL_I32_3:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY72]], killed [[COPY66]]
+  ; CHECK-NEXT:   [[S_MUL_HI_U32_1:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY72]], [[COPY65]]
+  ; CHECK-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_MUL_HI_U32_1]], killed [[S_MUL_I32_3]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY73:%[0-9]+]]:sreg_32 = COPY [[COPY60]].sub1
+  ; CHECK-NEXT:   [[S_MUL_I32_4:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY73]], [[COPY65]]
+  ; CHECK-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_2]], killed [[S_MUL_I32_4]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MUL_I32_5:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY72]], [[COPY65]]
+  ; CHECK-NEXT:   [[REG_SEQUENCE21:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_5]], %subreg.sub0, killed [[S_ADD_I32_3]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[S_LSHL_B64_2:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[REG_SEQUENCE21]], [[S_MOV_B32_18]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY74:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE17]].sub0
+  ; CHECK-NEXT:   [[COPY75:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE17]].sub1
+  ; CHECK-NEXT:   [[COPY76:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_2]].sub0
+  ; CHECK-NEXT:   [[COPY77:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_2]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_6:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY74]], [[COPY76]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_6:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY75]], [[COPY77]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE22:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_6]], %subreg.sub0, [[S_ADDC_U32_6]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LSHL_B64_3:%[0-9]+]]:sreg_64 = S_LSHL_B64 killed [[REG_SEQUENCE18]], [[S_MOV_B32_18]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY78:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE22]].sub0
+  ; CHECK-NEXT:   [[COPY79:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE22]].sub1
+  ; CHECK-NEXT:   [[COPY80:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_3]].sub0
+  ; CHECK-NEXT:   [[COPY81:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_3]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_7:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY78]], [[COPY80]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_7:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY79]], [[COPY81]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE23:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_7]], %subreg.sub0, [[S_ADDC_U32_7]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MUL_I32_6:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY72]], killed [[COPY71]]
+  ; CHECK-NEXT:   [[COPY82:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM3]].sub0
+  ; CHECK-NEXT:   [[S_MUL_HI_U32_2:%[0-9]+]]:sreg_32 = S_MUL_HI_U32 [[COPY72]], [[COPY82]]
+  ; CHECK-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_MUL_HI_U32_2]], killed [[S_MUL_I32_6]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MUL_I32_7:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY73]], [[COPY82]]
+  ; CHECK-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[S_ADD_I32_4]], killed [[S_MUL_I32_7]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MUL_I32_8:%[0-9]+]]:sreg_32 = S_MUL_I32 [[COPY72]], [[COPY82]]
+  ; CHECK-NEXT:   [[REG_SEQUENCE24:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MUL_I32_8]], %subreg.sub0, killed [[S_ADD_I32_5]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LSHL_B64_4:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[REG_SEQUENCE24]], [[S_MOV_B32_18]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY83:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub0
+  ; CHECK-NEXT:   [[COPY84:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE19]].sub1
+  ; CHECK-NEXT:   [[COPY85:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_4]].sub0
+  ; CHECK-NEXT:   [[COPY86:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_4]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_8:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY83]], [[COPY85]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_8:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY84]], [[COPY86]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE25:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_8]], %subreg.sub0, [[S_ADDC_U32_8]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LSHL_B64_5:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[REG_SEQUENCE20]], [[S_MOV_B32_18]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY87:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub0
+  ; CHECK-NEXT:   [[COPY88:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE25]].sub1
+  ; CHECK-NEXT:   [[COPY89:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_5]].sub0
+  ; CHECK-NEXT:   [[COPY90:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_5]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_9:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY87]], [[COPY89]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_9:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY88]], [[COPY90]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE26:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_9]], %subreg.sub0, [[S_ADDC_U32_9]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY91:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE8]].sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+  ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], killed [[S_MOV_B32_19]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[S_LSHL_B32_]], [[V_AND_B32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 31
+  ; CHECK-NEXT:   [[S_ASHR_I32_1:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[COPY91]], killed [[S_MOV_B32_20]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 26
+  ; CHECK-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 killed [[S_ASHR_I32_1]], killed [[S_MOV_B32_21]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY91]], killed [[S_LSHR_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 -64
+  ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_ADD_I32_6]], killed [[S_MOV_B32_22]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 [[S_MOV_B32_18]], [[V_BFE_U32_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_LSHLREV_B32_e64_]], [[S_AND_B32_]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY92:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_23]], implicit $exec
+  ; CHECK-NEXT:   [[COPY93:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_23]], implicit $exec
+  ; CHECK-NEXT:   [[COPY94:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_23]], implicit $exec
+  ; CHECK-NEXT:   [[COPY95:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_23]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_1]], %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11..lr.ph.i:
+  ; CHECK-NEXT:   successors: %bb.13(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY96:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE19]]
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[COPY97:%[0-9]+]]:sreg_64 = COPY [[V_CMP_LT_I32_e64_3]]
+  ; CHECK-NEXT:   [[S_MOV_B32_24:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[S_MOV_B32_24]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_4:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_1]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[COPY98:%[0-9]+]]:sreg_64 = COPY [[V_CMP_LT_I32_e64_4]]
+  ; CHECK-NEXT:   [[S_MOV_B32_25:%[0-9]+]]:sreg_32 = S_MOV_B32 128
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_25]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_5:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_2]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[COPY99:%[0-9]+]]:sreg_64 = COPY [[V_CMP_LT_I32_e64_5]]
+  ; CHECK-NEXT:   [[S_MOV_B32_26:%[0-9]+]]:sreg_32 = S_MOV_B32 192
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_26]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_6:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_3]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[COPY100:%[0-9]+]]:sreg_64 = COPY [[V_CMP_LT_I32_e64_6]]
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM3]], [[V_LSHLREV_B32_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[V_MUL_LO_U32_e64_3]], [[S_LOAD_DWORD_IMM3]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_27:%[0-9]+]]:sreg_32 = S_MOV_B32 6
+  ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[S_LOAD_DWORD_IMM3]], [[S_MOV_B32_27]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = nuw nsw V_ADD_U32_e64 [[V_LSHLREV_B32_e64_]], [[S_MOV_B32_18]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_4:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM3]], [[V_ADD_U32_e64_5]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_28:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = nuw nsw V_ADD_U32_e64 [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_28]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_5:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM3]], [[V_ADD_U32_e64_6]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_6:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_BFE_U32_e64_]], [[S_LOAD_DWORD_IMM3]], implicit $exec
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_18]], killed [[V_MUL_LO_U32_e64_6]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_7:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM4]], [[V_LSHLREV_B32_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[V_MUL_LO_U32_e64_7]], [[S_LOAD_DWORD_IMM4]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[S_LOAD_DWORD_IMM4]], [[S_MOV_B32_27]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_8:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM4]], [[V_ADD_U32_e64_5]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_9:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[S_LOAD_DWORD_IMM4]], [[V_ADD_U32_e64_6]], implicit $exec
+  ; CHECK-NEXT:   [[COPY101:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_4]].sub0
+  ; CHECK-NEXT:   [[COPY102:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_4]].sub1
+  ; CHECK-NEXT:   [[COPY103:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_5]].sub0
+  ; CHECK-NEXT:   [[COPY104:%[0-9]+]]:sreg_32 = COPY [[S_LSHL_B64_5]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_10:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY101]], [[COPY103]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_10:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY102]], [[COPY104]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE27:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_10]], %subreg.sub0, [[S_ADDC_U32_10]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_10:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_BFE_U32_e64_]], [[S_LOAD_DWORD_IMM4]], implicit $exec
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_18]], killed [[V_MUL_LO_U32_e64_10]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_1:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_LSHLREV_B32_e64_2]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE28:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_LSHLREV_B32_e64_2]], %subreg.sub0, [[V_ASHRREV_I32_e64_1]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = nsw V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE28]], implicit $exec
+  ; CHECK-NEXT:   [[COPY105:%[0-9]+]]:sreg_32 = COPY [[COPY96]].sub0
+  ; CHECK-NEXT:   [[COPY106:%[0-9]+]]:sreg_32 = COPY [[COPY96]].sub1
+  ; CHECK-NEXT:   [[COPY107:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE27]].sub0
+  ; CHECK-NEXT:   [[COPY108:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE27]].sub1
+  ; CHECK-NEXT:   [[S_ADD_U32_11:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY105]], [[COPY107]], implicit-def $scc
+  ; CHECK-NEXT:   [[S_ADDC_U32_11:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY106]], [[COPY108]], implicit-def $scc, implicit $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE29:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_11]], %subreg.sub0, [[S_ADDC_U32_11]], %subreg.sub1
+  ; CHECK-NEXT:   [[COPY109:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE29]].sub0
+  ; CHECK-NEXT:   [[COPY110:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_1]].sub0
+  ; CHECK-NEXT:   [[COPY111:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE29]].sub1
+  ; CHECK-NEXT:   [[COPY112:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_1]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY109]], [[COPY110]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY113:%[0-9]+]]:vgpr_32 = COPY [[COPY111]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY113]], [[COPY112]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE30:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, [[V_ADDC_U32_e64_2]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc
+  ; CHECK-NEXT:   [[REG_SEQUENCE31:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_LSHL_B32_2]], %subreg.sub0, [[S_ASHR_I32_2]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_LSHL_B64_6:%[0-9]+]]:sreg_64 = nsw S_LSHL_B64 killed [[REG_SEQUENCE31]], [[S_MOV_B32_18]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; CHECK-NEXT:   [[COPY114:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]], implicit $exec
+  ; CHECK-NEXT:   [[COPY115:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]], implicit $exec
+  ; CHECK-NEXT:   [[COPY116:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]], implicit $exec
+  ; CHECK-NEXT:   [[COPY117:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]], implicit $exec
+  ; CHECK-NEXT:   [[COPY118:%[0-9]+]]:sreg_64 = COPY [[COPY97]]
+  ; CHECK-NEXT:   [[COPY119:%[0-9]+]]:sreg_64 = COPY [[COPY98]]
+  ; CHECK-NEXT:   [[COPY120:%[0-9]+]]:sreg_64 = COPY [[COPY99]]
+  ; CHECK-NEXT:   [[COPY121:%[0-9]+]]:sreg_64 = COPY [[COPY100]]
+  ; CHECK-NEXT:   S_BRANCH %bb.13
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12.Flow89:
+  ; CHECK-NEXT:   successors: %bb.24(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI10:%[0-9]+]]:vgpr_32 = PHI [[V_LSHLREV_B32_e64_]], %bb.10, %98, %bb.22
+  ; CHECK-NEXT:   [[PHI11:%[0-9]+]]:vgpr_32 = PHI [[COPY92]], %bb.10, %106, %bb.22
+  ; CHECK-NEXT:   [[PHI12:%[0-9]+]]:vgpr_32 = PHI [[COPY93]], %bb.10, %105, %bb.22
+  ; CHECK-NEXT:   [[PHI13:%[0-9]+]]:vgpr_32 = PHI [[COPY94]], %bb.10, %104, %bb.22
+  ; CHECK-NEXT:   [[PHI14:%[0-9]+]]:vgpr_32 = PHI [[COPY95]], %bb.10, %103, %bb.22
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY91]], [[S_AND_B32_]], implicit-def dead $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.24
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13 (%ir-block.146):
+  ; CHECK-NEXT:   successors: %bb.14(0x40000000), %bb.21(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI15:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_4]], %bb.11, %102, %bb.21
+  ; CHECK-NEXT:   [[PHI16:%[0-9]+]]:vreg_64_align2 = PHI [[REG_SEQUENCE30]], %bb.11, %101, %bb.21
+  ; CHECK-NEXT:   [[PHI17:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_29]], %bb.11, %100, %bb.21
+  ; CHECK-NEXT:   [[PHI18:%[0-9]+]]:vgpr_32 = PHI [[V_ADD_U32_e64_]], %bb.11, %99, %bb.21
+  ; CHECK-NEXT:   [[PHI19:%[0-9]+]]:vgpr_32 = PHI [[V_LSHLREV_B32_e64_]], %bb.11, %98, %bb.21
+  ; CHECK-NEXT:   [[PHI20:%[0-9]+]]:vgpr_32 = PHI [[COPY114]], %bb.11, %97, %bb.21
+  ; CHECK-NEXT:   [[PHI21:%[0-9]+]]:vgpr_32 = PHI [[COPY115]], %bb.11, %96, %bb.21
+  ; CHECK-NEXT:   [[PHI22:%[0-9]+]]:vgpr_32 = PHI [[COPY116]], %bb.11, %95, %bb.21
+  ; CHECK-NEXT:   [[PHI23:%[0-9]+]]:vgpr_32 = PHI [[COPY117]], %bb.11, %94, %bb.21
+  ; CHECK-NEXT:   [[SI_IF3:%[0-9]+]]:sreg_64 = SI_IF [[COPY118]], %bb.21, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14 (%ir-block.150):
+  ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.20(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[PHI16]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.lsr.iv33, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_7]], [[PHI17]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_2:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_8]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE32:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_8]], %subreg.sub0, [[V_ASHRREV_I32_e64_2]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_2:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE32]], implicit $exec
+  ; CHECK-NEXT:   [[COPY122:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY123:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_2]].sub0
+  ; CHECK-NEXT:   [[COPY124:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY125:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_2]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY122]], [[COPY123]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY126:%[0-9]+]]:vgpr_32 = COPY [[COPY124]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_4:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY126]], [[COPY125]], killed [[V_ADD_CO_U32_e64_5]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE33:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_4]], %subreg.sub0, [[V_ADDC_U32_e64_4]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD2:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE33]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.154, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_8]], [[PHI17]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_3:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_9]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE34:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_9]], %subreg.sub0, [[V_ASHRREV_I32_e64_3]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_3:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE34]], implicit $exec
+  ; CHECK-NEXT:   [[COPY127:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY128:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_3]].sub0
+  ; CHECK-NEXT:   [[COPY129:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY130:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_3]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_6:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY127]], [[COPY128]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY131:%[0-9]+]]:vgpr_32 = COPY [[COPY129]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_6:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY131]], [[COPY130]], killed [[V_ADD_CO_U32_e64_7]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE35:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_6]], %subreg.sub0, [[V_ADDC_U32_e64_6]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE35]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.158, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_9]], [[PHI17]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_4:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_10]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE36:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_10]], %subreg.sub0, [[V_ASHRREV_I32_e64_4]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_4:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE36]], implicit $exec
+  ; CHECK-NEXT:   [[COPY132:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY133:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_4]].sub0
+  ; CHECK-NEXT:   [[COPY134:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY135:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_4]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_8:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_9:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY132]], [[COPY133]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY136:%[0-9]+]]:vgpr_32 = COPY [[COPY134]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_8:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_9:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY136]], [[COPY135]], killed [[V_ADD_CO_U32_e64_9]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE37:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_8]], %subreg.sub0, [[V_ADDC_U32_e64_8]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE37]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.162, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_LSHLREV_B32_e64_1]], [[PHI18]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_5:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_11]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE38:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_11]], %subreg.sub0, [[V_ASHRREV_I32_e64_5]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_5:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE38]], implicit $exec
+  ; CHECK-NEXT:   [[COPY137:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY138:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_5]].sub0
+  ; CHECK-NEXT:   [[COPY139:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY140:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_5]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_10:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_11:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY137]], [[COPY138]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY141:%[0-9]+]]:vgpr_32 = COPY [[COPY139]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_10:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_11:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY141]], [[COPY140]], killed [[V_ADD_CO_U32_e64_11]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE39:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_10]], %subreg.sub0, [[V_ADDC_U32_e64_10]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.166, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD5]], 0, [[PHI20]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_4]], [[PHI18]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_6:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_12]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE40:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_12]], %subreg.sub0, [[V_ASHRREV_I32_e64_6]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_6:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE40]], implicit $exec
+  ; CHECK-NEXT:   [[COPY142:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY143:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_6]].sub0
+  ; CHECK-NEXT:   [[COPY144:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY145:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_6]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_12:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_13:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY142]], [[COPY143]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY146:%[0-9]+]]:vgpr_32 = COPY [[COPY144]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_12:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_13:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY146]], [[COPY145]], killed [[V_ADD_CO_U32_e64_13]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE41:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_12]], %subreg.sub0, [[V_ADDC_U32_e64_12]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.172, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD6]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_4]], [[PHI18]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_7:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_13]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE42:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_13]], %subreg.sub0, [[V_ASHRREV_I32_e64_7]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_7:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE42]], implicit $exec
+  ; CHECK-NEXT:   [[COPY147:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY148:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_7]].sub0
+  ; CHECK-NEXT:   [[COPY149:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY150:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_7]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_14:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_15:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY147]], [[COPY148]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY151:%[0-9]+]]:vgpr_32 = COPY [[COPY149]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_14:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_15:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY151]], [[COPY150]], killed [[V_ADD_CO_U32_e64_15]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE43:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_14]], %subreg.sub0, [[V_ADDC_U32_e64_14]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.178, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD7]], 0, [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_5]], [[PHI18]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_8:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_14]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE44:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_14]], %subreg.sub0, [[V_ASHRREV_I32_e64_8]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_8:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_18]], killed [[REG_SEQUENCE44]], implicit $exec
+  ; CHECK-NEXT:   [[COPY152:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY153:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_8]].sub0
+  ; CHECK-NEXT:   [[COPY154:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY155:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_8]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_16:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_17:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY152]], [[COPY153]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY156:%[0-9]+]]:vgpr_32 = COPY [[COPY154]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_16:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_17:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY156]], [[COPY155]], killed [[V_ADD_CO_U32_e64_17]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE45:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_16]], %subreg.sub0, [[V_ADDC_U32_e64_16]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.184, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD8]], 0, [[V_FMAC_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF4:%[0-9]+]]:sreg_64 = SI_IF [[COPY119]], %bb.20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.15
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.15 (%ir-block.191):
+  ; CHECK-NEXT:   successors: %bb.16(0x40000000), %bb.19(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.192, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_4:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD9]], 0, [[PHI21]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.196, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_5:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD10]], 0, [[V_FMAC_F32_e64_4]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.200, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_6:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD11]], 0, [[V_FMAC_F32_e64_5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.204, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_7:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD12]], 0, [[V_FMAC_F32_e64_6]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF5:%[0-9]+]]:sreg_64 = SI_IF [[COPY120]], %bb.19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.16
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.16 (%ir-block.211):
+  ; CHECK-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.212, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_8:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD13]], 0, [[PHI22]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.216, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_9:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD14]], 0, [[V_FMAC_F32_e64_8]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.220, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_10:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD15]], 0, [[V_FMAC_F32_e64_9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD16:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.224, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_11:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD16]], 0, [[V_FMAC_F32_e64_10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF6:%[0-9]+]]:sreg_64 = SI_IF [[COPY121]], %bb.18, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.17 (%ir-block.231):
+  ; CHECK-NEXT:   successors: %bb.18(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD17:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.232, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_12:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD17]], 0, [[PHI23]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD18:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.236, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_13:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD18]], 0, [[V_FMAC_F32_e64_12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD19:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.240, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_14:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD19]], 0, [[V_FMAC_F32_e64_13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD20:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.244, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_15:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD20]], 0, [[V_FMAC_F32_e64_14]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.18.Flow85:
+  ; CHECK-NEXT:   successors: %bb.19(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI24:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.16, [[V_FMAC_F32_e64_15]], %bb.17
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF6]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.19.Flow86:
+  ; CHECK-NEXT:   successors: %bb.20(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI25:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.15, [[V_FMAC_F32_e64_11]], %bb.18
+  ; CHECK-NEXT:   [[PHI26:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.15, [[PHI24]], %bb.18
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.20.Flow87:
+  ; CHECK-NEXT:   successors: %bb.21(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI27:%[0-9]+]]:vgpr_32 = PHI [[PHI21]], %bb.14, [[V_FMAC_F32_e64_7]], %bb.19
+  ; CHECK-NEXT:   [[PHI28:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.14, [[PHI25]], %bb.19
+  ; CHECK-NEXT:   [[PHI29:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.14, [[PHI26]], %bb.19
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF4]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.21 (%ir-block.254):
+  ; CHECK-NEXT:   successors: %bb.22(0x04000000), %bb.13(0x7c000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI30:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.13, [[PHI29]], %bb.20
+  ; CHECK-NEXT:   [[PHI31:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.13, [[PHI28]], %bb.20
+  ; CHECK-NEXT:   [[PHI32:%[0-9]+]]:vgpr_32 = PHI [[PHI21]], %bb.13, [[PHI27]], %bb.20
+  ; CHECK-NEXT:   [[PHI33:%[0-9]+]]:vgpr_32 = PHI [[PHI20]], %bb.13, [[V_FMAC_F32_e64_3]], %bb.20
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = nuw nsw V_ADD_U32_e64 [[PHI19]], [[S_MOV_B32_24]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI18]], [[S_LSHL_B32_1]], 0, implicit $exec
+  ; CHECK-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[PHI17]], [[S_LSHL_B32_2]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY157:%[0-9]+]]:vgpr_32 = COPY [[PHI16]].sub0
+  ; CHECK-NEXT:   [[COPY158:%[0-9]+]]:sreg_32_xm0 = COPY [[S_LSHL_B64_6]].sub0
+  ; CHECK-NEXT:   [[COPY159:%[0-9]+]]:vgpr_32 = COPY [[PHI16]].sub1
+  ; CHECK-NEXT:   [[COPY160:%[0-9]+]]:sreg_32_xm0 = COPY [[S_LSHL_B64_6]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_18:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_19:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY157]], [[COPY158]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY161:%[0-9]+]]:vgpr_32 = COPY [[COPY160]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_18:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_19:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY159]], [[COPY161]], killed [[V_ADD_CO_U32_e64_19]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE46:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_18]], %subreg.sub0, [[V_ADDC_U32_e64_18]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_CMP_GE_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_I32_e64 [[V_ADD_U32_e64_15]], [[S_AND_B32_]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_GE_I32_e64_]], [[PHI15]], implicit-def dead $scc
+  ; CHECK-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.22
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.22.Flow88:
+  ; CHECK-NEXT:   successors: %bb.12(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI34:%[0-9]+]]:vgpr_32 = PHI [[PHI30]], %bb.21
+  ; CHECK-NEXT:   [[PHI35:%[0-9]+]]:vgpr_32 = PHI [[PHI31]], %bb.21
+  ; CHECK-NEXT:   [[PHI36:%[0-9]+]]:vgpr_32 = PHI [[PHI32]], %bb.21
+  ; CHECK-NEXT:   [[PHI37:%[0-9]+]]:vgpr_32 = PHI [[PHI33]], %bb.21
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF_BREAK]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.23.Flow93:
+  ; CHECK-NEXT:   successors: %bb.51(0x40000000), %bb.52(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI38:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.4, %165, %bb.45
+  ; CHECK-NEXT:   [[PHI39:%[0-9]+]]:vreg_64_align2 = PHI [[PHI2]], %bb.4, %166, %bb.45
+  ; CHECK-NEXT:   [[PHI40:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.4, %167, %bb.45
+  ; CHECK-NEXT:   [[SI_IF7:%[0-9]+]]:sreg_64 = SI_IF [[PHI40]], %bb.52, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.51
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.24.._crit_edge.i:
+  ; CHECK-NEXT:   successors: %bb.25(0x50000000), %bb.33(0x30000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_30:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT:   S_CMP_LT_I32 [[S_SUB_I32_]], killed [[S_MOV_B32_30]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.33, implicit $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.25
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.25 (%ir-block.266):
+  ; CHECK-NEXT:   successors: %bb.26(0x40000000), %bb.34(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_7:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[PHI10]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY162:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_31]], implicit $exec
+  ; CHECK-NEXT:   [[COPY163:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_31]], implicit $exec
+  ; CHECK-NEXT:   [[COPY164:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_31]], implicit $exec
+  ; CHECK-NEXT:   [[COPY165:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_31]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF8:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_7]], %bb.34, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.26
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.26 (%ir-block.271):
+  ; CHECK-NEXT:   successors: %bb.27(0x40000000), %bb.32(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_11:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[PHI10]], [[S_LOAD_DWORD_IMM4]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_9:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_11]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE47:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_11]], %subreg.sub0, [[V_ASHRREV_I32_e64_9]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_32:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_9:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_32]], killed [[REG_SEQUENCE47]], implicit $exec
+  ; CHECK-NEXT:   [[COPY166:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY167:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_9]].sub0
+  ; CHECK-NEXT:   [[COPY168:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY169:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_9]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_20:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_21:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY166]], [[COPY167]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY170:%[0-9]+]]:vgpr_32 = COPY [[COPY168]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_20:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_21:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY170]], [[COPY169]], killed [[V_ADD_CO_U32_e64_21]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE48:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_20]], %subreg.sub0, [[V_ADDC_U32_e64_20]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD21:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE48]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.274, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[S_MOV_B32_33:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], killed [[S_MOV_B32_33]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_8:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_OR_B32_e64_]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY171:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_34]], implicit $exec
+  ; CHECK-NEXT:   [[COPY172:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_34]], implicit $exec
+  ; CHECK-NEXT:   [[COPY173:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_34]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF9:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_8]], %bb.32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.27
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.27 (%ir-block.281):
+  ; CHECK-NEXT:   successors: %bb.28(0x40000000), %bb.31(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_12:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_]], [[S_LOAD_DWORD_IMM4]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_10:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_12]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE49:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_12]], %subreg.sub0, [[V_ASHRREV_I32_e64_10]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_10:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_32]], killed [[REG_SEQUENCE49]], implicit $exec
+  ; CHECK-NEXT:   [[COPY174:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY175:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_10]].sub0
+  ; CHECK-NEXT:   [[COPY176:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY177:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_10]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_22:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_23:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY174]], [[COPY175]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY178:%[0-9]+]]:vgpr_32 = COPY [[COPY176]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_22:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_23:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY178]], [[COPY177]], killed [[V_ADD_CO_U32_e64_23]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE50:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_22]], %subreg.sub0, [[V_ADDC_U32_e64_22]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD22:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE50]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.284, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], [[S_MOV_B32_32]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_9:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_OR_B32_e64_1]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY179:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_35]], implicit $exec
+  ; CHECK-NEXT:   [[COPY180:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_35]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF10:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_9]], %bb.31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.28
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.28 (%ir-block.291):
+  ; CHECK-NEXT:   successors: %bb.29(0x40000000), %bb.30(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_13:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_1]], [[S_LOAD_DWORD_IMM4]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_11:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_13]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE51:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_13]], %subreg.sub0, [[V_ASHRREV_I32_e64_11]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_36:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_11:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_36]], killed [[REG_SEQUENCE51]], implicit $exec
+  ; CHECK-NEXT:   [[COPY181:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY182:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_11]].sub0
+  ; CHECK-NEXT:   [[COPY183:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY184:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_11]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_24:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_25:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY181]], [[COPY182]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY185:%[0-9]+]]:vgpr_32 = COPY [[COPY183]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_24:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_25:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY185]], [[COPY184]], killed [[V_ADD_CO_U32_e64_25]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE52:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_24]], %subreg.sub0, [[V_ADDC_U32_e64_24]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD23:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE52]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.294, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[S_MOV_B32_37:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; CHECK-NEXT:   [[V_OR_B32_e64_2:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], killed [[S_MOV_B32_37]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_10:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_OR_B32_e64_2]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF11:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_10]], %bb.30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.29
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.29 (%ir-block.301):
+  ; CHECK-NEXT:   successors: %bb.30(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_14:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_2]], [[S_LOAD_DWORD_IMM4]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_12:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_14]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE53:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_14]], %subreg.sub0, [[V_ASHRREV_I32_e64_12]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_12:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_36]], killed [[REG_SEQUENCE53]], implicit $exec
+  ; CHECK-NEXT:   [[COPY186:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub0
+  ; CHECK-NEXT:   [[COPY187:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_12]].sub0
+  ; CHECK-NEXT:   [[COPY188:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE26]].sub1
+  ; CHECK-NEXT:   [[COPY189:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_12]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_26:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_27:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY186]], [[COPY187]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY190:%[0-9]+]]:vgpr_32 = COPY [[COPY188]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_26:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_27:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY190]], [[COPY189]], killed [[V_ADD_CO_U32_e64_27]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE54:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_26]], %subreg.sub0, [[V_ADDC_U32_e64_26]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD24:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE54]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.304, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.30.Flow81:
+  ; CHECK-NEXT:   successors: %bb.31(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI41:%[0-9]+]]:vgpr_32 = PHI [[V_MOV_B32_e32_1]], %bb.28, [[GLOBAL_LOAD_DWORD24]], %bb.29
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF11]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.31.Flow82:
+  ; CHECK-NEXT:   successors: %bb.32(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI42:%[0-9]+]]:vgpr_32 = PHI [[COPY179]], %bb.27, [[GLOBAL_LOAD_DWORD23]], %bb.30
+  ; CHECK-NEXT:   [[PHI43:%[0-9]+]]:vgpr_32 = PHI [[COPY180]], %bb.27, [[PHI41]], %bb.30
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF10]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.32.Flow83:
+  ; CHECK-NEXT:   successors: %bb.34(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI44:%[0-9]+]]:vgpr_32 = PHI [[COPY171]], %bb.26, [[GLOBAL_LOAD_DWORD22]], %bb.31
+  ; CHECK-NEXT:   [[PHI45:%[0-9]+]]:vgpr_32 = PHI [[COPY172]], %bb.26, [[PHI42]], %bb.31
+  ; CHECK-NEXT:   [[PHI46:%[0-9]+]]:vgpr_32 = PHI [[COPY173]], %bb.26, [[PHI43]], %bb.31
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF9]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.34
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.33.Flow84:
+  ; CHECK-NEXT:   successors: %bb.43(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI47:%[0-9]+]]:vgpr_32 = PHI [[PHI11]], %bb.24, %157, %bb.42
+  ; CHECK-NEXT:   [[PHI48:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.24, %158, %bb.42
+  ; CHECK-NEXT:   [[PHI49:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.24, %159, %bb.42
+  ; CHECK-NEXT:   [[PHI50:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.24, %160, %bb.42
+  ; CHECK-NEXT:   S_BRANCH %bb.43
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.34 (%ir-block.316):
+  ; CHECK-NEXT:   successors: %bb.35(0x40000000), %bb.42(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI51:%[0-9]+]]:vgpr_32 = PHI [[COPY162]], %bb.25, [[PHI46]], %bb.32
+  ; CHECK-NEXT:   [[PHI52:%[0-9]+]]:vgpr_32 = PHI [[COPY163]], %bb.25, [[PHI45]], %bb.32
+  ; CHECK-NEXT:   [[PHI53:%[0-9]+]]:vgpr_32 = PHI [[COPY164]], %bb.25, [[PHI44]], %bb.32
+  ; CHECK-NEXT:   [[PHI54:%[0-9]+]]:vgpr_32 = PHI [[COPY165]], %bb.25, [[GLOBAL_LOAD_DWORD21]], %bb.32
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF8]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_11:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF12:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_11]], %bb.42, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.35
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.35 (%ir-block.321):
+  ; CHECK-NEXT:   successors: %bb.36(0x40000000), %bb.41(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY191:%[0-9]+]]:sreg_64_xexec = COPY [[V_CMP_LT_I32_e64_7]]
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_15:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[PHI10]], [[S_LOAD_DWORD_IMM3]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_38:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY192:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_38]]
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY192]], 0, killed [[V_MUL_LO_U32_e64_15]], [[COPY191]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 killed [[V_CNDMASK_B32_e64_]], [[V_ADD_U32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_13:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_17]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE55:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_17]], %subreg.sub0, [[V_ASHRREV_I32_e64_13]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_MOV_B32_39:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_13:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_39]], killed [[REG_SEQUENCE55]], implicit $exec
+  ; CHECK-NEXT:   [[COPY193:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY194:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_13]].sub0
+  ; CHECK-NEXT:   [[COPY195:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY196:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_13]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_28:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_29:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY193]], [[COPY194]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY197:%[0-9]+]]:vgpr_32 = COPY [[COPY195]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_28:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_29:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY197]], [[COPY196]], killed [[V_ADD_CO_U32_e64_29]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE56:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_28]], %subreg.sub0, [[V_ADDC_U32_e64_28]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD25:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.326, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_16:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD25]], 0, [[PHI11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_40:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; CHECK-NEXT:   [[V_OR_B32_e64_3:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], killed [[S_MOV_B32_40]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_16:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_3]], [[S_LOAD_DWORD_IMM3]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_12:%[0-9]+]]:sreg_64_xexec = V_CMP_LT_I32_e64 [[V_OR_B32_e64_3]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[COPY198:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_38]]
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY198]], 0, killed [[V_MUL_LO_U32_e64_16]], killed [[V_CMP_LT_I32_e64_12]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 killed [[V_CNDMASK_B32_e64_1]], [[V_ADD_U32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_14:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_18]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE57:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_18]], %subreg.sub0, [[V_ASHRREV_I32_e64_14]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_14:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_39]], killed [[REG_SEQUENCE57]], implicit $exec
+  ; CHECK-NEXT:   [[COPY199:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY200:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_14]].sub0
+  ; CHECK-NEXT:   [[COPY201:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY202:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_14]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_30:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_31:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY199]], [[COPY200]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY203:%[0-9]+]]:vgpr_32 = COPY [[COPY201]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_30:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_31:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY203]], [[COPY202]], killed [[V_ADD_CO_U32_e64_31]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE58:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_30]], %subreg.sub0, [[V_ADDC_U32_e64_30]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD26:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.336, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_17:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD26]], 0, [[V_FMAC_F32_e64_16]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_OR_B32_e64_4:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], [[S_MOV_B32_39]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_17:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_4]], [[S_LOAD_DWORD_IMM3]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_13:%[0-9]+]]:sreg_64_xexec = V_CMP_LT_I32_e64 [[V_OR_B32_e64_4]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[COPY204:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_38]]
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY204]], 0, killed [[V_MUL_LO_U32_e64_17]], killed [[V_CMP_LT_I32_e64_13]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 killed [[V_CNDMASK_B32_e64_2]], [[V_ADD_U32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_15:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_19]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE59:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_19]], %subreg.sub0, [[V_ASHRREV_I32_e64_15]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_15:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_39]], killed [[REG_SEQUENCE59]], implicit $exec
+  ; CHECK-NEXT:   [[COPY205:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY206:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_15]].sub0
+  ; CHECK-NEXT:   [[COPY207:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY208:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_15]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_32:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_33:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY205]], [[COPY206]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY209:%[0-9]+]]:vgpr_32 = COPY [[COPY207]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_32:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_33:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY209]], [[COPY208]], killed [[V_ADD_CO_U32_e64_33]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE60:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_32]], %subreg.sub0, [[V_ADDC_U32_e64_32]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD27:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.346, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_18:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD27]], 0, [[V_FMAC_F32_e64_17]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_41:%[0-9]+]]:sreg_32 = S_MOV_B32 3
+  ; CHECK-NEXT:   [[V_OR_B32_e64_5:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI10]], killed [[S_MOV_B32_41]], implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_18:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_OR_B32_e64_5]], [[S_LOAD_DWORD_IMM3]], implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_14:%[0-9]+]]:sreg_64_xexec = V_CMP_LT_I32_e64 [[V_OR_B32_e64_5]], [[COPY91]], implicit $exec
+  ; CHECK-NEXT:   [[COPY210:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_38]]
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY210]], 0, killed [[V_MUL_LO_U32_e64_18]], killed [[V_CMP_LT_I32_e64_14]], implicit $exec
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 killed [[V_CNDMASK_B32_e64_3]], [[V_ADD_U32_e64_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_16:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_20]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE61:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_20]], %subreg.sub0, [[V_ASHRREV_I32_e64_16]], %subreg.sub1
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_16:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 [[S_MOV_B32_39]], killed [[REG_SEQUENCE61]], implicit $exec
+  ; CHECK-NEXT:   [[COPY211:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub0
+  ; CHECK-NEXT:   [[COPY212:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_16]].sub0
+  ; CHECK-NEXT:   [[COPY213:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE23]].sub1
+  ; CHECK-NEXT:   [[COPY214:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_16]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_34:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_35:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY211]], [[COPY212]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY215:%[0-9]+]]:vgpr_32 = COPY [[COPY213]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_34:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_35:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY215]], [[COPY214]], killed [[V_ADD_CO_U32_e64_35]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE62:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_34]], %subreg.sub0, [[V_ADDC_U32_e64_34]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD28:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.356, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_19:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD28]], 0, [[V_FMAC_F32_e64_18]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_42:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_42]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_15:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_21]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF13:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_15]], %bb.41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.36
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.36 (%ir-block.365):
+  ; CHECK-NEXT:   successors: %bb.37(0x40000000), %bb.40(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD29:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.366, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_20:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD29]], 0, [[PHI12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD30:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.370, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_21:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD30]], 0, [[V_FMAC_F32_e64_20]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD31:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.374, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_22:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD31]], 0, [[V_FMAC_F32_e64_21]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD32:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.378, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_23:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD32]], 0, [[V_FMAC_F32_e64_22]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_43:%[0-9]+]]:sreg_32 = S_MOV_B32 128
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_43]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_16:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_22]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF14:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_16]], %bb.40, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.37
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.37 (%ir-block.387):
+  ; CHECK-NEXT:   successors: %bb.38(0x40000000), %bb.39(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD33:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.388, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_24:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD33]], 0, [[PHI13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD34:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.392, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_25:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD34]], 0, [[V_FMAC_F32_e64_24]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD35:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.396, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_26:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD35]], 0, [[V_FMAC_F32_e64_25]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD36:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.400, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_27:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD36]], 0, [[V_FMAC_F32_e64_26]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_44:%[0-9]+]]:sreg_32 = S_MOV_B32 192
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_44]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_17:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_23]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[SI_IF15:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_17]], %bb.39, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.38
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.38 (%ir-block.409):
+  ; CHECK-NEXT:   successors: %bb.39(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD37:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.410, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_28:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD37]], 0, [[PHI14]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD38:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.414, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_29:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD38]], 0, [[V_FMAC_F32_e64_28]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD39:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.418, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_30:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD39]], 0, [[V_FMAC_F32_e64_29]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD40:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.422, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_31:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD40]], 0, [[V_FMAC_F32_e64_30]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.39.Flow77:
+  ; CHECK-NEXT:   successors: %bb.40(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI55:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.37, [[V_FMAC_F32_e64_31]], %bb.38
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF15]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.40.Flow78:
+  ; CHECK-NEXT:   successors: %bb.41(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI56:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.36, [[V_FMAC_F32_e64_27]], %bb.39
+  ; CHECK-NEXT:   [[PHI57:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.36, [[PHI55]], %bb.39
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF14]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.41.Flow79:
+  ; CHECK-NEXT:   successors: %bb.42(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI58:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.35, [[V_FMAC_F32_e64_23]], %bb.40
+  ; CHECK-NEXT:   [[PHI59:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.35, [[PHI56]], %bb.40
+  ; CHECK-NEXT:   [[PHI60:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.35, [[PHI57]], %bb.40
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF13]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.42.Flow80:
+  ; CHECK-NEXT:   successors: %bb.33(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI61:%[0-9]+]]:vgpr_32 = PHI [[PHI11]], %bb.34, [[V_FMAC_F32_e64_19]], %bb.41
+  ; CHECK-NEXT:   [[PHI62:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.34, [[PHI58]], %bb.41
+  ; CHECK-NEXT:   [[PHI63:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.34, [[PHI59]], %bb.41
+  ; CHECK-NEXT:   [[PHI64:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.34, [[PHI60]], %bb.41
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF12]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.33
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.43 (%ir-block.436):
+  ; CHECK-NEXT:   successors: %bb.44(0x40000000), %bb.45(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_45:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_3:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_45]], [[V_BFE_U32_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[S_MOV_B32_46:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_ADD_LSHL_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_LSHL_U32_e64 killed [[V_LSHLREV_B32_e64_3]], [[V_AND_B32_e64_]], [[S_MOV_B32_46]], implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_ADD_LSHL_U32_e64_]], [[PHI47]], 0, 0, implicit $exec :: (store (s32) into %ir.439, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_ADD_LSHL_U32_e64_]], [[PHI48]], 256, 0, implicit $exec :: (store (s32) into %ir.440, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_ADD_LSHL_U32_e64_]], [[PHI49]], 512, 0, implicit $exec :: (store (s32) into %ir.441, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_ADD_LSHL_U32_e64_]], [[PHI50]], 768, 0, implicit $exec :: (store (s32) into %ir.442, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   ATOMIC_FENCE 5, 3
+  ; CHECK-NEXT:   S_BARRIER
+  ; CHECK-NEXT:   ATOMIC_FENCE 4, 3
+  ; CHECK-NEXT:   [[S_MOV_B32_47:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+  ; CHECK-NEXT:   [[V_CMP_LT_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 [[V_MAD_U32_U24_e64_]], killed [[S_MOV_B32_47]], implicit $exec
+  ; CHECK-NEXT:   [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY216:%[0-9]+]]:vgpr_32 = COPY [[DEF7]]
+  ; CHECK-NEXT:   [[COPY217:%[0-9]+]]:vreg_64_align2 = COPY [[DEF6]]
+  ; CHECK-NEXT:   [[SI_IF16:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_1]], %bb.45, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.44
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.44..preheader.i:
+  ; CHECK-NEXT:   successors: %bb.46(0x40000000), %bb.50(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[V_LSHLREV_B32_e64_4:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[S_MOV_B32_46]], [[V_MAD_U32_U24_e64_]], implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 0, 0, implicit $exec :: (load (s32) from %ir.447, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 1024, 0, implicit $exec :: (load (s32) from %ir.448, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_]], 0, killed [[DS_READ_B32_gfx9_1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 2048, 0, implicit $exec :: (load (s32) from %ir.451, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_2]], 0, killed [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 3072, 0, implicit $exec :: (load (s32) from %ir.454, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_3]], 0, killed [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 4096, 0, implicit $exec :: (load (s32) from %ir.457, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_4]], 0, killed [[V_ADD_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_5:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 5120, 0, implicit $exec :: (load (s32) from %ir.460, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_5]], 0, killed [[V_ADD_F32_e64_3]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_6:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 6144, 0, implicit $exec :: (load (s32) from %ir.463, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_6]], 0, killed [[V_ADD_F32_e64_4]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_7:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 7168, 0, implicit $exec :: (load (s32) from %ir.466, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_7]], 0, killed [[V_ADD_F32_e64_5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_8:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 8192, 0, implicit $exec :: (load (s32) from %ir.469, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_7:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_8]], 0, killed [[V_ADD_F32_e64_6]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_9:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 9216, 0, implicit $exec :: (load (s32) from %ir.472, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_8:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_9]], 0, killed [[V_ADD_F32_e64_7]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_10:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 10240, 0, implicit $exec :: (load (s32) from %ir.475, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_9:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_10]], 0, killed [[V_ADD_F32_e64_8]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_11:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 11264, 0, implicit $exec :: (load (s32) from %ir.478, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_10:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_11]], 0, killed [[V_ADD_F32_e64_9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_12:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 12288, 0, implicit $exec :: (load (s32) from %ir.481, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_11:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_12]], 0, killed [[V_ADD_F32_e64_10]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_13:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 13312, 0, implicit $exec :: (load (s32) from %ir.484, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_12:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_13]], 0, killed [[V_ADD_F32_e64_11]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_14:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 14336, 0, implicit $exec :: (load (s32) from %ir.487, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_13:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_14]], 0, killed [[V_ADD_F32_e64_12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DS_READ_B32_gfx9_15:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_LSHLREV_B32_e64_4]], 15360, 0, implicit $exec :: (load (s32) from %ir.490, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_F32_e64_14:%[0-9]+]]:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed [[DS_READ_B32_gfx9_15]], 0, killed [[V_ADD_F32_e64_13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   DS_WRITE_B32_gfx9 [[V_LSHLREV_B32_e64_4]], [[V_ADD_F32_e64_14]], 0, 0, implicit $exec :: (store (s32) into %ir.447, !tbaa !13, addrspace 3)
+  ; CHECK-NEXT:   [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MAD_U32_U24_e64_]], [[S_LSHL_B32_]], 0, implicit $exec
+  ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_18:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_24]], [[COPY28]], implicit $exec
+  ; CHECK-NEXT:   [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY218:%[0-9]+]]:vgpr_32 = COPY [[DEF9]]
+  ; CHECK-NEXT:   [[COPY219:%[0-9]+]]:vreg_64_align2 = COPY [[DEF8]]
+  ; CHECK-NEXT:   [[SI_IF17:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_18]], %bb.50, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.46
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.45.Flow94:
+  ; CHECK-NEXT:   successors: %bb.23(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI65:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.43, %674, %bb.50
+  ; CHECK-NEXT:   [[PHI66:%[0-9]+]]:vgpr_32 = PHI [[COPY216]], %bb.43, %173, %bb.50
+  ; CHECK-NEXT:   [[PHI67:%[0-9]+]]:vreg_64_align2 = PHI [[COPY217]], %bb.43, %174, %bb.50
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF16]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.23
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.46 (%ir-block.501):
+  ; CHECK-NEXT:   successors: %bb.47(0x50000000), %bb.49(0x30000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[COPY220:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_48]]
+  ; CHECK-NEXT:   [[V_CMP_EQ_F32_e64_3:%[0-9]+]]:sreg_64 = nofpexcept V_CMP_EQ_F32_e64 0, [[S_LOAD_DWORD_IMM1]], 0, [[COPY220]], 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_MUL_F32_e64 0, [[V_ADD_F32_e64_14]], 0, [[COPY23]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_MUL_LO_U32_e64_19:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[V_ADD_U32_e64_24]], [[S_LOAD_DWORD_IMM2]], implicit $exec
+  ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_17:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_MUL_LO_U32_e64_19]], implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE63:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_MUL_LO_U32_e64_19]], %subreg.sub0, [[V_ASHRREV_I32_e64_17]], %subreg.sub1
+  ; CHECK-NEXT:   [[S_AND_B64_6:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[V_CMP_EQ_F32_e64_3]], implicit-def dead $scc
+  ; CHECK-NEXT:   $vcc = COPY [[S_AND_B64_6]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.49, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.47
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.47 (%ir-block.506):
+  ; CHECK-NEXT:   successors: %bb.49(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_49:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_17:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 killed [[S_MOV_B32_49]], [[REG_SEQUENCE63]], implicit $exec
+  ; CHECK-NEXT:   [[COPY221:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub0
+  ; CHECK-NEXT:   [[COPY222:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_17]].sub0
+  ; CHECK-NEXT:   [[COPY223:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub1
+  ; CHECK-NEXT:   [[COPY224:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_17]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_36:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_37:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY221]], [[COPY222]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY225:%[0-9]+]]:vgpr_32 = COPY [[COPY223]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_36:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_37:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY225]], [[COPY224]], killed [[V_ADD_CO_U32_e64_37]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE64:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_36]], %subreg.sub0, [[V_ADDC_U32_e64_36]], %subreg.sub1
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD41:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[REG_SEQUENCE64]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.507, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_32:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, killed [[GLOBAL_LOAD_DWORD41]], 0, [[S_LOAD_DWORD_IMM1]], 0, [[V_MUL_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.49
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.48.Flow:
+  ; CHECK-NEXT:   successors: %bb.9(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI68:%[0-9]+]]:vgpr_32 = PHI [[COPY53]], %bb.7, [[V_MUL_F32_e64_]], %bb.8
+  ; CHECK-NEXT:   [[COPY226:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE15]]
+  ; CHECK-NEXT:   [[COPY227:%[0-9]+]]:sreg_64 = COPY $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.49.Flow76:
+  ; CHECK-NEXT:   successors: %bb.50(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI69:%[0-9]+]]:vgpr_32 = PHI [[V_MUL_F32_e64_1]], %bb.46, [[V_FMAC_F32_e64_32]], %bb.47
+  ; CHECK-NEXT:   [[COPY228:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE63]]
+  ; CHECK-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[PHI]], $exec, implicit-def $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.50.Flow95:
+  ; CHECK-NEXT:   successors: %bb.45(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI70:%[0-9]+]]:sreg_64 = PHI [[PHI]], %bb.44, [[S_OR_B64_]], %bb.49
+  ; CHECK-NEXT:   [[PHI71:%[0-9]+]]:vgpr_32 = PHI [[COPY218]], %bb.44, [[PHI69]], %bb.49
+  ; CHECK-NEXT:   [[PHI72:%[0-9]+]]:vreg_64_align2 = PHI [[COPY219]], %bb.44, [[COPY228]], %bb.49
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF17]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[S_ANDN2_B64_:%[0-9]+]]:sreg_64 = S_ANDN2_B64 [[PHI]], $exec, implicit-def $scc
+  ; CHECK-NEXT:   [[S_AND_B64_7:%[0-9]+]]:sreg_64 = S_AND_B64 [[PHI70]], $exec, implicit-def $scc
+  ; CHECK-NEXT:   [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_ANDN2_B64_]], [[S_AND_B64_7]], implicit-def $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.45
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.51..sink.split.i:
+  ; CHECK-NEXT:   successors: %bb.52(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_50:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+  ; CHECK-NEXT:   [[V_LSHLREV_B64_e64_18:%[0-9]+]]:vreg_64_align2 = V_LSHLREV_B64_e64 killed [[S_MOV_B32_50]], [[PHI39]], implicit $exec
+  ; CHECK-NEXT:   [[COPY229:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub0
+  ; CHECK-NEXT:   [[COPY230:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_18]].sub0
+  ; CHECK-NEXT:   [[COPY231:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY [[REG_SEQUENCE14]].sub1
+  ; CHECK-NEXT:   [[COPY232:%[0-9]+]]:vgpr_32 = COPY [[V_LSHLREV_B64_e64_18]].sub1
+  ; CHECK-NEXT:   [[V_ADD_CO_U32_e64_38:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_39:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY229]], [[COPY230]], 0, implicit $exec
+  ; CHECK-NEXT:   [[COPY233:%[0-9]+]]:vgpr_32 = COPY [[COPY231]]
+  ; CHECK-NEXT:   [[V_ADDC_U32_e64_38:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_39:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY233]], [[COPY232]], killed [[V_ADD_CO_U32_e64_39]], 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE65:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_38]], %subreg.sub0, [[V_ADDC_U32_e64_38]], %subreg.sub1
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORD killed [[REG_SEQUENCE65]], [[PHI38]], 0, 0, implicit $exec :: (store (s32) into %ir.516, !tbaa !13, addrspace 1)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.52.Flow96:
+  ; CHECK-NEXT:   successors: %bb.53(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   SI_END_CF [[SI_IF7]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.53.Flow97:
+  ; CHECK-NEXT:   successors: %bb.54(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.54._Z25rocblas_gemvn_kernel_calcILi64ELi16EiffLi0EEviiT3_PKT2_T1_S3_iS0_PS1_i.exit:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0 (%ir-block.18):
+    successors: %bb.1(0x40000000), %bb.54(0x40000000)
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8
+
+    %181:sgpr_32 = COPY $sgpr8
+    %180:sgpr_32 = COPY $sgpr7
+    %179:sgpr_32 = COPY $sgpr6
+    %178:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %176:vgpr_32(s32) = COPY $vgpr0
+    %183:sreg_32 = S_MOV_B32 0
+    %184:sreg_32 = S_MOV_B32 136
+    %185:sreg_64 = REG_SEQUENCE killed %184, %subreg.sub0, %183, %subreg.sub1
+    %682:sreg_32 = COPY %178.sub0(p4)
+    %683:sreg_32 = COPY %178.sub1(p4)
+    %684:sreg_32 = COPY %185.sub0
+    %685:sreg_32 = COPY %185.sub1
+    %680:sreg_32 = S_ADD_U32 %682, %684, implicit-def $scc
+    %681:sreg_32 = S_ADDC_U32 %683, %685, implicit-def $scc, implicit $scc
+    %186:sreg_64 = REG_SEQUENCE %680, %subreg.sub0, %681, %subreg.sub1
+    %187:sreg_64_xexec = S_LOAD_DWORDX2_IMM %178(p4), 136, 0 :: (invariant load (s64) from %ir.20, align 4, addrspace 4)
+    %188:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %178(p4), 144, 0 :: (invariant load (s32) from %ir.20 + 8, addrspace 4)
+    %189:sreg_32 = COPY %187.sub1
+    %190:sreg_32 = COPY %187.sub0
+    S_CMP_LT_U32 %179, killed %190, implicit-def $scc
+    %191:sreg_32 = S_MOV_B32 18
+    %192:sreg_32 = S_MOV_B32 12
+    %193:sreg_32 = S_CSELECT_B32 killed %192, killed %191, implicit $scc
+    %194:sreg_64 = REG_SEQUENCE killed %193, %subreg.sub0, %183, %subreg.sub1
+    %688:sreg_32 = COPY %186.sub0
+    %689:sreg_32 = COPY %186.sub1
+    %690:sreg_32 = COPY %194.sub0
+    %691:sreg_32 = COPY %194.sub1
+    %686:sreg_32 = S_ADD_U32 %688, %690, implicit-def $scc
+    %687:sreg_32 = S_ADDC_U32 %689, %691, implicit-def $scc, implicit $scc
+    %195:sreg_64 = REG_SEQUENCE %686, %subreg.sub0, %687, %subreg.sub1
+    %196:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %197:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed %195, %196, 0, 0, implicit $exec :: (invariant load (s16) from %ir.27, !tbaa !10, addrspace 4)
+    S_CMP_LT_U32 %180, killed %189, implicit-def $scc
+    %198:sreg_32 = S_MOV_B32 20
+    %199:sreg_32 = S_MOV_B32 14
+    %200:sreg_32 = S_CSELECT_B32 killed %199, killed %198, implicit $scc
+    %201:sreg_64 = REG_SEQUENCE killed %200, %subreg.sub0, %183, %subreg.sub1
+    %694:sreg_32 = COPY %186.sub0
+    %695:sreg_32 = COPY %186.sub1
+    %696:sreg_32 = COPY %201.sub0
+    %697:sreg_32 = COPY %201.sub1
+    %692:sreg_32 = S_ADD_U32 %694, %696, implicit-def $scc
+    %693:sreg_32 = S_ADDC_U32 %695, %697, implicit-def $scc, implicit $scc
+    %202:sreg_64 = REG_SEQUENCE %692, %subreg.sub0, %693, %subreg.sub1
+    %203:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed %202, %196, 0, 0, implicit $exec :: (invariant load (s16) from %ir.33, !tbaa !10, addrspace 4)
+    %631:vgpr_32 = nuw V_MUL_LO_U32_e64 %203, %197, implicit $exec
+    S_CMP_LT_U32 %181, killed %188, implicit-def $scc
+    %207:sreg_32 = S_MOV_B32 22
+    %208:sreg_32 = S_MOV_B32 16
+    %209:sreg_32 = S_CSELECT_B32 killed %208, killed %207, implicit $scc
+    %210:sreg_64 = REG_SEQUENCE killed %209, %subreg.sub0, %183, %subreg.sub1
+    %700:sreg_32 = COPY %186.sub0
+    %701:sreg_32 = COPY %186.sub1
+    %702:sreg_32 = COPY %210.sub0
+    %703:sreg_32 = COPY %210.sub1
+    %698:sreg_32 = S_ADD_U32 %700, %702, implicit-def $scc
+    %699:sreg_32 = S_ADDC_U32 %701, %703, implicit-def $scc, implicit $scc
+    %211:sreg_64 = REG_SEQUENCE %698, %subreg.sub0, %699, %subreg.sub1
+    %212:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed %211, %196, 0, 0, implicit $exec :: (invariant load (s16) from %ir.40, !tbaa !10, addrspace 4)
+    %632:vgpr_32 = V_MUL_LO_U32_e64 killed %631, %212, implicit $exec
+    %215:sreg_32 = S_MOV_B32 1024
+    %633:sreg_64_xexec = V_CMP_NE_U32_e64 killed %632, killed %215, implicit $exec
+    $vcc = S_AND_B64 $exec, %633, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.54, implicit $vcc
+    S_BRANCH %bb.1
+
+  bb.1 (%ir-block.44):
+    successors: %bb.2(0x40000000), %bb.53(0x40000000)
+
+    %216:sgpr_128 = S_LOAD_DWORDX4_IMM %178(p4), 0, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset65, addrspace 4)
+    %5:sgpr_32 = COPY %216.sub2
+    %217:sreg_32 = COPY %216.sub1
+    %218:sreg_32 = COPY %216.sub0
+    %219:sgpr_96 = REG_SEQUENCE killed %218, %subreg.sub0, killed %217, %subreg.sub1, %5, %subreg.sub2
+    %4:sgpr_96 = COPY %219
+    %220:sgpr_32 = S_LOAD_DWORD_IMM %178(p4), 88, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset55, align 8, addrspace 4)
+    %221:sgpr_32 = S_MOV_B32 0
+    %223:vgpr_32 = COPY killed %221
+    %222:sreg_64 = contract nofpexcept V_CMP_EQ_F32_e64 0, %5, 0, %223, 0, implicit $mode, implicit $exec
+    %224:sgpr_32 = S_MOV_B32 1065353216
+    %226:vgpr_32 = COPY killed %224
+    %225:sreg_64 = contract nofpexcept V_CMP_EQ_F32_e64 0, %220, 0, %226, 0, implicit $mode, implicit $exec
+    %227:sreg_64 = S_AND_B64 killed %222, killed %225, implicit-def dead $scc
+    %228:sreg_64 = S_AND_B64 $exec, killed %227, implicit-def dead $scc
+    $vcc = COPY %228
+    S_CBRANCH_VCCNZ %bb.53, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2 (%ir-block.49):
+    successors: %bb.3(0x50000000), %bb.4(0x30000000)
+
+    %7:sreg_32 = COPY %4.sub0
+    %233:sgpr_128 = S_LOAD_DWORDX4_IMM %178(p4), 104, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset57, align 8, addrspace 4)
+    %234:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %178(p4), 120, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset61, align 8, addrspace 4)
+    %235:sreg_64_xexec = S_LOAD_DWORDX2_IMM %178(p4), 128, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset63, align 16, addrspace 4)
+    %236:sreg_32 = COPY %233.sub1
+    %237:sreg_32 = COPY %233.sub0
+    %238:sreg_64 = REG_SEQUENCE killed %237, %subreg.sub0, killed %236, %subreg.sub1
+    %239:sreg_32 = COPY %233.sub3
+    %240:sreg_32 = COPY %233.sub2
+    %241:sreg_64 = REG_SEQUENCE killed %240, %subreg.sub0, killed %239, %subreg.sub1
+    %242:sreg_32 = COPY %235.sub1
+    %243:sreg_32_xm0 = S_ASHR_I32 %180, 31, implicit-def dead $scc
+    %245:sreg_64 = REG_SEQUENCE %180, %subreg.sub0, %243, %subreg.sub1
+    %246:sreg_32 = COPY %245.sub1
+    %9:sreg_64 = COPY %245
+    %247:sreg_32 = S_MUL_I32 %180, killed %242
+    %248:sreg_32 = COPY %235.sub0
+    %249:sreg_32 = S_MUL_HI_U32 %180, %248
+    %250:sreg_32 = S_ADD_I32 killed %249, killed %247, implicit-def dead $scc
+    %251:sreg_32 = S_MUL_I32 killed %246, %248
+    %252:sreg_32 = S_ADD_I32 killed %250, killed %251, implicit-def dead $scc
+    %253:sreg_32 = S_MUL_I32 %180, %248
+    %254:sreg_64 = REG_SEQUENCE killed %253, %subreg.sub0, killed %252, %subreg.sub1
+    %255:sreg_32 = S_MOV_B32 2
+    %256:sreg_64 = S_LSHL_B64 killed %254, %255, implicit-def dead $scc
+    %706:sreg_32 = COPY %238.sub0
+    %707:sreg_32 = COPY %238.sub1
+    %708:sreg_32 = COPY %256.sub0
+    %709:sreg_32 = COPY %256.sub1
+    %704:sreg_32 = S_ADD_U32 %706, %708, implicit-def $scc
+    %705:sreg_32 = S_ADDC_U32 %707, %709, implicit-def $scc, implicit $scc
+    %257:sreg_64 = REG_SEQUENCE %704, %subreg.sub0, %705, %subreg.sub1
+    %258:sreg_64 = S_LSHL_B64 killed %241, %255, implicit-def dead $scc
+    %712:sreg_32 = COPY %257.sub0
+    %713:sreg_32 = COPY %257.sub1
+    %714:sreg_32 = COPY %258.sub0
+    %715:sreg_32 = COPY %258.sub1
+    %710:sreg_32 = S_ADD_U32 %712, %714, implicit-def $scc
+    %711:sreg_32 = S_ADDC_U32 %713, %715, implicit-def $scc, implicit $scc
+    %10:sreg_64 = REG_SEQUENCE %710, %subreg.sub0, %711, %subreg.sub1
+    %259:sreg_32 = S_MOV_B32 1023
+    %11:vgpr_32 = V_AND_B32_e64 %176(s32), killed %259, implicit $exec
+    %12:vgpr_32 = V_BFE_U32_e64 %176(s32), 10, 10, implicit $exec
+    %13:vgpr_32 = nuw nsw V_MAD_U32_U24_e64 %12, %197, %11, 0, implicit $exec
+    %260:sgpr_32 = S_MOV_B32 0
+    %262:vgpr_32 = COPY killed %260
+    %261:sreg_64 = nofpexcept V_CMP_NEQ_F32_e64 0, %5, 0, %262, 0, implicit $mode, implicit $exec
+    %232:sreg_64 = S_MOV_B64 -1
+    %231:sreg_64 = S_MOV_B64 0
+    %230:sreg_64 = IMPLICIT_DEF
+    %229:sgpr_32 = IMPLICIT_DEF
+    %263:sreg_64 = S_AND_B64 $exec, killed %261, implicit-def dead $scc
+    $vcc = COPY %263
+    %634:vgpr_32 = COPY %229
+    %635:vreg_64_align2 = COPY %230
+    S_CBRANCH_VCCNZ %bb.4, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3 (%ir-block.61):
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %267:sreg_32 = S_MOV_B32 256
+    %268:sreg_64 = V_CMP_LT_U32_e64 %13, killed %267, implicit $exec
+    %266:sreg_64 = S_MOV_B64 0
+    %265:sreg_64 = IMPLICIT_DEF
+    %264:sgpr_32 = IMPLICIT_DEF
+    %637:vgpr_32 = COPY %264
+    %638:vreg_64_align2 = COPY %265
+    %14:sreg_64 = SI_IF killed %268, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.4.Flow90:
+    successors: %bb.10(0x40000000), %bb.23(0x40000000)
+
+    %17:sreg_64 = PHI %231, %bb.2, %23, %bb.6
+    %15:vgpr_32 = PHI %634, %bb.2, %21, %bb.6
+    %16:vreg_64_align2 = PHI %635, %bb.2, %22, %bb.6
+    %18:sreg_64 = PHI %232, %bb.2, %266, %bb.6
+    %289:sreg_64 = S_AND_B64 $exec, %18, implicit-def dead $scc
+    $vcc = COPY %289
+    S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+    S_BRANCH %bb.23
+
+  bb.5 (%ir-block.70):
+    successors: %bb.7(0x40000000), %bb.9(0x40000000)
+
+    %272:sreg_32 = S_MOV_B32 8
+    %273:vgpr_32 = COPY killed %272
+    %19:vgpr_32 = nuw V_LSHL_ADD_U32_e64 %179, %273, %13, implicit $exec
+    %274:sreg_64 = V_CMP_LT_I32_e64 %19, %7, implicit $exec
+    %271:sreg_64 = S_MOV_B64 0
+    %270:sreg_64 = IMPLICIT_DEF
+    %269:sgpr_32 = IMPLICIT_DEF
+    %640:vgpr_32 = COPY %269
+    %641:vreg_64_align2 = COPY %270
+    %20:sreg_64 = SI_IF killed %274, %bb.9, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.7
+
+  bb.6.Flow91:
+    successors: %bb.4(0x80000000)
+
+    %23:sreg_64 = PHI %266, %bb.3, %669, %bb.9
+    %21:vgpr_32 = PHI %637, %bb.3, %26, %bb.9
+    %22:vreg_64_align2 = PHI %638, %bb.3, %27, %bb.9
+    SI_END_CF %14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.7 (%ir-block.80):
+    successors: %bb.8(0x50000000), %bb.48(0x30000000)
+
+    %275:sgpr_32 = S_MOV_B32 0
+    %277:vgpr_32 = COPY %275
+    %276:sreg_64 = nofpexcept V_CMP_EQ_F32_e64 0, %220, 0, %277, 0, implicit $mode, implicit $exec
+    %278:vgpr_32 = nsw V_MUL_LO_U32_e64 %19, %234, implicit $exec
+    %279:vgpr_32 = V_ASHRREV_I32_e64 31, %278, implicit $exec
+    %281:vreg_64_align2 = REG_SEQUENCE %278, %subreg.sub0, %279, %subreg.sub1
+    %24:vreg_64_align2 = COPY %281
+    %282:sreg_64 = S_AND_B64 $exec, killed %276, implicit-def dead $scc
+    $vcc = COPY %282
+    %664:vgpr_32 = COPY %275, implicit $exec
+    S_CBRANCH_VCCNZ %bb.48, implicit $vcc
+    S_BRANCH %bb.8
+
+  bb.8 (%ir-block.84):
+    successors: %bb.48(0x80000000)
+
+    %283:sreg_32 = S_MOV_B32 2
+    %284:vreg_64_align2 = V_LSHLREV_B64_e64 killed %283, %281, implicit $exec
+    %720:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub0
+    %721:vgpr_32 = COPY %284.sub0
+    %722:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub1
+    %723:vgpr_32 = COPY %284.sub1
+    %716:vgpr_32, %718:sreg_64_xexec = V_ADD_CO_U32_e64 %720, %721, 0, implicit $exec
+    %724:vgpr_32 = COPY %722
+    %717:vgpr_32, dead %719:sreg_64_xexec = V_ADDC_U32_e64 %724, %723, killed %718, 0, implicit $exec
+    %285:vreg_64_align2 = REG_SEQUENCE %716, %subreg.sub0, %717, %subreg.sub1
+    %286:vgpr_32 = GLOBAL_LOAD_DWORD killed %285, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.85, !tbaa !13, addrspace 1)
+    %25:vgpr_32 = contract nofpexcept V_MUL_F32_e64 0, killed %286, 0, %220, 0, 0, implicit $mode, implicit $exec
+    S_BRANCH %bb.48
+
+  bb.9.Flow92:
+    successors: %bb.6(0x80000000)
+
+    %28:sreg_64 = PHI %271, %bb.5, %672, %bb.48
+    %26:vgpr_32 = PHI %640, %bb.5, %171, %bb.48
+    %27:vreg_64_align2 = PHI %641, %bb.5, %24, %bb.48
+    SI_END_CF %20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %670:sreg_64 = S_AND_B64 %28, $exec, implicit-def $scc
+    %669:sreg_64 = COPY %670
+    S_BRANCH %bb.6
+
+  bb.10 (%ir-block.91):
+    successors: %bb.11(0x40000000), %bb.12(0x40000000)
+
+    %291:sgpr_128 = S_LOAD_DWORDX4_IMM %178(p4), 24, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset39, align 8, addrspace 4)
+    %292:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %178(p4), 40, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset43, align 8, addrspace 4)
+    %293:sgpr_128 = S_LOAD_DWORDX4_IMM %178(p4), 48, 0 :: (dereferenceable invariant load (s128) from %ir..kernarg.offset45, addrspace 4)
+    %294:sreg_64_xexec = S_LOAD_DWORDX2_IMM %178(p4), 64, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset45 + 16, align 16, addrspace 4)
+    %295:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %178(p4), 72, 0 :: (dereferenceable invariant load (s32) from %ir..kernarg.offset51, align 8, addrspace 4)
+    %296:sreg_64_xexec = S_LOAD_DWORDX2_IMM %178(p4), 80, 0 :: (dereferenceable invariant load (s64) from %ir..kernarg.offset53, align 16, addrspace 4)
+    %297:sreg_32 = COPY %291.sub1
+    %298:sreg_32 = COPY %291.sub0
+    %299:sreg_64 = REG_SEQUENCE killed %298, %subreg.sub0, killed %297, %subreg.sub1
+    %300:sreg_32 = COPY %291.sub3
+    %301:sreg_32 = COPY %291.sub2
+    %302:sreg_64 = REG_SEQUENCE killed %301, %subreg.sub0, killed %300, %subreg.sub1
+    %303:sreg_32 = COPY %293.sub0
+    %304:sreg_32 = COPY %293.sub1
+    %305:sreg_32 = COPY %294.sub1
+    %306:sreg_32 = COPY %294.sub0
+    %307:sreg_32 = COPY %293.sub3
+    %308:sreg_32 = COPY %293.sub2
+    %309:sreg_64 = REG_SEQUENCE killed %308, %subreg.sub0, killed %307, %subreg.sub1
+    %29:sreg_64 = COPY %309
+    %310:sreg_64 = REG_SEQUENCE killed %306, %subreg.sub0, killed %305, %subreg.sub1
+    %311:sreg_32 = COPY %296.sub1
+    %312:sreg_32 = COPY %9.sub0
+    %313:sreg_32 = S_MUL_I32 %312, killed %304
+    %314:sreg_32 = S_MUL_HI_U32 %312, %303
+    %315:sreg_32 = S_ADD_I32 killed %314, killed %313, implicit-def dead $scc
+    %316:sreg_32 = COPY %9.sub1
+    %317:sreg_32 = S_MUL_I32 %316, %303
+    %318:sreg_32 = S_ADD_I32 killed %315, killed %317, implicit-def dead $scc
+    %319:sreg_32 = S_MUL_I32 %312, %303
+    %320:sreg_64 = REG_SEQUENCE killed %319, %subreg.sub0, killed %318, %subreg.sub1
+    %321:sreg_32 = S_MOV_B32 2
+    %322:sreg_64 = S_LSHL_B64 killed %320, %321, implicit-def dead $scc
+    %727:sreg_32 = COPY %299.sub0
+    %728:sreg_32 = COPY %299.sub1
+    %729:sreg_32 = COPY %322.sub0
+    %730:sreg_32 = COPY %322.sub1
+    %725:sreg_32 = S_ADD_U32 %727, %729, implicit-def $scc
+    %726:sreg_32 = S_ADDC_U32 %728, %730, implicit-def $scc, implicit $scc
+    %323:sreg_64 = REG_SEQUENCE %725, %subreg.sub0, %726, %subreg.sub1
+    %324:sreg_64 = S_LSHL_B64 killed %302, %321, implicit-def dead $scc
+    %733:sreg_32 = COPY %323.sub0
+    %734:sreg_32 = COPY %323.sub1
+    %735:sreg_32 = COPY %324.sub0
+    %736:sreg_32 = COPY %324.sub1
+    %731:sreg_32 = S_ADD_U32 %733, %735, implicit-def $scc
+    %732:sreg_32 = S_ADDC_U32 %734, %736, implicit-def $scc, implicit $scc
+    %31:sreg_64 = REG_SEQUENCE %731, %subreg.sub0, %732, %subreg.sub1
+    %325:sreg_32 = S_MUL_I32 %312, killed %311
+    %326:sreg_32 = COPY %296.sub0
+    %327:sreg_32 = S_MUL_HI_U32 %312, %326
+    %328:sreg_32 = S_ADD_I32 killed %327, killed %325, implicit-def dead $scc
+    %329:sreg_32 = S_MUL_I32 %316, %326
+    %330:sreg_32 = S_ADD_I32 killed %328, killed %329, implicit-def dead $scc
+    %331:sreg_32 = S_MUL_I32 %312, %326
+    %332:sreg_64 = REG_SEQUENCE killed %331, %subreg.sub0, killed %330, %subreg.sub1
+    %333:sreg_64 = S_LSHL_B64 %332, %321, implicit-def dead $scc
+    %739:sreg_32 = COPY %309.sub0
+    %740:sreg_32 = COPY %309.sub1
+    %741:sreg_32 = COPY %333.sub0
+    %742:sreg_32 = COPY %333.sub1
+    %737:sreg_32 = S_ADD_U32 %739, %741, implicit-def $scc
+    %738:sreg_32 = S_ADDC_U32 %740, %742, implicit-def $scc, implicit $scc
+    %334:sreg_64 = REG_SEQUENCE %737, %subreg.sub0, %738, %subreg.sub1
+    %335:sreg_64 = S_LSHL_B64 %310, %321, implicit-def dead $scc
+    %745:sreg_32 = COPY %334.sub0
+    %746:sreg_32 = COPY %334.sub1
+    %747:sreg_32 = COPY %335.sub0
+    %748:sreg_32 = COPY %335.sub1
+    %743:sreg_32 = S_ADD_U32 %745, %747, implicit-def $scc
+    %744:sreg_32 = S_ADDC_U32 %746, %748, implicit-def $scc, implicit $scc
+    %33:sreg_64 = REG_SEQUENCE %743, %subreg.sub0, %744, %subreg.sub1
+    %34:sreg_32 = COPY %4.sub1
+    %336:sreg_32 = S_MOV_B32 8
+    %37:sreg_32 = S_LSHL_B32 %179, killed %336, implicit-def dead $scc
+    %38:vgpr_32 = V_ADD_U32_e64 %37, %11, 0, implicit $exec
+    %337:sreg_32 = S_MOV_B32 31
+    %338:sreg_32 = S_ASHR_I32 %34, killed %337, implicit-def dead $scc
+    %339:sreg_32 = S_MOV_B32 26
+    %340:sreg_32 = S_LSHR_B32 killed %338, killed %339, implicit-def dead $scc
+    %341:sreg_32 = S_ADD_I32 %34, killed %340, implicit-def dead $scc
+    %342:sreg_32 = S_MOV_B32 -64
+    %41:sreg_32 = S_AND_B32 killed %341, killed %342, implicit-def dead $scc
+    %39:sreg_32 = S_SUB_I32 %34, %41, implicit-def dead $scc
+    %40:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 %321, %12, implicit $exec
+    %343:sreg_64 = V_CMP_LT_I32_e64 %40, %41, implicit $exec
+    %290:sgpr_32 = S_MOV_B32 0
+    %644:vgpr_32 = COPY %290, implicit $exec
+    %645:vgpr_32 = COPY %290, implicit $exec
+    %646:vgpr_32 = COPY %290, implicit $exec
+    %647:vgpr_32 = COPY %290, implicit $exec
+    %932:sreg_64 = V_CMP_LT_I32_e64 %38, %7, implicit $exec
+    %42:sreg_64 = SI_IF killed %343, %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.11
+
+  bb.11..lr.ph.i:
+    successors: %bb.13(0x80000000)
+
+    %347:sreg_64 = V_CMP_LT_I32_e64 %38, %7, implicit $exec
+    %43:sreg_64 = COPY %347
+    %348:sreg_32 = S_MOV_B32 64
+    %349:vgpr_32 = nsw V_ADD_U32_e64 %38, %348, 0, implicit $exec
+    %350:sreg_64 = V_CMP_LT_I32_e64 killed %349, %7, implicit $exec
+    %44:sreg_64 = COPY %350
+    %351:sreg_32 = S_MOV_B32 128
+    %352:vgpr_32 = nsw V_ADD_U32_e64 %38, killed %351, 0, implicit $exec
+    %353:sreg_64 = V_CMP_LT_I32_e64 killed %352, %7, implicit $exec
+    %45:sreg_64 = COPY %353
+    %354:sreg_32 = S_MOV_B32 192
+    %355:vgpr_32 = nsw V_ADD_U32_e64 %38, killed %354, 0, implicit $exec
+    %356:sreg_64 = V_CMP_LT_I32_e64 killed %355, %7, implicit $exec
+    %46:sreg_64 = COPY %356
+    %357:vgpr_32 = V_MUL_LO_U32_e64 %292, %40, implicit $exec
+    %47:vgpr_32 = V_ADD_U32_e64 killed %357, %292, 0, implicit $exec
+    %358:sreg_32 = S_MOV_B32 6
+    %48:sreg_32 = S_LSHL_B32 %292, %358, implicit-def dead $scc
+    %360:vgpr_32 = nuw nsw V_ADD_U32_e64 %40, %321, 0, implicit $exec
+    %49:vgpr_32 = V_MUL_LO_U32_e64 %292, %360, implicit $exec
+    %361:sreg_32 = S_MOV_B32 3
+    %362:vgpr_32 = nuw nsw V_ADD_U32_e64 %40, killed %361, 0, implicit $exec
+    %50:vgpr_32 = V_MUL_LO_U32_e64 %292, %362, implicit $exec
+    %363:vgpr_32 = V_MUL_LO_U32_e64 %12, %292, implicit $exec
+    %51:vgpr_32 = V_LSHLREV_B32_e64 %321, killed %363, implicit $exec
+    %364:vgpr_32 = V_MUL_LO_U32_e64 %295, %40, implicit $exec
+    %52:vgpr_32 = V_ADD_U32_e64 killed %364, %295, 0, implicit $exec
+    %53:sreg_32 = S_LSHL_B32 %295, %358, implicit-def dead $scc
+    %54:vgpr_32 = V_MUL_LO_U32_e64 %295, %360, implicit $exec
+    %55:vgpr_32 = V_MUL_LO_U32_e64 %295, %362, implicit $exec
+    %751:sreg_32 = COPY %333.sub0
+    %752:sreg_32 = COPY %333.sub1
+    %753:sreg_32 = COPY %335.sub0
+    %754:sreg_32 = COPY %335.sub1
+    %749:sreg_32 = S_ADD_U32 %751, %753, implicit-def $scc
+    %750:sreg_32 = S_ADDC_U32 %752, %754, implicit-def $scc, implicit $scc
+    %367:sreg_64 = REG_SEQUENCE %749, %subreg.sub0, %750, %subreg.sub1
+    %368:vgpr_32 = V_MUL_LO_U32_e64 %12, %295, implicit $exec
+    %369:vgpr_32 = V_LSHLREV_B32_e64 %321, killed %368, implicit $exec
+    %370:vgpr_32 = V_ASHRREV_I32_e64 31, %369, implicit $exec
+    %372:vreg_64_align2 = REG_SEQUENCE %369, %subreg.sub0, %370, %subreg.sub1
+    %373:vreg_64_align2 = nsw V_LSHLREV_B64_e64 %321, killed %372, implicit $exec
+    %757:sreg_32 = COPY %29.sub0
+    %758:sreg_32 = COPY %29.sub1
+    %759:sreg_32 = COPY %367.sub0
+    %760:sreg_32 = COPY %367.sub1
+    %755:sreg_32 = S_ADD_U32 %757, %759, implicit-def $scc
+    %756:sreg_32 = S_ADDC_U32 %758, %760, implicit-def $scc, implicit $scc
+    %374:sreg_64 = REG_SEQUENCE %755, %subreg.sub0, %756, %subreg.sub1
+    %765:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %374.sub0
+    %766:vgpr_32 = COPY %373.sub0
+    %767:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %374.sub1
+    %768:vgpr_32 = COPY %373.sub1
+    %761:vgpr_32, %763:sreg_64_xexec = V_ADD_CO_U32_e64 %765, %766, 0, implicit $exec
+    %769:vgpr_32 = COPY %767
+    %762:vgpr_32, dead %764:sreg_64_xexec = V_ADDC_U32_e64 %769, %768, killed %763, 0, implicit $exec
+    %56:vreg_64_align2 = REG_SEQUENCE %761, %subreg.sub0, %762, %subreg.sub1
+    %375:sreg_32_xm0 = S_ASHR_I32 %53, 31, implicit-def dead $scc
+    %377:sreg_64 = REG_SEQUENCE %53, %subreg.sub0, %375, %subreg.sub1
+    %57:sreg_64 = nsw S_LSHL_B64 killed %377, %321, implicit-def dead $scc
+    %346:sgpr_32 = S_MOV_B32 0
+    %344:sreg_64 = S_MOV_B64 0
+    %648:vgpr_32 = COPY %346, implicit $exec
+    %649:vgpr_32 = COPY %346, implicit $exec
+    %650:vgpr_32 = COPY %346, implicit $exec
+    %651:vgpr_32 = COPY %346, implicit $exec
+    %378:sreg_64 = COPY %43
+    %425:sreg_64 = COPY %44
+    %433:sreg_64 = COPY %45
+    %441:sreg_64 = COPY %46
+    S_BRANCH %bb.13
+
+  bb.12.Flow89:
+    successors: %bb.24(0x80000000)
+
+    %58:vgpr_32 = PHI %40, %bb.10, %98, %bb.22
+    %59:vgpr_32 = PHI %644, %bb.10, %106, %bb.22
+    %60:vgpr_32 = PHI %645, %bb.10, %105, %bb.22
+    %61:vgpr_32 = PHI %646, %bb.10, %104, %bb.22
+    %62:vgpr_32 = PHI %647, %bb.10, %103, %bb.22
+    SI_END_CF %42, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.24
+
+  bb.13 (%ir-block.146):
+    successors: %bb.14(0x40000000), %bb.21(0x40000000)
+
+    %63:sreg_64 = PHI %344, %bb.11, %102, %bb.21
+    %64:vreg_64_align2 = PHI %56, %bb.11, %101, %bb.21
+    %65:sreg_32 = PHI %346, %bb.11, %100, %bb.21
+    %66:vgpr_32 = PHI %38, %bb.11, %99, %bb.21
+    %67:vgpr_32 = PHI %40, %bb.11, %98, %bb.21
+    %68:vgpr_32 = PHI %648, %bb.11, %97, %bb.21
+    %69:vgpr_32 = PHI %649, %bb.11, %96, %bb.21
+    %70:vgpr_32 = PHI %650, %bb.11, %95, %bb.21
+    %71:vgpr_32 = PHI %651, %bb.11, %94, %bb.21
+    %72:sreg_64 = SI_IF %378, %bb.21, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.14
+
+  bb.14 (%ir-block.150):
+    successors: %bb.15(0x40000000), %bb.20(0x40000000)
+
+    %73:vgpr_32 = GLOBAL_LOAD_DWORD %64, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.lsr.iv33, !tbaa !13, addrspace 1)
+    %379:vgpr_32 = V_ADD_U32_e64 %52, %65, 0, implicit $exec
+    %380:vgpr_32 = V_ASHRREV_I32_e64 31, %379, implicit $exec
+    %382:vreg_64_align2 = REG_SEQUENCE %379, %subreg.sub0, %380, %subreg.sub1
+    %384:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %382, implicit $exec
+    %774:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %775:vgpr_32 = COPY %384.sub0
+    %776:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %777:vgpr_32 = COPY %384.sub1
+    %770:vgpr_32, %772:sreg_64_xexec = V_ADD_CO_U32_e64 %774, %775, 0, implicit $exec
+    %778:vgpr_32 = COPY %776
+    %771:vgpr_32, dead %773:sreg_64_xexec = V_ADDC_U32_e64 %778, %777, killed %772, 0, implicit $exec
+    %385:vreg_64_align2 = REG_SEQUENCE %770, %subreg.sub0, %771, %subreg.sub1
+    %74:vgpr_32 = GLOBAL_LOAD_DWORD killed %385, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.154, !tbaa !13, addrspace 1)
+    %386:vgpr_32 = V_ADD_U32_e64 %54, %65, 0, implicit $exec
+    %387:vgpr_32 = V_ASHRREV_I32_e64 31, %386, implicit $exec
+    %389:vreg_64_align2 = REG_SEQUENCE %386, %subreg.sub0, %387, %subreg.sub1
+    %390:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %389, implicit $exec
+    %783:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %784:vgpr_32 = COPY %390.sub0
+    %785:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %786:vgpr_32 = COPY %390.sub1
+    %779:vgpr_32, %781:sreg_64_xexec = V_ADD_CO_U32_e64 %783, %784, 0, implicit $exec
+    %787:vgpr_32 = COPY %785
+    %780:vgpr_32, dead %782:sreg_64_xexec = V_ADDC_U32_e64 %787, %786, killed %781, 0, implicit $exec
+    %391:vreg_64_align2 = REG_SEQUENCE %779, %subreg.sub0, %780, %subreg.sub1
+    %75:vgpr_32 = GLOBAL_LOAD_DWORD killed %391, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.158, !tbaa !13, addrspace 1)
+    %392:vgpr_32 = V_ADD_U32_e64 %55, %65, 0, implicit $exec
+    %393:vgpr_32 = V_ASHRREV_I32_e64 31, %392, implicit $exec
+    %395:vreg_64_align2 = REG_SEQUENCE %392, %subreg.sub0, %393, %subreg.sub1
+    %396:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %395, implicit $exec
+    %792:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %793:vgpr_32 = COPY %396.sub0
+    %794:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %795:vgpr_32 = COPY %396.sub1
+    %788:vgpr_32, %790:sreg_64_xexec = V_ADD_CO_U32_e64 %792, %793, 0, implicit $exec
+    %796:vgpr_32 = COPY %794
+    %789:vgpr_32, dead %791:sreg_64_xexec = V_ADDC_U32_e64 %796, %795, killed %790, 0, implicit $exec
+    %397:vreg_64_align2 = REG_SEQUENCE %788, %subreg.sub0, %789, %subreg.sub1
+    %76:vgpr_32 = GLOBAL_LOAD_DWORD killed %397, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.162, !tbaa !13, addrspace 1)
+    %398:vgpr_32 = V_ADD_U32_e64 %51, %66, 0, implicit $exec
+    %399:vgpr_32 = V_ASHRREV_I32_e64 31, %398, implicit $exec
+    %401:vreg_64_align2 = REG_SEQUENCE %398, %subreg.sub0, %399, %subreg.sub1
+    %402:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %401, implicit $exec
+    %801:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %802:vgpr_32 = COPY %402.sub0
+    %803:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %804:vgpr_32 = COPY %402.sub1
+    %797:vgpr_32, %799:sreg_64_xexec = V_ADD_CO_U32_e64 %801, %802, 0, implicit $exec
+    %805:vgpr_32 = COPY %803
+    %798:vgpr_32, dead %800:sreg_64_xexec = V_ADDC_U32_e64 %805, %804, killed %799, 0, implicit $exec
+    %77:vreg_64_align2 = REG_SEQUENCE %797, %subreg.sub0, %798, %subreg.sub1
+    %403:vgpr_32 = GLOBAL_LOAD_DWORD %77, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.166, !tbaa !13, addrspace 1)
+    %404:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %73, 0, killed %403, 0, %68, 0, 0, implicit $mode, implicit $exec
+    %405:vgpr_32 = V_ADD_U32_e64 %47, %66, 0, implicit $exec
+    %406:vgpr_32 = V_ASHRREV_I32_e64 31, %405, implicit $exec
+    %408:vreg_64_align2 = REG_SEQUENCE %405, %subreg.sub0, %406, %subreg.sub1
+    %409:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %408, implicit $exec
+    %810:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %811:vgpr_32 = COPY %409.sub0
+    %812:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %813:vgpr_32 = COPY %409.sub1
+    %806:vgpr_32, %808:sreg_64_xexec = V_ADD_CO_U32_e64 %810, %811, 0, implicit $exec
+    %814:vgpr_32 = COPY %812
+    %807:vgpr_32, dead %809:sreg_64_xexec = V_ADDC_U32_e64 %814, %813, killed %808, 0, implicit $exec
+    %78:vreg_64_align2 = REG_SEQUENCE %806, %subreg.sub0, %807, %subreg.sub1
+    %410:vgpr_32 = GLOBAL_LOAD_DWORD %78, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.172, !tbaa !13, addrspace 1)
+    %411:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %74, 0, killed %410, 0, %404, 0, 0, implicit $mode, implicit $exec
+    %412:vgpr_32 = V_ADD_U32_e64 %49, %66, 0, implicit $exec
+    %413:vgpr_32 = V_ASHRREV_I32_e64 31, %412, implicit $exec
+    %415:vreg_64_align2 = REG_SEQUENCE %412, %subreg.sub0, %413, %subreg.sub1
+    %416:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %415, implicit $exec
+    %819:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %820:vgpr_32 = COPY %416.sub0
+    %821:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %822:vgpr_32 = COPY %416.sub1
+    %815:vgpr_32, %817:sreg_64_xexec = V_ADD_CO_U32_e64 %819, %820, 0, implicit $exec
+    %823:vgpr_32 = COPY %821
+    %816:vgpr_32, dead %818:sreg_64_xexec = V_ADDC_U32_e64 %823, %822, killed %817, 0, implicit $exec
+    %79:vreg_64_align2 = REG_SEQUENCE %815, %subreg.sub0, %816, %subreg.sub1
+    %417:vgpr_32 = GLOBAL_LOAD_DWORD %79, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.178, !tbaa !13, addrspace 1)
+    %418:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %75, 0, killed %417, 0, %411, 0, 0, implicit $mode, implicit $exec
+    %419:vgpr_32 = V_ADD_U32_e64 %50, %66, 0, implicit $exec
+    %420:vgpr_32 = V_ASHRREV_I32_e64 31, %419, implicit $exec
+    %422:vreg_64_align2 = REG_SEQUENCE %419, %subreg.sub0, %420, %subreg.sub1
+    %423:vreg_64_align2 = V_LSHLREV_B64_e64 %321, killed %422, implicit $exec
+    %828:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %829:vgpr_32 = COPY %423.sub0
+    %830:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %831:vgpr_32 = COPY %423.sub1
+    %824:vgpr_32, %826:sreg_64_xexec = V_ADD_CO_U32_e64 %828, %829, 0, implicit $exec
+    %832:vgpr_32 = COPY %830
+    %825:vgpr_32, dead %827:sreg_64_xexec = V_ADDC_U32_e64 %832, %831, killed %826, 0, implicit $exec
+    %80:vreg_64_align2 = REG_SEQUENCE %824, %subreg.sub0, %825, %subreg.sub1
+    %424:vgpr_32 = GLOBAL_LOAD_DWORD %80, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.184, !tbaa !13, addrspace 1)
+    %81:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %76, 0, killed %424, 0, %418, 0, 0, implicit $mode, implicit $exec
+    %82:sreg_64 = SI_IF %425, %bb.20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.15
+
+  bb.15 (%ir-block.191):
+    successors: %bb.16(0x40000000), %bb.19(0x40000000)
+
+    %426:vgpr_32 = GLOBAL_LOAD_DWORD %77, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.192, !tbaa !13, addrspace 1)
+    %427:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %73, 0, killed %426, 0, %69, 0, 0, implicit $mode, implicit $exec
+    %428:vgpr_32 = GLOBAL_LOAD_DWORD %78, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.196, !tbaa !13, addrspace 1)
+    %429:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %74, 0, killed %428, 0, %427, 0, 0, implicit $mode, implicit $exec
+    %430:vgpr_32 = GLOBAL_LOAD_DWORD %79, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.200, !tbaa !13, addrspace 1)
+    %431:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %75, 0, killed %430, 0, %429, 0, 0, implicit $mode, implicit $exec
+    %432:vgpr_32 = GLOBAL_LOAD_DWORD %80, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.204, !tbaa !13, addrspace 1)
+    %83:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %76, 0, killed %432, 0, %431, 0, 0, implicit $mode, implicit $exec
+    %84:sreg_64 = SI_IF %433, %bb.19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.16
+
+  bb.16 (%ir-block.211):
+    successors: %bb.17(0x40000000), %bb.18(0x40000000)
+
+    %434:vgpr_32 = GLOBAL_LOAD_DWORD %77, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.212, !tbaa !13, addrspace 1)
+    %435:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %73, 0, killed %434, 0, %70, 0, 0, implicit $mode, implicit $exec
+    %436:vgpr_32 = GLOBAL_LOAD_DWORD %78, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.216, !tbaa !13, addrspace 1)
+    %437:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %74, 0, killed %436, 0, %435, 0, 0, implicit $mode, implicit $exec
+    %438:vgpr_32 = GLOBAL_LOAD_DWORD %79, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.220, !tbaa !13, addrspace 1)
+    %439:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %75, 0, killed %438, 0, %437, 0, 0, implicit $mode, implicit $exec
+    %440:vgpr_32 = GLOBAL_LOAD_DWORD %80, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.224, !tbaa !13, addrspace 1)
+    %85:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %76, 0, killed %440, 0, %439, 0, 0, implicit $mode, implicit $exec
+    %86:sreg_64 = SI_IF %441, %bb.18, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.17
+
+  bb.17 (%ir-block.231):
+    successors: %bb.18(0x80000000)
+
+    %442:vgpr_32 = GLOBAL_LOAD_DWORD %77, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.232, !tbaa !13, addrspace 1)
+    %443:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %73, 0, killed %442, 0, %71, 0, 0, implicit $mode, implicit $exec
+    %444:vgpr_32 = GLOBAL_LOAD_DWORD %78, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.236, !tbaa !13, addrspace 1)
+    %445:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %74, 0, killed %444, 0, %443, 0, 0, implicit $mode, implicit $exec
+    %446:vgpr_32 = GLOBAL_LOAD_DWORD %79, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.240, !tbaa !13, addrspace 1)
+    %447:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %75, 0, killed %446, 0, %445, 0, 0, implicit $mode, implicit $exec
+    %448:vgpr_32 = GLOBAL_LOAD_DWORD %80, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.244, !tbaa !13, addrspace 1)
+    %87:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %76, 0, killed %448, 0, %447, 0, 0, implicit $mode, implicit $exec
+
+  bb.18.Flow85:
+    successors: %bb.19(0x80000000)
+
+    %88:vgpr_32 = PHI %71, %bb.16, %87, %bb.17
+    SI_END_CF %86, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.19.Flow86:
+    successors: %bb.20(0x80000000)
+
+    %89:vgpr_32 = PHI %70, %bb.15, %85, %bb.18
+    %90:vgpr_32 = PHI %71, %bb.15, %88, %bb.18
+    SI_END_CF %84, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.20.Flow87:
+    successors: %bb.21(0x80000000)
+
+    %91:vgpr_32 = PHI %69, %bb.14, %83, %bb.19
+    %92:vgpr_32 = PHI %70, %bb.14, %89, %bb.19
+    %93:vgpr_32 = PHI %71, %bb.14, %90, %bb.19
+    SI_END_CF %82, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.21 (%ir-block.254):
+    successors: %bb.22(0x04000000), %bb.13(0x7c000000)
+
+    %94:vgpr_32 = PHI %71, %bb.13, %93, %bb.20
+    %95:vgpr_32 = PHI %70, %bb.13, %92, %bb.20
+    %96:vgpr_32 = PHI %69, %bb.13, %91, %bb.20
+    %97:vgpr_32 = PHI %68, %bb.13, %81, %bb.20
+    SI_END_CF %72, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %98:vgpr_32 = nuw nsw V_ADD_U32_e64 %67, %348, 0, implicit $exec
+    %99:vgpr_32 = V_ADD_U32_e64 %66, %48, 0, implicit $exec
+    %100:sreg_32 = S_ADD_I32 %65, %53, implicit-def dead $scc
+    %837:vgpr_32 = COPY %64.sub0
+    %838:sreg_32_xm0 = COPY %57.sub0
+    %839:vgpr_32 = COPY %64.sub1
+    %840:sreg_32_xm0 = COPY %57.sub1
+    %833:vgpr_32, %835:sreg_64_xexec = V_ADD_CO_U32_e64 %837, %838, 0, implicit $exec
+    %841:vgpr_32 = COPY %840
+    %834:vgpr_32, dead %836:sreg_64_xexec = V_ADDC_U32_e64 %839, %841, killed %835, 0, implicit $exec
+    %101:vreg_64_align2 = REG_SEQUENCE %833, %subreg.sub0, %834, %subreg.sub1
+    %450:sreg_64 = V_CMP_GE_I32_e64 %98, %41, implicit $exec
+    %102:sreg_64 = SI_IF_BREAK killed %450, %63, implicit-def dead $scc
+    SI_LOOP %102, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.22
+
+  bb.22.Flow88:
+    successors: %bb.12(0x80000000)
+
+    %103:vgpr_32 = PHI %94, %bb.21
+    %104:vgpr_32 = PHI %95, %bb.21
+    %105:vgpr_32 = PHI %96, %bb.21
+    %106:vgpr_32 = PHI %97, %bb.21
+    SI_END_CF %102, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.12
+
+  bb.23.Flow93:
+    successors: %bb.51(0x40000000), %bb.52(0x40000000)
+
+    %109:vgpr_32 = PHI %15, %bb.4, %165, %bb.45
+    %110:vreg_64_align2 = PHI %16, %bb.4, %166, %bb.45
+    %111:sreg_64 = PHI %17, %bb.4, %167, %bb.45
+    %627:sreg_64 = COPY %111
+    %112:sreg_64 = SI_IF %627, %bb.52, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.51
+
+  bb.24.._crit_edge.i:
+    successors: %bb.25(0x50000000), %bb.33(0x30000000)
+
+    %451:sreg_32 = S_MOV_B32 1
+    S_CMP_LT_I32 %39, killed %451, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.33, implicit $scc
+    S_BRANCH %bb.25
+
+  bb.25 (%ir-block.266):
+    successors: %bb.26(0x40000000), %bb.34(0x40000000)
+
+    %453:sreg_64 = V_CMP_LT_I32_e64 %58, %34, implicit $exec
+    %113:sreg_64_xexec = COPY %453
+    %452:sgpr_32 = S_MOV_B32 0
+    %658:vgpr_32 = COPY %452, implicit $exec
+    %659:vgpr_32 = COPY %452, implicit $exec
+    %660:vgpr_32 = COPY %452, implicit $exec
+    %661:vgpr_32 = COPY %452, implicit $exec
+    %114:sreg_64 = SI_IF %453, %bb.34, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.26
+
+  bb.26 (%ir-block.271):
+    successors: %bb.27(0x40000000), %bb.32(0x40000000)
+
+    %455:vgpr_32 = nsw V_MUL_LO_U32_e64 %58, %295, implicit $exec
+    %456:vgpr_32 = V_ASHRREV_I32_e64 31, %455, implicit $exec
+    %458:vreg_64_align2 = REG_SEQUENCE %455, %subreg.sub0, %456, %subreg.sub1
+    %459:sreg_32 = S_MOV_B32 2
+    %460:vreg_64_align2 = V_LSHLREV_B64_e64 %459, killed %458, implicit $exec
+    %846:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %847:vgpr_32 = COPY %460.sub0
+    %848:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %849:vgpr_32 = COPY %460.sub1
+    %842:vgpr_32, %844:sreg_64_xexec = V_ADD_CO_U32_e64 %846, %847, 0, implicit $exec
+    %850:vgpr_32 = COPY %848
+    %843:vgpr_32, dead %845:sreg_64_xexec = V_ADDC_U32_e64 %850, %849, killed %844, 0, implicit $exec
+    %461:vreg_64_align2 = REG_SEQUENCE %842, %subreg.sub0, %843, %subreg.sub1
+    %115:vgpr_32 = GLOBAL_LOAD_DWORD killed %461, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.274, !tbaa !13, addrspace 1)
+    %462:sreg_32 = S_MOV_B32 1
+    %116:vgpr_32 = V_OR_B32_e64 %58, killed %462, implicit $exec
+    %463:sreg_64 = V_CMP_LT_I32_e64 %116, %34, implicit $exec
+    %454:sgpr_32 = S_MOV_B32 0
+    %655:vgpr_32 = COPY %454, implicit $exec
+    %656:vgpr_32 = COPY %454, implicit $exec
+    %657:vgpr_32 = COPY %454, implicit $exec
+    %117:sreg_64 = SI_IF killed %463, %bb.32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.27
+
+  bb.27 (%ir-block.281):
+    successors: %bb.28(0x40000000), %bb.31(0x40000000)
+
+    %465:vgpr_32 = nsw V_MUL_LO_U32_e64 %116, %295, implicit $exec
+    %466:vgpr_32 = V_ASHRREV_I32_e64 31, %465, implicit $exec
+    %468:vreg_64_align2 = REG_SEQUENCE %465, %subreg.sub0, %466, %subreg.sub1
+    %470:vreg_64_align2 = V_LSHLREV_B64_e64 %459, killed %468, implicit $exec
+    %855:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %856:vgpr_32 = COPY %470.sub0
+    %857:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %858:vgpr_32 = COPY %470.sub1
+    %851:vgpr_32, %853:sreg_64_xexec = V_ADD_CO_U32_e64 %855, %856, 0, implicit $exec
+    %859:vgpr_32 = COPY %857
+    %852:vgpr_32, dead %854:sreg_64_xexec = V_ADDC_U32_e64 %859, %858, killed %853, 0, implicit $exec
+    %471:vreg_64_align2 = REG_SEQUENCE %851, %subreg.sub0, %852, %subreg.sub1
+    %118:vgpr_32 = GLOBAL_LOAD_DWORD killed %471, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.284, !tbaa !13, addrspace 1)
+    %119:vgpr_32 = V_OR_B32_e64 %58, %459, implicit $exec
+    %472:sreg_64 = V_CMP_LT_I32_e64 %119, %34, implicit $exec
+    %464:sgpr_32 = S_MOV_B32 0
+    %653:vgpr_32 = COPY %464, implicit $exec
+    %654:vgpr_32 = COPY %464, implicit $exec
+    %120:sreg_64 = SI_IF killed %472, %bb.31, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.28
+
+  bb.28 (%ir-block.291):
+    successors: %bb.29(0x40000000), %bb.30(0x40000000)
+
+    %474:vgpr_32 = nsw V_MUL_LO_U32_e64 %119, %295, implicit $exec
+    %475:vgpr_32 = V_ASHRREV_I32_e64 31, %474, implicit $exec
+    %477:vreg_64_align2 = REG_SEQUENCE %474, %subreg.sub0, %475, %subreg.sub1
+    %478:sreg_32 = S_MOV_B32 2
+    %479:vreg_64_align2 = V_LSHLREV_B64_e64 %478, killed %477, implicit $exec
+    %864:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %865:vgpr_32 = COPY %479.sub0
+    %866:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %867:vgpr_32 = COPY %479.sub1
+    %860:vgpr_32, %862:sreg_64_xexec = V_ADD_CO_U32_e64 %864, %865, 0, implicit $exec
+    %868:vgpr_32 = COPY %866
+    %861:vgpr_32, dead %863:sreg_64_xexec = V_ADDC_U32_e64 %868, %867, killed %862, 0, implicit $exec
+    %480:vreg_64_align2 = REG_SEQUENCE %860, %subreg.sub0, %861, %subreg.sub1
+    %121:vgpr_32 = GLOBAL_LOAD_DWORD killed %480, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.294, !tbaa !13, addrspace 1)
+    %481:sreg_32 = S_MOV_B32 3
+    %122:vgpr_32 = V_OR_B32_e64 %58, killed %481, implicit $exec
+    %482:sreg_64 = V_CMP_LT_I32_e64 %122, %34, implicit $exec
+    %652:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %123:sreg_64 = SI_IF killed %482, %bb.30, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.29
+
+  bb.29 (%ir-block.301):
+    successors: %bb.30(0x80000000)
+
+    %483:vgpr_32 = nsw V_MUL_LO_U32_e64 %122, %295, implicit $exec
+    %484:vgpr_32 = V_ASHRREV_I32_e64 31, %483, implicit $exec
+    %486:vreg_64_align2 = REG_SEQUENCE %483, %subreg.sub0, %484, %subreg.sub1
+    %488:vreg_64_align2 = V_LSHLREV_B64_e64 %478, killed %486, implicit $exec
+    %873:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub0
+    %874:vgpr_32 = COPY %488.sub0
+    %875:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %33.sub1
+    %876:vgpr_32 = COPY %488.sub1
+    %869:vgpr_32, %871:sreg_64_xexec = V_ADD_CO_U32_e64 %873, %874, 0, implicit $exec
+    %877:vgpr_32 = COPY %875
+    %870:vgpr_32, dead %872:sreg_64_xexec = V_ADDC_U32_e64 %877, %876, killed %871, 0, implicit $exec
+    %489:vreg_64_align2 = REG_SEQUENCE %869, %subreg.sub0, %870, %subreg.sub1
+    %124:vgpr_32 = GLOBAL_LOAD_DWORD killed %489, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.304, !tbaa !13, addrspace 1)
+
+  bb.30.Flow81:
+    successors: %bb.31(0x80000000)
+
+    %125:vgpr_32 = PHI %652, %bb.28, %124, %bb.29
+    SI_END_CF %123, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.31.Flow82:
+    successors: %bb.32(0x80000000)
+
+    %126:vgpr_32 = PHI %653, %bb.27, %121, %bb.30
+    %127:vgpr_32 = PHI %654, %bb.27, %125, %bb.30
+    SI_END_CF %120, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.32.Flow83:
+    successors: %bb.34(0x80000000)
+
+    %128:vgpr_32 = PHI %655, %bb.26, %118, %bb.31
+    %129:vgpr_32 = PHI %656, %bb.26, %126, %bb.31
+    %130:vgpr_32 = PHI %657, %bb.26, %127, %bb.31
+    SI_END_CF %117, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.34
+
+  bb.33.Flow84:
+    successors: %bb.43(0x80000000)
+
+    %131:vgpr_32 = PHI %59, %bb.24, %157, %bb.42
+    %132:vgpr_32 = PHI %60, %bb.24, %158, %bb.42
+    %133:vgpr_32 = PHI %61, %bb.24, %159, %bb.42
+    %134:vgpr_32 = PHI %62, %bb.24, %160, %bb.42
+    S_BRANCH %bb.43
+
+  bb.34 (%ir-block.316):
+    successors: %bb.35(0x40000000), %bb.42(0x40000000)
+
+    %135:vgpr_32 = PHI %658, %bb.25, %130, %bb.32
+    %136:vgpr_32 = PHI %659, %bb.25, %129, %bb.32
+    %137:vgpr_32 = PHI %660, %bb.25, %128, %bb.32
+    %138:vgpr_32 = PHI %661, %bb.25, %115, %bb.32
+    SI_END_CF %114, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %490:sreg_64 = V_CMP_LT_I32_e64 %38, %7, implicit $exec
+    %139:sreg_64 = SI_IF killed %490, %bb.42, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.35
+
+  bb.35 (%ir-block.321):
+    successors: %bb.36(0x40000000), %bb.41(0x40000000)
+
+    %491:vgpr_32 = nsw V_MUL_LO_U32_e64 %58, %292, implicit $exec
+    %492:sreg_32 = S_MOV_B32 0
+    %495:vgpr_32 = COPY %492
+    %493:vgpr_32 = V_CNDMASK_B32_e64 0, %495, 0, killed %491, %113, implicit $exec
+    %496:vgpr_32 = nsw V_ADD_U32_e64 killed %493, %38, 0, implicit $exec
+    %497:vgpr_32 = V_ASHRREV_I32_e64 31, %496, implicit $exec
+    %499:vreg_64_align2 = REG_SEQUENCE %496, %subreg.sub0, %497, %subreg.sub1
+    %500:sreg_32 = S_MOV_B32 2
+    %501:vreg_64_align2 = V_LSHLREV_B64_e64 %500, killed %499, implicit $exec
+    %882:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %883:vgpr_32 = COPY %501.sub0
+    %884:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %885:vgpr_32 = COPY %501.sub1
+    %878:vgpr_32, %880:sreg_64_xexec = V_ADD_CO_U32_e64 %882, %883, 0, implicit $exec
+    %886:vgpr_32 = COPY %884
+    %879:vgpr_32, dead %881:sreg_64_xexec = V_ADDC_U32_e64 %886, %885, killed %880, 0, implicit $exec
+    %140:vreg_64_align2 = REG_SEQUENCE %878, %subreg.sub0, %879, %subreg.sub1
+    %502:vgpr_32 = GLOBAL_LOAD_DWORD %140, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.326, !tbaa !13, addrspace 1)
+    %503:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %138, 0, killed %502, 0, %59, 0, 0, implicit $mode, implicit $exec
+    %504:sreg_32 = S_MOV_B32 1
+    %505:vgpr_32 = V_OR_B32_e64 %58, killed %504, implicit $exec
+    %506:vgpr_32 = nsw V_MUL_LO_U32_e64 %505, %292, implicit $exec
+    %507:sreg_64_xexec = V_CMP_LT_I32_e64 %505, %34, implicit $exec
+    %509:vgpr_32 = COPY %492
+    %508:vgpr_32 = V_CNDMASK_B32_e64 0, %509, 0, killed %506, killed %507, implicit $exec
+    %510:vgpr_32 = nsw V_ADD_U32_e64 killed %508, %38, 0, implicit $exec
+    %511:vgpr_32 = V_ASHRREV_I32_e64 31, %510, implicit $exec
+    %513:vreg_64_align2 = REG_SEQUENCE %510, %subreg.sub0, %511, %subreg.sub1
+    %514:vreg_64_align2 = V_LSHLREV_B64_e64 %500, killed %513, implicit $exec
+    %891:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %892:vgpr_32 = COPY %514.sub0
+    %893:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %894:vgpr_32 = COPY %514.sub1
+    %887:vgpr_32, %889:sreg_64_xexec = V_ADD_CO_U32_e64 %891, %892, 0, implicit $exec
+    %895:vgpr_32 = COPY %893
+    %888:vgpr_32, dead %890:sreg_64_xexec = V_ADDC_U32_e64 %895, %894, killed %889, 0, implicit $exec
+    %141:vreg_64_align2 = REG_SEQUENCE %887, %subreg.sub0, %888, %subreg.sub1
+    %515:vgpr_32 = GLOBAL_LOAD_DWORD %141, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.336, !tbaa !13, addrspace 1)
+    %516:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %137, 0, killed %515, 0, %503, 0, 0, implicit $mode, implicit $exec
+    %517:vgpr_32 = V_OR_B32_e64 %58, %500, implicit $exec
+    %518:vgpr_32 = nsw V_MUL_LO_U32_e64 %517, %292, implicit $exec
+    %519:sreg_64_xexec = V_CMP_LT_I32_e64 %517, %34, implicit $exec
+    %521:vgpr_32 = COPY %492
+    %520:vgpr_32 = V_CNDMASK_B32_e64 0, %521, 0, killed %518, killed %519, implicit $exec
+    %522:vgpr_32 = nsw V_ADD_U32_e64 killed %520, %38, 0, implicit $exec
+    %523:vgpr_32 = V_ASHRREV_I32_e64 31, %522, implicit $exec
+    %525:vreg_64_align2 = REG_SEQUENCE %522, %subreg.sub0, %523, %subreg.sub1
+    %526:vreg_64_align2 = V_LSHLREV_B64_e64 %500, killed %525, implicit $exec
+    %900:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %901:vgpr_32 = COPY %526.sub0
+    %902:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %903:vgpr_32 = COPY %526.sub1
+    %896:vgpr_32, %898:sreg_64_xexec = V_ADD_CO_U32_e64 %900, %901, 0, implicit $exec
+    %904:vgpr_32 = COPY %902
+    %897:vgpr_32, dead %899:sreg_64_xexec = V_ADDC_U32_e64 %904, %903, killed %898, 0, implicit $exec
+    %142:vreg_64_align2 = REG_SEQUENCE %896, %subreg.sub0, %897, %subreg.sub1
+    %527:vgpr_32 = GLOBAL_LOAD_DWORD %142, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.346, !tbaa !13, addrspace 1)
+    %528:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %136, 0, killed %527, 0, %516, 0, 0, implicit $mode, implicit $exec
+    %529:sreg_32 = S_MOV_B32 3
+    %530:vgpr_32 = V_OR_B32_e64 %58, killed %529, implicit $exec
+    %531:vgpr_32 = nsw V_MUL_LO_U32_e64 %530, %292, implicit $exec
+    %532:sreg_64_xexec = V_CMP_LT_I32_e64 %530, %34, implicit $exec
+    %534:vgpr_32 = COPY %492
+    %533:vgpr_32 = V_CNDMASK_B32_e64 0, %534, 0, killed %531, killed %532, implicit $exec
+    %535:vgpr_32 = nsw V_ADD_U32_e64 killed %533, %38, 0, implicit $exec
+    %536:vgpr_32 = V_ASHRREV_I32_e64 31, %535, implicit $exec
+    %538:vreg_64_align2 = REG_SEQUENCE %535, %subreg.sub0, %536, %subreg.sub1
+    %539:vreg_64_align2 = V_LSHLREV_B64_e64 %500, killed %538, implicit $exec
+    %909:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub0
+    %910:vgpr_32 = COPY %539.sub0
+    %911:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %31.sub1
+    %912:vgpr_32 = COPY %539.sub1
+    %905:vgpr_32, %907:sreg_64_xexec = V_ADD_CO_U32_e64 %909, %910, 0, implicit $exec
+    %913:vgpr_32 = COPY %911
+    %906:vgpr_32, dead %908:sreg_64_xexec = V_ADDC_U32_e64 %913, %912, killed %907, 0, implicit $exec
+    %143:vreg_64_align2 = REG_SEQUENCE %905, %subreg.sub0, %906, %subreg.sub1
+    %540:vgpr_32 = GLOBAL_LOAD_DWORD %143, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.356, !tbaa !13, addrspace 1)
+    %144:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %135, 0, killed %540, 0, %528, 0, 0, implicit $mode, implicit $exec
+    %541:sreg_32 = S_MOV_B32 64
+    %542:vgpr_32 = nsw V_ADD_U32_e64 %38, killed %541, 0, implicit $exec
+    %543:sreg_64 = V_CMP_LT_I32_e64 killed %542, %7, implicit $exec
+    %145:sreg_64 = SI_IF killed %543, %bb.41, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.36
+
+  bb.36 (%ir-block.365):
+    successors: %bb.37(0x40000000), %bb.40(0x40000000)
+
+    %544:vgpr_32 = GLOBAL_LOAD_DWORD %140, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.366, !tbaa !13, addrspace 1)
+    %545:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %138, 0, killed %544, 0, %60, 0, 0, implicit $mode, implicit $exec
+    %546:vgpr_32 = GLOBAL_LOAD_DWORD %141, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.370, !tbaa !13, addrspace 1)
+    %547:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %137, 0, killed %546, 0, %545, 0, 0, implicit $mode, implicit $exec
+    %548:vgpr_32 = GLOBAL_LOAD_DWORD %142, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.374, !tbaa !13, addrspace 1)
+    %549:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %136, 0, killed %548, 0, %547, 0, 0, implicit $mode, implicit $exec
+    %550:vgpr_32 = GLOBAL_LOAD_DWORD %143, 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.378, !tbaa !13, addrspace 1)
+    %146:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %135, 0, killed %550, 0, %549, 0, 0, implicit $mode, implicit $exec
+    %551:sreg_32 = S_MOV_B32 128
+    %552:vgpr_32 = nsw V_ADD_U32_e64 %38, killed %551, 0, implicit $exec
+    %553:sreg_64 = V_CMP_LT_I32_e64 killed %552, %7, implicit $exec
+    %147:sreg_64 = SI_IF killed %553, %bb.40, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.37
+
+  bb.37 (%ir-block.387):
+    successors: %bb.38(0x40000000), %bb.39(0x40000000)
+
+    %554:vgpr_32 = GLOBAL_LOAD_DWORD %140, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.388, !tbaa !13, addrspace 1)
+    %555:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %138, 0, killed %554, 0, %61, 0, 0, implicit $mode, implicit $exec
+    %556:vgpr_32 = GLOBAL_LOAD_DWORD %141, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.392, !tbaa !13, addrspace 1)
+    %557:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %137, 0, killed %556, 0, %555, 0, 0, implicit $mode, implicit $exec
+    %558:vgpr_32 = GLOBAL_LOAD_DWORD %142, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.396, !tbaa !13, addrspace 1)
+    %559:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %136, 0, killed %558, 0, %557, 0, 0, implicit $mode, implicit $exec
+    %560:vgpr_32 = GLOBAL_LOAD_DWORD %143, 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.400, !tbaa !13, addrspace 1)
+    %148:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %135, 0, killed %560, 0, %559, 0, 0, implicit $mode, implicit $exec
+    %561:sreg_32 = S_MOV_B32 192
+    %562:vgpr_32 = nsw V_ADD_U32_e64 %38, killed %561, 0, implicit $exec
+    %563:sreg_64 = V_CMP_LT_I32_e64 killed %562, %7, implicit $exec
+    %149:sreg_64 = SI_IF killed %563, %bb.39, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.38
+
+  bb.38 (%ir-block.409):
+    successors: %bb.39(0x80000000)
+
+    %564:vgpr_32 = GLOBAL_LOAD_DWORD %140, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.410, !tbaa !13, addrspace 1)
+    %565:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %138, 0, killed %564, 0, %62, 0, 0, implicit $mode, implicit $exec
+    %566:vgpr_32 = GLOBAL_LOAD_DWORD %141, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.414, !tbaa !13, addrspace 1)
+    %567:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %137, 0, killed %566, 0, %565, 0, 0, implicit $mode, implicit $exec
+    %568:vgpr_32 = GLOBAL_LOAD_DWORD %142, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.418, !tbaa !13, addrspace 1)
+    %569:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %136, 0, killed %568, 0, %567, 0, 0, implicit $mode, implicit $exec
+    %570:vgpr_32 = GLOBAL_LOAD_DWORD %143, 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.422, !tbaa !13, addrspace 1)
+    %150:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %135, 0, killed %570, 0, %569, 0, 0, implicit $mode, implicit $exec
+
+  bb.39.Flow77:
+    successors: %bb.40(0x80000000)
+
+    %151:vgpr_32 = PHI %62, %bb.37, %150, %bb.38
+    SI_END_CF %149, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.40.Flow78:
+    successors: %bb.41(0x80000000)
+
+    %152:vgpr_32 = PHI %61, %bb.36, %148, %bb.39
+    %153:vgpr_32 = PHI %62, %bb.36, %151, %bb.39
+    SI_END_CF %147, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.41.Flow79:
+    successors: %bb.42(0x80000000)
+
+    %154:vgpr_32 = PHI %60, %bb.35, %146, %bb.40
+    %155:vgpr_32 = PHI %61, %bb.35, %152, %bb.40
+    %156:vgpr_32 = PHI %62, %bb.35, %153, %bb.40
+    SI_END_CF %145, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.42.Flow80:
+    successors: %bb.33(0x80000000)
+
+    %157:vgpr_32 = PHI %59, %bb.34, %144, %bb.41
+    %158:vgpr_32 = PHI %60, %bb.34, %154, %bb.41
+    %159:vgpr_32 = PHI %61, %bb.34, %155, %bb.41
+    %160:vgpr_32 = PHI %62, %bb.34, %156, %bb.41
+    SI_END_CF %139, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.33
+
+  bb.43 (%ir-block.436):
+    successors: %bb.44(0x40000000), %bb.45(0x40000000)
+
+    %573:sreg_32 = S_MOV_B32 8
+    %574:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed %573, %12, implicit $exec
+    %575:sreg_32 = S_MOV_B32 2
+    %576:vgpr_32 = V_ADD_LSHL_U32_e64 killed %574, %11, %575, implicit $exec
+    DS_WRITE_B32_gfx9 %576, %131, 0, 0, implicit $exec :: (store (s32) into %ir.439, !tbaa !13, addrspace 3)
+    DS_WRITE_B32_gfx9 %576, %132, 256, 0, implicit $exec :: (store (s32) into %ir.440, !tbaa !13, addrspace 3)
+    DS_WRITE_B32_gfx9 %576, %133, 512, 0, implicit $exec :: (store (s32) into %ir.441, !tbaa !13, addrspace 3)
+    DS_WRITE_B32_gfx9 %576, %134, 768, 0, implicit $exec :: (store (s32) into %ir.442, !tbaa !13, addrspace 3)
+    ATOMIC_FENCE 5, 3
+    S_BARRIER
+    ATOMIC_FENCE 4, 3
+    %577:sreg_32 = S_MOV_B32 256
+    %578:sreg_64 = V_CMP_LT_U32_e64 %13, killed %577, implicit $exec
+    %572:sreg_64 = IMPLICIT_DEF
+    %571:sgpr_32 = IMPLICIT_DEF
+    %662:vgpr_32 = COPY %571
+    %663:vreg_64_align2 = COPY %572
+    %161:sreg_64 = SI_IF killed %578, %bb.45, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.44
+
+  bb.44..preheader.i:
+    successors: %bb.46(0x40000000), %bb.50(0x40000000)
+
+    %582:vgpr_32 = V_LSHLREV_B32_e64 %575, %13, implicit $exec
+    %583:vgpr_32 = DS_READ_B32_gfx9 %582, 0, 0, implicit $exec :: (load (s32) from %ir.447, !tbaa !13, addrspace 3)
+    %584:vgpr_32 = DS_READ_B32_gfx9 %582, 1024, 0, implicit $exec :: (load (s32) from %ir.448, !tbaa !13, addrspace 3)
+    %585:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %583, 0, killed %584, 0, 0, implicit $mode, implicit $exec
+    %586:vgpr_32 = DS_READ_B32_gfx9 %582, 2048, 0, implicit $exec :: (load (s32) from %ir.451, !tbaa !13, addrspace 3)
+    %587:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %586, 0, killed %585, 0, 0, implicit $mode, implicit $exec
+    %588:vgpr_32 = DS_READ_B32_gfx9 %582, 3072, 0, implicit $exec :: (load (s32) from %ir.454, !tbaa !13, addrspace 3)
+    %589:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %588, 0, killed %587, 0, 0, implicit $mode, implicit $exec
+    %590:vgpr_32 = DS_READ_B32_gfx9 %582, 4096, 0, implicit $exec :: (load (s32) from %ir.457, !tbaa !13, addrspace 3)
+    %591:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %590, 0, killed %589, 0, 0, implicit $mode, implicit $exec
+    %592:vgpr_32 = DS_READ_B32_gfx9 %582, 5120, 0, implicit $exec :: (load (s32) from %ir.460, !tbaa !13, addrspace 3)
+    %593:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %592, 0, killed %591, 0, 0, implicit $mode, implicit $exec
+    %594:vgpr_32 = DS_READ_B32_gfx9 %582, 6144, 0, implicit $exec :: (load (s32) from %ir.463, !tbaa !13, addrspace 3)
+    %595:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %594, 0, killed %593, 0, 0, implicit $mode, implicit $exec
+    %596:vgpr_32 = DS_READ_B32_gfx9 %582, 7168, 0, implicit $exec :: (load (s32) from %ir.466, !tbaa !13, addrspace 3)
+    %597:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %596, 0, killed %595, 0, 0, implicit $mode, implicit $exec
+    %598:vgpr_32 = DS_READ_B32_gfx9 %582, 8192, 0, implicit $exec :: (load (s32) from %ir.469, !tbaa !13, addrspace 3)
+    %599:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %598, 0, killed %597, 0, 0, implicit $mode, implicit $exec
+    %600:vgpr_32 = DS_READ_B32_gfx9 %582, 9216, 0, implicit $exec :: (load (s32) from %ir.472, !tbaa !13, addrspace 3)
+    %601:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %600, 0, killed %599, 0, 0, implicit $mode, implicit $exec
+    %602:vgpr_32 = DS_READ_B32_gfx9 %582, 10240, 0, implicit $exec :: (load (s32) from %ir.475, !tbaa !13, addrspace 3)
+    %603:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %602, 0, killed %601, 0, 0, implicit $mode, implicit $exec
+    %604:vgpr_32 = DS_READ_B32_gfx9 %582, 11264, 0, implicit $exec :: (load (s32) from %ir.478, !tbaa !13, addrspace 3)
+    %605:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %604, 0, killed %603, 0, 0, implicit $mode, implicit $exec
+    %606:vgpr_32 = DS_READ_B32_gfx9 %582, 12288, 0, implicit $exec :: (load (s32) from %ir.481, !tbaa !13, addrspace 3)
+    %607:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %606, 0, killed %605, 0, 0, implicit $mode, implicit $exec
+    %608:vgpr_32 = DS_READ_B32_gfx9 %582, 13312, 0, implicit $exec :: (load (s32) from %ir.484, !tbaa !13, addrspace 3)
+    %609:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %608, 0, killed %607, 0, 0, implicit $mode, implicit $exec
+    %610:vgpr_32 = DS_READ_B32_gfx9 %582, 14336, 0, implicit $exec :: (load (s32) from %ir.487, !tbaa !13, addrspace 3)
+    %611:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %610, 0, killed %609, 0, 0, implicit $mode, implicit $exec
+    %612:vgpr_32 = DS_READ_B32_gfx9 %582, 15360, 0, implicit $exec :: (load (s32) from %ir.490, !tbaa !13, addrspace 3)
+    %162:vgpr_32 = contract nofpexcept V_ADD_F32_e64 0, killed %612, 0, killed %611, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B32_gfx9 %582, %162, 0, 0, implicit $exec :: (store (s32) into %ir.447, !tbaa !13, addrspace 3)
+    %163:vgpr_32 = V_ADD_U32_e64 %13, %37, 0, implicit $exec
+    %613:sreg_64 = V_CMP_LT_I32_e64 %163, %7, implicit $exec
+    %580:sreg_64 = IMPLICIT_DEF
+    %579:sgpr_32 = IMPLICIT_DEF
+    %665:vgpr_32 = COPY %579
+    %666:vreg_64_align2 = COPY %580
+    %164:sreg_64 = SI_IF killed %613, %bb.50, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.46
+
+  bb.45.Flow94:
+    successors: %bb.23(0x80000000)
+
+    %167:sreg_64 = PHI %17, %bb.43, %674, %bb.50
+    %165:vgpr_32 = PHI %662, %bb.43, %173, %bb.50
+    %166:vreg_64_align2 = PHI %663, %bb.43, %174, %bb.50
+    SI_END_CF %161, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.23
+
+  bb.46 (%ir-block.501):
+    successors: %bb.47(0x50000000), %bb.49(0x30000000)
+
+    %614:sgpr_32 = S_MOV_B32 0
+    %616:vgpr_32 = COPY killed %614
+    %615:sreg_64 = nofpexcept V_CMP_EQ_F32_e64 0, %220, 0, %616, 0, implicit $mode, implicit $exec
+    %168:vgpr_32 = contract nofpexcept V_MUL_F32_e64 0, %162, 0, %5, 0, 0, implicit $mode, implicit $exec
+    %617:vgpr_32 = nsw V_MUL_LO_U32_e64 %163, %234, implicit $exec
+    %618:vgpr_32 = V_ASHRREV_I32_e64 31, %617, implicit $exec
+    %620:vreg_64_align2 = REG_SEQUENCE %617, %subreg.sub0, %618, %subreg.sub1
+    %169:vreg_64_align2 = COPY %620
+    %621:sreg_64 = S_AND_B64 $exec, killed %615, implicit-def dead $scc
+    $vcc = COPY %621
+    S_CBRANCH_VCCNZ %bb.49, implicit $vcc
+    S_BRANCH %bb.47
+
+  bb.47 (%ir-block.506):
+    successors: %bb.49(0x80000000)
+
+    %622:sreg_32 = S_MOV_B32 2
+    %623:vreg_64_align2 = V_LSHLREV_B64_e64 killed %622, %620, implicit $exec
+    %918:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub0
+    %919:vgpr_32 = COPY %623.sub0
+    %920:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub1
+    %921:vgpr_32 = COPY %623.sub1
+    %914:vgpr_32, %916:sreg_64_xexec = V_ADD_CO_U32_e64 %918, %919, 0, implicit $exec
+    %922:vgpr_32 = COPY %920
+    %915:vgpr_32, dead %917:sreg_64_xexec = V_ADDC_U32_e64 %922, %921, killed %916, 0, implicit $exec
+    %624:vreg_64_align2 = REG_SEQUENCE %914, %subreg.sub0, %915, %subreg.sub1
+    %625:vgpr_32 = GLOBAL_LOAD_DWORD killed %624, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.507, !tbaa !13, addrspace 1)
+    %170:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, killed %625, 0, %220, 0, %168, 0, 0, implicit $mode, implicit $exec
+    S_BRANCH %bb.49
+
+  bb.48.Flow:
+    successors: %bb.9(0x80000000)
+
+    %171:vgpr_32 = PHI %664, %bb.7, %25, %bb.8
+    %672:sreg_64 = COPY $exec
+    S_BRANCH %bb.9
+
+  bb.49.Flow76:
+    successors: %bb.50(0x80000000)
+
+    %172:vgpr_32 = PHI %168, %bb.46, %170, %bb.47
+    %678:sreg_64 = S_OR_B64 %17, $exec, implicit-def $scc
+
+  bb.50.Flow95:
+    successors: %bb.45(0x80000000)
+
+    %175:sreg_64 = PHI %17, %bb.44, %678, %bb.49
+    %173:vgpr_32 = PHI %665, %bb.44, %172, %bb.49
+    %174:vreg_64_align2 = PHI %666, %bb.44, %169, %bb.49
+    SI_END_CF %164, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %675:sreg_64 = S_ANDN2_B64 %17, $exec, implicit-def $scc
+    %676:sreg_64 = S_AND_B64 %175, $exec, implicit-def $scc
+    %674:sreg_64 = S_OR_B64 %675, %676, implicit-def $scc
+    S_BRANCH %bb.45
+
+  bb.51..sink.split.i:
+    successors: %bb.52(0x80000000)
+
+    %628:sreg_32 = S_MOV_B32 2
+    %629:vreg_64_align2 = V_LSHLREV_B64_e64 killed %628, %110, implicit $exec
+    %927:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub0
+    %928:vgpr_32 = COPY %629.sub0
+    %929:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY %10.sub1
+    %930:vgpr_32 = COPY %629.sub1
+    %923:vgpr_32, %925:sreg_64_xexec = V_ADD_CO_U32_e64 %927, %928, 0, implicit $exec
+    %931:vgpr_32 = COPY %929
+    %924:vgpr_32, dead %926:sreg_64_xexec = V_ADDC_U32_e64 %931, %930, killed %925, 0, implicit $exec
+    %630:vreg_64_align2 = REG_SEQUENCE %923, %subreg.sub0, %924, %subreg.sub1
+    GLOBAL_STORE_DWORD killed %630, %109, 0, 0, implicit $exec :: (store (s32) into %ir.516, !tbaa !13, addrspace 1)
+
+  bb.52.Flow96:
+    successors: %bb.53(0x80000000)
+
+    SI_END_CF %112, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+
+  bb.53.Flow97:
+    successors: %bb.54(0x80000000)
+
+
+  bb.54._Z25rocblas_gemvn_kernel_calcILi64ELi16EiffLi0EEviiT3_PKT2_T1_S3_iS0_PS1_i.exit:
+    S_ENDPGM 0
+
+...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# CHECK: {{.*}}

>From 0a642094f5bec27297a46eee0f3ba85819a1a047 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 7 Sep 2023 15:46:45 +0200
Subject: [PATCH 2/2] MachineSink/AMDGPU: Allow sinking past SI_END_CF in a
 simple case

SWEDEV-414443. Fixes a performance regression introduced by D155343 by
allowing MachineSink to sink past SI_END_CF in the simple case where the
divergent control flow contains no loops.
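
A minimal sketch of the shape this enables (block and register names are
hypothetical, not taken from the test):

  bb.0:
    %v:vgpr_32 = V_ADD_U32_e64 %a, %b, 0, implicit $exec  ; only used in bb.2
    %mask:sreg_64 = SI_IF %cond, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    ; divergent path, does not use %v
    S_BRANCH %bb.2

  bb.2:
    SI_END_CF %mask, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
    ; %v may be sunk here: SI_END_CF restores $exec to its value at the
    ; SI_IF, so %v still executes under the same mask.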
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |  9 +++
 llvm/lib/CodeGen/MachineSink.cpp              |  4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 33 +++++++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  4 +
 .../AMDGPU/machine-sink-swdev414443.mir       | 74 +++++++++----------
 5 files changed, 86 insertions(+), 38 deletions(-)
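
A usage note, not part of the patch: a target opts in by overriding the new
hook and proving that the redefinition is a no-op for the move candidate. A
hedged sketch with hypothetical opcode and register names:

  bool MyTgtInstrInfo::modifiesRegisterImplicitly(
      Register Reg, const MachineInstr *MoveCandidate,
      const MachineInstr *ModifierInstr) const {
    // Only safe if Reg provably holds the same value after ModifierInstr as
    // it does at MoveCandidate's current position.
    if (ModifierInstr->getOpcode() == MYTGT::END_DIVERGE &&
        Reg == MYTGT::MASK_REG)
      return false; // restore-style pseudo: the write is a no-op here.
    return true;    // conservative default: assume the value changed.
  }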

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1c2ca867834647..310cb22d4862f2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -140,6 +140,15 @@ class TargetInstrInfo : public MCInstrInfo {
     return false;
   }
 
+  /// Called when attempting to move \p MoveCandidate past \p ModifierInstr,
+  /// where \p MoveCandidate uses \p Reg and \p ModifierInstr redefines it.
+  /// Lets the target check whether \p Reg is redefined with the same value.
+  virtual bool
+  modifiesRegisterImplicitly(Register Reg, const MachineInstr *MoveCandidate,
+                             const MachineInstr *ModifierInstr) const {
+    return true;
+  }
+
 protected:
   /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
   /// set, this hook lets the target specify whether the instruction is actually
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b4cbb93d758ef2..aa89dddb05d7e5 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -290,8 +290,10 @@ static bool blockPrologueInterferes(const MachineBasicBlock *BB,
       if (MO.isUse()) {
         if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
           continue;
-        if (PI->modifiesRegister(Reg, TRI))
+        if (PI->modifiesRegister(Reg, TRI) &&
+            TII->modifiesRegisterImplicitly(Reg, &MI, &*PI)) {
           return true;
+        }
       } else {
         if (PI->readsRegister(Reg, TRI))
           return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38b5e0114903cd..c8a2728bb3445a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -171,6 +171,39 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
          isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
 }
 
+bool SIInstrInfo::modifiesRegisterImplicitly(
+    Register Reg, const MachineInstr *MoveCandidate,
+    const MachineInstr *ModifierInstr) const {
+
+  if (ModifierInstr->getOpcode() == AMDGPU::SI_END_CF && Reg == AMDGPU::EXEC) {
+    const MachineRegisterInfo &MRI = MoveCandidate->getMF()->getRegInfo();
+
+    // Check whether this is the simple case of:
+    //
+    //  %0 = MoveCandidate %1, %2, implicit $exec
+    //  %EndCF:sreg_64 = SI_IF %cond, %bb.B
+    //  S_BRANCH %bb.A
+    //
+    // bb.A
+    //  ...
+    //
+    // bb.B
+    //  SI_END_CF %EndCF, implicit-def dead $exec
+    //  ... MoveCandidate should be moved here
+
+    // MoveCandidate is from the block that started divergent control flow via
+    // a simple SI_IF (no loops): the only user of the SI_IF mask is SI_END_CF,
+    // and SI_END_CF restores the exec mask to its value before the SI_IF.
+    Register EndCF = ModifierInstr->getOperand(0).getReg();
+    MachineInstr *SIIF = MRI.getVRegDef(EndCF);
+    if (SIIF->getOpcode() == AMDGPU::SI_IF && MRI.hasOneUse(EndCF) &&
+        SIIF->getParent() == MoveCandidate->getParent())
+      return false;
+  }
+
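+  // Conservatively assume ModifierInstr changes the value of Reg.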
+  return true;
+}
+
 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                           int64_t &Offset0,
                                           int64_t &Offset1) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e85917a4c0f329..16187b91d3734f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,6 +222,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isIgnorableUse(const MachineOperand &MO) const override;
 
+  bool
+  modifiesRegisterImplicitly(Register Reg, const MachineInstr *MoveCandidate,
+                             const MachineInstr *ModifierInstr) const override;
+
   bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
                                int64_t &Offset1) const override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir
index 84fc2a619a5c70..cb1e496721282e 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-swdev414443.mir
@@ -2330,7 +2330,6 @@ body:             |
   ; CHECK-NEXT:   [[V_ADDC_U32_e64_10:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_11:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY141]], [[COPY140]], killed [[V_ADD_CO_U32_e64_11]], 0, implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE39:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_10]], %subreg.sub0, [[V_ADDC_U32_e64_10]], %subreg.sub1
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.166, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD5]], 0, [[PHI20]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_4]], [[PHI18]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_6:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_12]], implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE40:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_12]], %subreg.sub0, [[V_ASHRREV_I32_e64_6]], %subreg.sub1
@@ -2344,7 +2343,6 @@ body:             |
   ; CHECK-NEXT:   [[V_ADDC_U32_e64_12:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_13:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY146]], [[COPY145]], killed [[V_ADD_CO_U32_e64_13]], 0, implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE41:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_12]], %subreg.sub0, [[V_ADDC_U32_e64_12]], %subreg.sub1
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD6:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.172, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD6]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_4]], [[PHI18]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_7:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_13]], implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE42:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_13]], %subreg.sub0, [[V_ASHRREV_I32_e64_7]], %subreg.sub1
@@ -2358,7 +2356,6 @@ body:             |
   ; CHECK-NEXT:   [[V_ADDC_U32_e64_14:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_15:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY151]], [[COPY150]], killed [[V_ADD_CO_U32_e64_15]], 0, implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE43:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_14]], %subreg.sub0, [[V_ADDC_U32_e64_14]], %subreg.sub1
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD7:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.178, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD7]], 0, [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MUL_LO_U32_e64_5]], [[PHI18]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ASHRREV_I32_e64_8:%[0-9]+]]:vgpr_32 = V_ASHRREV_I32_e64 31, [[V_ADD_U32_e64_14]], implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE44:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_U32_e64_14]], %subreg.sub0, [[V_ASHRREV_I32_e64_8]], %subreg.sub1
@@ -2372,7 +2369,6 @@ body:             |
   ; CHECK-NEXT:   [[V_ADDC_U32_e64_16:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_17:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY156]], [[COPY155]], killed [[V_ADD_CO_U32_e64_17]], 0, implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE45:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_16]], %subreg.sub0, [[V_ADDC_U32_e64_16]], %subreg.sub1
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD8:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.184, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD8]], 0, [[V_FMAC_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[SI_IF4:%[0-9]+]]:sreg_64 = SI_IF [[COPY119]], %bb.20, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.15
   ; CHECK-NEXT: {{  $}}
@@ -2380,13 +2376,9 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.16(0x40000000), %bb.19(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD9:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.192, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_4:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD9]], 0, [[PHI21]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD10:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.196, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_5:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD10]], 0, [[V_FMAC_F32_e64_4]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD11:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.200, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_6:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD11]], 0, [[V_FMAC_F32_e64_5]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD12:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.204, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_7:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD12]], 0, [[V_FMAC_F32_e64_6]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[SI_IF5:%[0-9]+]]:sreg_64 = SI_IF [[COPY120]], %bb.19, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.16
   ; CHECK-NEXT: {{  $}}
@@ -2394,13 +2386,9 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD13:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.212, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_8:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD13]], 0, [[PHI22]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD14:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.216, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_9:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD14]], 0, [[V_FMAC_F32_e64_8]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD15:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.220, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_10:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD15]], 0, [[V_FMAC_F32_e64_9]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD16:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.224, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_11:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD16]], 0, [[V_FMAC_F32_e64_10]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[SI_IF6:%[0-9]+]]:sreg_64 = SI_IF [[COPY121]], %bb.18, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.17
   ; CHECK-NEXT: {{  $}}
@@ -2408,34 +2396,46 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.18(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD17:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE39]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.232, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_12:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD17]], 0, [[PHI23]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, killed [[GLOBAL_LOAD_DWORD17]], 0, [[PHI23]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD18:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE41]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.236, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_13:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD18]], 0, [[V_FMAC_F32_e64_12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, killed [[GLOBAL_LOAD_DWORD18]], 0, [[V_FMAC_F32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD19:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE43]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.240, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_14:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD19]], 0, [[V_FMAC_F32_e64_13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, killed [[GLOBAL_LOAD_DWORD19]], 0, [[V_FMAC_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD20:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE45]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.244, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_15:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD20]], 0, [[V_FMAC_F32_e64_14]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, killed [[GLOBAL_LOAD_DWORD20]], 0, [[V_FMAC_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.18.Flow85:
   ; CHECK-NEXT:   successors: %bb.19(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI24:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.16, [[V_FMAC_F32_e64_15]], %bb.17
+  ; CHECK-NEXT:   [[PHI24:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.16, [[V_FMAC_F32_e64_3]], %bb.17
   ; CHECK-NEXT:   SI_END_CF [[SI_IF6]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_4:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[GLOBAL_LOAD_DWORD13]], 0, [[PHI22]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_5:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, [[GLOBAL_LOAD_DWORD14]], 0, [[V_FMAC_F32_e64_4]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_6:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, [[GLOBAL_LOAD_DWORD15]], 0, [[V_FMAC_F32_e64_5]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_7:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, [[GLOBAL_LOAD_DWORD16]], 0, [[V_FMAC_F32_e64_6]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.19.Flow86:
   ; CHECK-NEXT:   successors: %bb.20(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI25:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.15, [[V_FMAC_F32_e64_11]], %bb.18
+  ; CHECK-NEXT:   [[PHI25:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.15, [[V_FMAC_F32_e64_7]], %bb.18
   ; CHECK-NEXT:   [[PHI26:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.15, [[PHI24]], %bb.18
   ; CHECK-NEXT:   SI_END_CF [[SI_IF5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_8:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[GLOBAL_LOAD_DWORD9]], 0, [[PHI21]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_9:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, [[GLOBAL_LOAD_DWORD10]], 0, [[V_FMAC_F32_e64_8]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_10:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, [[GLOBAL_LOAD_DWORD11]], 0, [[V_FMAC_F32_e64_9]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_11:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, [[GLOBAL_LOAD_DWORD12]], 0, [[V_FMAC_F32_e64_10]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.20.Flow87:
   ; CHECK-NEXT:   successors: %bb.21(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI27:%[0-9]+]]:vgpr_32 = PHI [[PHI21]], %bb.14, [[V_FMAC_F32_e64_7]], %bb.19
+  ; CHECK-NEXT:   [[PHI27:%[0-9]+]]:vgpr_32 = PHI [[PHI21]], %bb.14, [[V_FMAC_F32_e64_11]], %bb.19
   ; CHECK-NEXT:   [[PHI28:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.14, [[PHI25]], %bb.19
   ; CHECK-NEXT:   [[PHI29:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.14, [[PHI26]], %bb.19
   ; CHECK-NEXT:   SI_END_CF [[SI_IF4]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_12:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[GLOBAL_LOAD_DWORD5]], 0, [[PHI20]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_13:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD2]], 0, [[GLOBAL_LOAD_DWORD6]], 0, [[V_FMAC_F32_e64_12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_14:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD3]], 0, [[GLOBAL_LOAD_DWORD7]], 0, [[V_FMAC_F32_e64_13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_15:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD4]], 0, [[GLOBAL_LOAD_DWORD8]], 0, [[V_FMAC_F32_e64_14]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.21 (%ir-block.254):
   ; CHECK-NEXT:   successors: %bb.22(0x04000000), %bb.13(0x7c000000)
@@ -2443,7 +2443,7 @@ body:             |
   ; CHECK-NEXT:   [[PHI30:%[0-9]+]]:vgpr_32 = PHI [[PHI23]], %bb.13, [[PHI29]], %bb.20
   ; CHECK-NEXT:   [[PHI31:%[0-9]+]]:vgpr_32 = PHI [[PHI22]], %bb.13, [[PHI28]], %bb.20
   ; CHECK-NEXT:   [[PHI32:%[0-9]+]]:vgpr_32 = PHI [[PHI21]], %bb.13, [[PHI27]], %bb.20
-  ; CHECK-NEXT:   [[PHI33:%[0-9]+]]:vgpr_32 = PHI [[PHI20]], %bb.13, [[V_FMAC_F32_e64_3]], %bb.20
+  ; CHECK-NEXT:   [[PHI33:%[0-9]+]]:vgpr_32 = PHI [[PHI20]], %bb.13, [[V_FMAC_F32_e64_15]], %bb.20
   ; CHECK-NEXT:   SI_END_CF [[SI_IF3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = nuw nsw V_ADD_U32_e64 [[PHI19]], [[S_MOV_B32_24]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[PHI18]], [[S_LSHL_B32_1]], 0, implicit $exec
@@ -2716,7 +2716,6 @@ body:             |
   ; CHECK-NEXT:   [[V_ADDC_U32_e64_34:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_35:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY215]], [[COPY214]], killed [[V_ADD_CO_U32_e64_35]], 0, implicit $exec
   ; CHECK-NEXT:   [[REG_SEQUENCE62:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_ADD_CO_U32_e64_34]], %subreg.sub0, [[V_ADDC_U32_e64_34]], %subreg.sub1
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD28:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.356, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_19:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD28]], 0, [[V_FMAC_F32_e64_18]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_42:%[0-9]+]]:sreg_32 = S_MOV_B32 64
   ; CHECK-NEXT:   [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_42]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_15:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_21]], [[COPY28]], implicit $exec
@@ -2727,13 +2726,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.37(0x40000000), %bb.40(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD29:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.366, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_20:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD29]], 0, [[PHI12]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_19:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD29]], 0, [[PHI12]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD30:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.370, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_21:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD30]], 0, [[V_FMAC_F32_e64_20]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_20:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD30]], 0, [[V_FMAC_F32_e64_19]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD31:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.374, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_22:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD31]], 0, [[V_FMAC_F32_e64_21]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_21:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD31]], 0, [[V_FMAC_F32_e64_20]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD32:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 256, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.378, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_23:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD32]], 0, [[V_FMAC_F32_e64_22]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_43:%[0-9]+]]:sreg_32 = S_MOV_B32 128
   ; CHECK-NEXT:   [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_43]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_16:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_22]], [[COPY28]], implicit $exec
@@ -2744,13 +2742,12 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.38(0x40000000), %bb.39(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD33:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.388, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_24:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD33]], 0, [[PHI13]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_22:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD33]], 0, [[PHI13]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD34:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.392, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_25:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD34]], 0, [[V_FMAC_F32_e64_24]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_23:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD34]], 0, [[V_FMAC_F32_e64_22]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD35:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.396, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_26:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD35]], 0, [[V_FMAC_F32_e64_25]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_24:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD35]], 0, [[V_FMAC_F32_e64_23]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD36:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 512, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.400, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_27:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD36]], 0, [[V_FMAC_F32_e64_26]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B32_44:%[0-9]+]]:sreg_32 = S_MOV_B32 192
   ; CHECK-NEXT:   [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[S_MOV_B32_44]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_CMP_LT_I32_e64_17:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_ADD_U32_e64_23]], [[COPY28]], implicit $exec
@@ -2761,39 +2758,42 @@ body:             |
   ; CHECK-NEXT:   successors: %bb.39(0x80000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD37:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE56]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.410, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_28:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD37]], 0, [[PHI14]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_25:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI54]], 0, killed [[GLOBAL_LOAD_DWORD37]], 0, [[PHI14]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD38:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE58]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.414, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_29:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD38]], 0, [[V_FMAC_F32_e64_28]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_26:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI53]], 0, killed [[GLOBAL_LOAD_DWORD38]], 0, [[V_FMAC_F32_e64_25]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD39:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE60]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.418, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_30:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD39]], 0, [[V_FMAC_F32_e64_29]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_27:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI52]], 0, killed [[GLOBAL_LOAD_DWORD39]], 0, [[V_FMAC_F32_e64_26]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[GLOBAL_LOAD_DWORD40:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE62]], 768, 0, implicit $exec :: ("amdgpu-noclobber" load (s32) from %ir.422, !tbaa !13, addrspace 1)
-  ; CHECK-NEXT:   [[V_FMAC_F32_e64_31:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD40]], 0, [[V_FMAC_F32_e64_30]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_28:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, killed [[GLOBAL_LOAD_DWORD40]], 0, [[V_FMAC_F32_e64_27]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.39.Flow77:
   ; CHECK-NEXT:   successors: %bb.40(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI55:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.37, [[V_FMAC_F32_e64_31]], %bb.38
+  ; CHECK-NEXT:   [[PHI55:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.37, [[V_FMAC_F32_e64_28]], %bb.38
   ; CHECK-NEXT:   SI_END_CF [[SI_IF15]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_29:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, [[GLOBAL_LOAD_DWORD36]], 0, [[V_FMAC_F32_e64_24]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.40.Flow78:
   ; CHECK-NEXT:   successors: %bb.41(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI56:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.36, [[V_FMAC_F32_e64_27]], %bb.39
+  ; CHECK-NEXT:   [[PHI56:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.36, [[V_FMAC_F32_e64_29]], %bb.39
   ; CHECK-NEXT:   [[PHI57:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.36, [[PHI55]], %bb.39
   ; CHECK-NEXT:   SI_END_CF [[SI_IF14]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_30:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, [[GLOBAL_LOAD_DWORD32]], 0, [[V_FMAC_F32_e64_21]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.41.Flow79:
   ; CHECK-NEXT:   successors: %bb.42(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI58:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.35, [[V_FMAC_F32_e64_23]], %bb.40
+  ; CHECK-NEXT:   [[PHI58:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.35, [[V_FMAC_F32_e64_30]], %bb.40
   ; CHECK-NEXT:   [[PHI59:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.35, [[PHI56]], %bb.40
   ; CHECK-NEXT:   [[PHI60:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.35, [[PHI57]], %bb.40
   ; CHECK-NEXT:   SI_END_CF [[SI_IF13]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[V_FMAC_F32_e64_31:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[PHI51]], 0, [[GLOBAL_LOAD_DWORD28]], 0, [[V_FMAC_F32_e64_18]], 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.42.Flow80:
   ; CHECK-NEXT:   successors: %bb.33(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI61:%[0-9]+]]:vgpr_32 = PHI [[PHI11]], %bb.34, [[V_FMAC_F32_e64_19]], %bb.41
+  ; CHECK-NEXT:   [[PHI61:%[0-9]+]]:vgpr_32 = PHI [[PHI11]], %bb.34, [[V_FMAC_F32_e64_31]], %bb.41
   ; CHECK-NEXT:   [[PHI62:%[0-9]+]]:vgpr_32 = PHI [[PHI12]], %bb.34, [[PHI58]], %bb.41
   ; CHECK-NEXT:   [[PHI63:%[0-9]+]]:vgpr_32 = PHI [[PHI13]], %bb.34, [[PHI59]], %bb.41
   ; CHECK-NEXT:   [[PHI64:%[0-9]+]]:vgpr_32 = PHI [[PHI14]], %bb.34, [[PHI60]], %bb.41


