[llvm] 716ca2e - [AMDGPU] Pre-sink IR input for some tests

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 21 06:27:45 PDT 2022


Author: Jay Foad
Date: 2022-07-21T14:25:44+01:00
New Revision: 716ca2e3ef37c9aa9517216042653d8a88d76ecf

URL: https://github.com/llvm/llvm-project/commit/716ca2e3ef37c9aa9517216042653d8a88d76ecf
DIFF: https://github.com/llvm/llvm-project/commit/716ca2e3ef37c9aa9517216042653d8a88d76ecf.diff

LOG: [AMDGPU] Pre-sink IR input for some tests

Edit the IR input for some codegen tests to simulate what the IR code
sinking pass would do to it. This makes the tests immune to the presence
or absence of the code sinking pass in the codegen pass pipeline, which
does not belong there.

Differential Revision: https://reviews.llvm.org/D130169

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
    llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
    llvm/test/CodeGen/AMDGPU/multilevel-break.ll
    llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
    llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
    llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
    llvm/test/CodeGen/AMDGPU/wqm.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 746f2591db7d..af883e8d6a91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -168,11 +168,11 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = load i32, i32 addrspace(4)* @external_constant
-  %ptr = load float*, float* addrspace(4)* @const.ptr
   %tmp1 = icmp ne i32 %tmp, 0
   br i1 %tmp1, label %bb12, label %bb2
 
 bb2:
+  %ptr = load float*, float* addrspace(4)* @const.ptr
   %tmp4 = load float, float* %ptr, align 4
   %tmp5 = fcmp olt float %tmp4, 1.0
   %tmp6 = or i1 %tmp5, false

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a94719e66cea..908874e073e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1536,7 +1536,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX11_W64-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
   %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
   %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
@@ -1555,6 +1554,7 @@ bb:
 
 exit:
   %cond = phi i1 [false, %entry], [%cmp1, %bb]
+  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
   store float %result, float addrspace(1)* %gep.out, align 4
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3c729182d1b0..fb5d81350808 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -793,7 +793,6 @@ bb:
 
 bb9:                                              ; preds = %bb12, %bb
   %i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
-  %i11 = icmp slt i64 %i10, 0
   br i1 undef, label %bb14, label %bb12
 
 bb12:                                             ; preds = %bb58, %bb9
@@ -801,6 +800,7 @@ bb12:                                             ; preds = %bb58, %bb9
   br label %bb9
 
 bb14:                                             ; preds = %bb9
+  %i11 = icmp slt i64 %i10, 0
   %i15 = load i64, i64 addrspace(1)* null, align 8
   br label %bb16
 
@@ -825,23 +825,23 @@ bb16:                                             ; preds = %bb58, %bb14
   %i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
   %i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
   %i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
+  %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
+  %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
+  fence syncscope("workgroup") acquire
+  br i1 %i11, label %bb58, label %bb51
+
+bb51:                                             ; preds = %bb16
   %i37 = fpext <2 x half> %arg4 to <2 x float>
   %i39 = fpext <2 x half> %i27 to <2 x float>
   %i40 = fpext <2 x half> %i30 to <2 x float>
   %i41 = fpext <2 x half> %i33 to <2 x float>
   %i42 = fpext <2 x half> %i36 to <2 x float>
-  %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
   %i44 = fadd contract <2 x float> %i37, %i43
   %i45 = fadd contract <2 x float> %i43, zeroinitializer
-  %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
   %i47 = fadd contract <2 x float> %i39, %i46
   %i48 = fadd contract <2 x float> %i40, %i43
   %i49 = fadd contract <2 x float> %i41, zeroinitializer
   %i50 = fadd contract <2 x float> %i42, zeroinitializer
-  fence syncscope("workgroup") acquire
-  br i1 %i11, label %bb58, label %bb51
-
-bb51:                                             ; preds = %bb16
   %i52 = fadd contract <2 x float> %i18, %i44
   %i53 = fadd contract <2 x float> %i19, %i45
   %i54 = fadd contract <2 x float> %i20, %i47

diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index a4e2af802b73..ecc951a9cacd 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -508,11 +508,11 @@ define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i3
 bb:
   %tmp = icmp slt i32 %arg2, 9
   %tmp6 = icmp eq i32 %arg1, 0
-  %tmp7 = icmp sgt i32 %arg4, 0
   %tmp8 = icmp sgt i32 %arg4, 5
   br i1 %tmp8, label %bb9, label %bb13
 
 bb9:                                              ; preds = %bb
+  %tmp7 = icmp sgt i32 %arg4, 0
   %tmp10 = and i1 %tmp7, %tmp
   %tmp11 = icmp slt i32 %arg3, %arg4
   %tmp12 = or i1 %tmp11, %tmp7

diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index e46cf5c5f910..26966ff49372 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -8,19 +8,19 @@
 define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
+; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
-; OPT-NEXT:    [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2)
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7
+; OPT-NEXT:    [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2)
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
 ; OPT-NEXT:    store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT:    br label [[DONE:%.*]]
+; OPT:       done:
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
@@ -43,18 +43,18 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
 ; GCN-NEXT:    global_store_dword v1, v0, s[0:1] offset:252
 ; GCN-NEXT:    s_endpgm
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %cmp = icmp eq i32 %tid, 0
   br i1 %cmp, label %endif, label %if
 
 if:
+  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
   %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2)
   br label %endif
 
 endif:
   %x = phi i32 [ %val, %if ], [ 0, %entry ]
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
   store i32 %x, i32 addrspace(1)* %out.gep
   br label %done
 

diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 8c496d552d71..66e4d59fe3d2 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -7,20 +7,20 @@
 define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 ; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
+; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
-; OPT-NEXT:    [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7
+; OPT-NEXT:    [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00)
 ; OPT-NEXT:    [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
 ; OPT-NEXT:    store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT:    br label [[DONE:%.*]]
+; OPT:       done:
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32:
@@ -45,19 +45,19 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
 ; GCN-NEXT:    global_store_dword v1, v0, s[0:1] offset:2300
 ; GCN-NEXT:    s_endpgm
 entry:
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
-  %in.gep = getelementptr float, float addrspace(1)* %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %cmp = icmp eq i32 %tid, 0
   br i1 %cmp, label %endif, label %if
 
 if:
+  %in.gep = getelementptr float, float addrspace(1)* %in, i32 7
   %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
   %val = load volatile float, float addrspace(1)* undef
   br label %endif
 
 endif:
   %x = phi float [ %val, %if ], [ 0.0, %entry ]
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
   store float %x, float addrspace(1)* %out.gep
   br label %done
 

diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c9d23d76fdfd..92682a4b0117 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -11,13 +11,13 @@
 
 define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 entry:
-  %rem.lhs.trunc = trunc i32 %s to i16
-  %rem4 = urem i16 %rem.lhs.trunc, 12
-  %rem.zext = zext i16 %rem4 to i32
   %cmp = icmp eq i32 %s, 3
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
+  %rem.lhs.trunc = trunc i32 %s to i16
+  %rem4 = urem i16 %rem.lhs.trunc, 12
+  %rem.zext = zext i16 %rem4 to i32
   %idxprom = zext i32 %s to i64
   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom
   %div = lshr i32 %rem.zext, 3

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
index 1244ab28c14f..c04ab319cc8c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -254,12 +254,6 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ; return to shader part epilog
 main_body:
-  %c.bc = bitcast i32 %c to float
-  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
-  %tex0 = extractelement <4 x float> %tex, i32 0
-  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
-  %data.sample = extractelement <4 x float> %dtex, i32 0
-
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ELSE
 
@@ -271,6 +265,12 @@ IF:
   br label %END
 
 ELSE:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %data.sample = extractelement <4 x float> %dtex, i32 0
+
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
   br label %END
 

diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 6f8ebbcc73d1..cfe2357f0fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -10,36 +10,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; OPT-NEXT:  main_body:
 ; OPT-NEXT:    br label [[LOOP_OUTER:%.*]]
 ; OPT:       LOOP.outer:
-; OPT-NEXT:    [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
+; OPT-NEXT:    [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
 ; OPT-NEXT:    [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
 ; OPT-NEXT:    br label [[LOOP:%.*]]
 ; OPT:       LOOP:
-; OPT-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
+; OPT-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
 ; OPT-NEXT:    [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
-; OPT-NEXT:    [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ]
-; OPT-NEXT:    [[TMP47]] = add i32 [[TMP45]], 1
+; OPT-NEXT:    [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ]
 ; OPT-NEXT:    [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
 ; OPT-NEXT:    [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
 ; OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
 ; OPT-NEXT:    [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
 ; OPT-NEXT:    br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
 ; OPT:       Flow:
-; OPT-NEXT:    [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
-; OPT-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT:    [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
+; OPT-NEXT:    [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ]
+; OPT-NEXT:    [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT:    [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
 ; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
-; OPT-NEXT:    [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]])
-; OPT-NEXT:    [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]])
-; OPT-NEXT:    [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]])
-; OPT-NEXT:    br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]]
+; OPT-NEXT:    [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]])
+; OPT-NEXT:    [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
+; OPT-NEXT:    [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]])
+; OPT-NEXT:    br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]]
 ; OPT:       Flow1:
-; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
-; OPT-NEXT:    [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]])
-; OPT-NEXT:    br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]]
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
+; OPT-NEXT:    [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]])
+; OPT-NEXT:    br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]]
 ; OPT:       IF:
-; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]])
+; OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
 ; OPT-NEXT:    ret void
 ; OPT:       ENDIF:
+; OPT-NEXT:    [[TMP47]] = add i32 [[TMP45]], 1
 ; OPT-NEXT:    [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]]
 ; OPT-NEXT:    [[TMP51_INV]] = xor i1 [[TMP51]], true
 ; OPT-NEXT:    br label [[FLOW]]
@@ -98,7 +99,6 @@ LOOP.outer:                                       ; preds = %ENDIF, %main_body
 
 LOOP:                                             ; preds = %ENDIF, %LOOP.outer
   %tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
-  %tmp47 = add i32 %tmp45, 1
   %tmp48 = icmp slt i32 %tmp45, %ub
   br i1 %tmp48, label %ENDIF, label %IF
 
@@ -106,6 +106,7 @@ IF:                                               ; preds = %LOOP
   ret void
 
 ENDIF:                                            ; preds = %LOOP
+  %tmp47 = add i32 %tmp45, 1
   %tmp51 = icmp eq i32 %tmp47, %cont
   br i1 %tmp51, label %LOOP, label %LOOP.outer
 }

diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index f3089e79ad37..640f240f143f 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -190,16 +190,16 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; GCN-NEXT:    s_endpgm
 ; IR-LABEL: @nested_loop_conditions(
 ; IR-NEXT:  bb:
+; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
+; IR-NEXT:    [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
+; IR-NEXT:    br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
+; IR:       bb14.lr.ph:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
 ; IR-NEXT:    [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
 ; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
 ; IR-NEXT:    [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
 ; IR-NEXT:    [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; IR-NEXT:    [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
-; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
-; IR-NEXT:    [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
-; IR-NEXT:    br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
-; IR:       bb14.lr.ph:
 ; IR-NEXT:    br label [[BB14:%.*]]
 ; IR:       Flow3:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
@@ -277,17 +277,17 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    store volatile i32 0, i32 addrspace(1)* undef
 ; IR-NEXT:    ret void
 bb:
+  %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
+  %my.tmp1235 = icmp slt i32 %my.tmp1134, 9
+  br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13
+
+bb14.lr.ph:                                       ; preds = %bb
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %my.tmp1 = zext i32 %my.tmp to i64
   %my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1
   %my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16
   %my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
   %my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0
-  %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
-  %my.tmp1235 = icmp slt i32 %my.tmp1134, 9
-  br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13
-
-bb14.lr.ph:                                       ; preds = %bb
   br label %bb14
 
 bb4.bb13_crit_edge:                               ; preds = %bb21

diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index 26d0c050c884..c0d276f5d88c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -21,7 +21,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    s_or_b32 s1, s0, s1
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_4
-; GFX10-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:  .LBB0_2: ; %bb
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_or_b32 s2, s2, exec_lo
 ; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_1
@@ -50,20 +51,20 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
 ; GFX10-NEXT:    s_inst_prefetch 0x2
 ; GFX10-NEXT:    s_endpgm
 branch1_true:
-  br label %2
+  br label %bb
 
-2:                                                ; preds = %branch2_merge, %branch1_true
+bb:                                               ; preds = %branch2_merge, %branch1_true
   %r1.8.vec.insert14.i1 = phi float [ 0.000000e+00, %branch1_true ], [ %0, %branch2_merge ]
-  %3 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
-  %4 = icmp eq i32 %1, 0
-  br i1 %4, label %loop0_merge, label %branch2_merge
+  %i = icmp eq i32 %1, 0
+  br i1 %i, label %loop0_merge, label %branch2_merge
 
-branch2_merge:                                    ; preds = %2
-  %5 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %3, float %0, float 0.000000e+00)
-  %6 = fcmp ult float %5, 0.000000e+00
-  br i1 %6, label %2, label %loop0_merge
+branch2_merge:                                    ; preds = %bb
+  %i2 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
+  %i3 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %i2, float %0, float 0.000000e+00)
+  %i4 = fcmp ult float %i3, 0.000000e+00
+  br i1 %i4, label %bb, label %loop0_merge
 
-loop0_merge:                                      ; preds = %branch2_merge, %2
+loop0_merge:                                      ; preds = %branch2_merge, %bb
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index a0deb4572e0d..5514b91beea9 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -36,7 +36,6 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output,
 .entry:
   %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0
   %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0
-  %i530 = icmp ult i32 %LocalInvocationId.i0, 4
   br i1 %.not10002, label %.merge, label %.bb0
 
 .bb0:
@@ -44,6 +43,7 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output,
 
 .merge:
   %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ]
+  %i530 = icmp ult i32 %LocalInvocationId.i0, 4
   br i1 %i530, label %.end, label %.then
 
 .then:
@@ -103,7 +103,6 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
 .entry:
   %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0
   %.not10002 = icmp eq i32 %LocalInvocationId.i0, 0
-  %i530 = icmp ult i32 %LocalInvocationId.i0, 4
   br i1 %.not10002, label %.merge, label %.bb0
 
 .bb0:
@@ -111,6 +110,7 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
 
 .merge:
   %src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ]
+  %i530 = icmp ult i32 %LocalInvocationId.i0, 4
   br i1 %i530, label %.then, label %.else
 
 .then:

diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a9c96d3fbb66..ada6c1da04e2 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1374,10 +1374,10 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
 ; GFX11-NEXT:    s_endpgm
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
-  %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
   br i1 %tmp, label %bb6, label %bb3
 
 bb3:                                              ; preds = %bb
+  %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
   br i1 %tmp2, label %bb5, label %bb4
 
 bb4:                                              ; preds = %bb3

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 3414e70d36e3..7cb57ab6b0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -260,10 +260,10 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp sgt i32 %cond0, 0
-  %cmp1 = icmp sgt i32 %cond1, 0
   br i1 %cmp0, label %bb2, label %bb9
 
 bb2:                                              ; preds = %bb
+  %cmp1 = icmp sgt i32 %cond1, 0
   %tmp2 = sext i1 %cmp1 to i32
   %tmp3 = add i32 %tmp2, %tmp
   br i1 %cmp1, label %bb9, label %bb7

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index d1e867e526bc..dc85462631d4 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2372,14 +2372,14 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %c.bc = bitcast i32 %c to float
-  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
-  %tex0 = extractelement <4 x float> %tex, i32 0
-  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ENDIF
 
 IF:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %dataf = extractelement <4 x float> %dtex, i32 0
   %data1 = fptosi float %dataf to i32
   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
@@ -2909,14 +2909,14 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %c.bc = bitcast i32 %c to float
-  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
-  %tex0 = extractelement <4 x float> %tex, i32 0
-  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ENDIF
 
 IF:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %dataf = extractelement <4 x float> %dtex, i32 0
   %data1 = fptosi float %dataf to i32
   %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
@@ -2992,14 +2992,14 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %c.bc = bitcast i32 %c to float
-  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
-  %tex0 = extractelement <4 x float> %tex, i32 0
-  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %cmp = icmp eq i32 %z, 0
   br i1 %cmp, label %IF, label %ENDIF
 
 IF:
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
   %dataf = extractelement <4 x float> %dtex, i32 0
   %data1 = fptosi float %dataf to i32
   %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)


        


More information about the llvm-commits mailing list