[llvm] GlobalISel: adapt MachineSSAUpdater for use in GlobalISel path (PR #78431)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 5 05:28:11 PST 2024
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/78431
>From 37966ecfa946e35887600650bf61b6d675488319 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Mon, 5 Feb 2024 14:26:05 +0100
Subject: [PATCH] GlobalISel: adapt MachineSSAUpdater for use in GlobalISel
path
GlobalISel works with registers that can have a register class, a
register bank and an LLT as attributes.
When initializing MachineSSAUpdater, save all attributes of the register
and create new registers with the same attributes instead of using only
the register class.
patch 4 from: https://github.com/llvm/llvm-project/pull/73337
---
llvm/include/llvm/CodeGen/MachineSSAUpdater.h | 6 +-
llvm/lib/CodeGen/MachineSSAUpdater.cpp | 44 ++--
...-divergent-i1-phis-no-lane-mask-merging.ll | 99 +++++----
...divergent-i1-phis-no-lane-mask-merging.mir | 16 +-
...vergence-divergent-i1-used-outside-loop.ll | 198 ++++++++++++------
...ergence-divergent-i1-used-outside-loop.mir | 112 +++++-----
.../GlobalISel/divergence-structurizer.ll | 117 ++++++++---
.../GlobalISel/divergence-structurizer.mir | 100 +++++----
.../divergence-temporal-divergent-i1.ll | 80 ++++---
.../divergence-temporal-divergent-i1.mir | 24 +--
.../GlobalISel/divergent-control-flow.ll | 43 ++--
11 files changed, 480 insertions(+), 359 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
index bbd09d7d151ba..3305e90f696d4 100644
--- a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -13,6 +13,7 @@
#ifndef LLVM_CODEGEN_MACHINESSAUPDATER_H
#define LLVM_CODEGEN_MACHINESSAUPDATER_H
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
namespace llvm {
@@ -40,8 +41,8 @@ class MachineSSAUpdater {
//typedef DenseMap<MachineBasicBlock*, Register> AvailableValsTy;
void *AV = nullptr;
- /// VRC - Register class of the current virtual register.
- const TargetRegisterClass *VRC = nullptr;
+ /// Register class or bank and LLT of current virtual register.
+ MachineRegisterInfo::VRegAttrs RegAttrs;
/// InsertedPHIs - If this is non-null, the MachineSSAUpdater adds all PHI
/// nodes that it creates to the vector.
@@ -62,7 +63,6 @@ class MachineSSAUpdater {
/// Initialize - Reset this object to get ready for a new set of SSA
/// updates.
void Initialize(Register V);
- void Initialize(const TargetRegisterClass *RC);
/// AddAvailableValue - Indicate that a rewritten value is available at the
/// end of the specified block with the specified value.
diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
index 48076663ddf53..8d6ea9c488da5 100644
--- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -51,17 +51,13 @@ MachineSSAUpdater::~MachineSSAUpdater() {
/// Initialize - Reset this object to get ready for a new set of SSA
/// updates.
-void MachineSSAUpdater::Initialize(const TargetRegisterClass *RC) {
+void MachineSSAUpdater::Initialize(Register V) {
if (!AV)
AV = new AvailableValsTy();
else
getAvailableVals(AV).clear();
- VRC = RC;
-}
-
-void MachineSSAUpdater::Initialize(Register V) {
- Initialize(MRI->getRegClass(V));
+ RegAttrs = MRI->getVRegAttrs(V);
}
/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
@@ -115,13 +111,12 @@ Register LookForIdenticalPHI(MachineBasicBlock *BB,
/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define
/// a value of the given register class at the start of the specified basic
/// block. It returns the virtual register defined by the instruction.
-static
-MachineInstrBuilder InsertNewDef(unsigned Opcode,
- MachineBasicBlock *BB, MachineBasicBlock::iterator I,
- const TargetRegisterClass *RC,
- MachineRegisterInfo *MRI,
- const TargetInstrInfo *TII) {
- Register NewVR = MRI->createVirtualRegister(RC);
+static MachineInstrBuilder InsertNewDef(unsigned Opcode, MachineBasicBlock *BB,
+ MachineBasicBlock::iterator I,
+ MachineRegisterInfo::VRegAttrs RegAttrs,
+ MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII) {
+ Register NewVR = MRI->createVirtualRegister(RegAttrs);
return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
}
@@ -158,9 +153,9 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB,
if (ExistingValueOnly)
return Register();
// Insert an implicit_def to represent an undef value.
- MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
- BB, BB->getFirstTerminator(),
- VRC, MRI, TII);
+ MachineInstr *NewDef =
+ InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstTerminator(),
+ RegAttrs, MRI, TII);
return NewDef->getOperand(0).getReg();
}
@@ -197,8 +192,8 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB,
// Otherwise, we do need a PHI: insert one now.
MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
- MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
- Loc, VRC, MRI, TII);
+ MachineInstrBuilder InsertedPHI =
+ InsertNewDef(TargetOpcode::PHI, BB, Loc, RegAttrs, MRI, TII);
// Fill in all the predecessors of the PHI.
for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
@@ -300,10 +295,9 @@ class SSAUpdaterTraits<MachineSSAUpdater> {
static Register GetUndefVal(MachineBasicBlock *BB,
MachineSSAUpdater *Updater) {
// Insert an implicit_def to represent an undef value.
- MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
- BB, BB->getFirstNonPHI(),
- Updater->VRC, Updater->MRI,
- Updater->TII);
+ MachineInstr *NewDef =
+ InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstNonPHI(),
+ Updater->RegAttrs, Updater->MRI, Updater->TII);
return NewDef->getOperand(0).getReg();
}
@@ -312,9 +306,9 @@ class SSAUpdaterTraits<MachineSSAUpdater> {
static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
MachineSSAUpdater *Updater) {
MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
- MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
- Updater->VRC, Updater->MRI,
- Updater->TII);
+ MachineInstr *PHI =
+ InsertNewDef(TargetOpcode::PHI, BB, Loc, Updater->RegAttrs,
+ Updater->MRI, Updater->TII);
return PHI->getOperand(0).getReg();
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 06a8f80e6aa34..0f70c1996d6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
; Divergent phis that don't require lowering using lane mask merging
@@ -66,15 +65,16 @@ exit:
define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond) {
; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
; GFX10: ; %bb.0: ; %A
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, 6, v2
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB1_2
-; GFX10-NEXT: ; %bb.1:
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2
-; GFX10-NEXT: s_branch .LBB1_3
-; GFX10-NEXT: .LBB1_2: ; %B
-; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
-; GFX10-NEXT: .LBB1_3: ; %exit
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
+; GFX10-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX10-NEXT: ; %bb.1: ; %B
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: s_andn2_b32 s0, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s1, s0, s1
+; GFX10-NEXT: .LBB1_2: ; %exit
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1
; GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v2
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
@@ -101,23 +101,27 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 1
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v4, s5
+; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v3
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s6, s6, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -144,44 +148,49 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_mov_b32_e32 v8, 0x3e8
-; GFX10-NEXT: v_mov_b32_e32 v9, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
+; GFX10-NEXT: v_mov_b32_e32 v8, s5
+; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT: s_branch .LBB3_2
; GFX10-NEXT: .LBB3_1: ; %loop_body
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v10, v9
-; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8
+; GFX10-NEXT: s_xor_b32 s4, s4, -1
+; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s6, s6, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9
-; GFX10-NEXT: s_mov_b32 s5, 1
+; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
+; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_store_dword v[6:7], v8
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: flat_store_dword v[6:7], v1
; GFX10-NEXT: .LBB3_4: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: s_xor_b32 s5, s5, 1
-; GFX10-NEXT: s_and_b32 s5, s5, 1
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
+; GFX10-NEXT: s_xor_b32 s7, s7, 1
+; GFX10-NEXT: s_and_b32 s7, s7, 1
+; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10-NEXT: ; %bb.5: ; %if
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: flat_store_dword v[4:5], v8
+; GFX10-NEXT: flat_store_dword v[4:5], v1
; GFX10-NEXT: s_branch .LBB3_1
; GFX10-NEXT: .LBB3_6: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -221,8 +230,8 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_mov_b32 s12, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
+; GFX10-NEXT: s_mov_b32 s12, 0
; GFX10-NEXT: s_mov_b32 s13, -1
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13]
@@ -230,7 +239,6 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s2, 1
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
@@ -257,13 +265,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: s_mov_b32 s13, 0
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: .LBB4_4: ; %Flow
-; GFX10-NEXT: s_and_b32 s2, s2, 1
-; GFX10-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB4_6
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13
+; GFX10-NEXT: s_cbranch_vccz .LBB4_6
; GFX10-NEXT: ; %bb.5: ; %.19
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_or_b32_e32 v3, 2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index 55f22b0bbb4df..5549c89dc402f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -1,10 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
-
--- |
define void @divergent_i1_phi_uniform_branch() {ret void}
define void @divergent_i1_phi_uniform_branch_simple() {ret void}
@@ -147,8 +143,8 @@ body: |
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
@@ -206,11 +202,11 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -301,11 +297,11 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 808c2c2ded201..e9df20f9688e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
; This file contains various tests that have divergent i1s used outside of
; the loop. These are lane masks is sgpr and need to have correct value in
@@ -17,22 +16,29 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s6, s5, s6
+; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1
+; GFX10-NEXT: s_xor_b32 s7, s6, -1
; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
-; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 s8, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s7, exec_lo, s7
+; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s7, s8, s7
+; GFX10-NEXT: s_or_b32 s5, s5, s6
+; GFX10-NEXT: s_mov_b32 s6, s7
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -60,8 +66,11 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: v_mov_b32_e32 v5, 1
+; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT: s_or_b32 s4, s5, s4
; GFX10-NEXT: s_branch .LBB1_2
; GFX10-NEXT: .LBB1_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
@@ -70,20 +79,26 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo
+; GFX10-NEXT: s_and_b32 s8, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s4, s7, s8
; GFX10-NEXT: s_cbranch_vccz .LBB1_4
; GFX10-NEXT: .LBB1_2: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v5
-; GFX10-NEXT: s_mov_b32 s6, s5
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s6, s4, s6
; GFX10-NEXT: s_and_saveexec_b32 s4, s5
; GFX10-NEXT: s_cbranch_execz .LBB1_1
; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX10-NEXT: global_load_dword v5, v[1:2], off
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s6, s6, s7
; GFX10-NEXT: s_branch .LBB1_1
; GFX10-NEXT: .LBB1_4: ; %exit
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5
@@ -123,23 +138,28 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
-; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v5, v0
+; GFX10-NEXT: s_xor_b32 s7, vcc_lo, -1
+; GFX10-NEXT: s_or_b32 s5, s4, s5
+; GFX10-NEXT: v_mov_b32_e32 v4, s7
+; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s7
+; GFX10-NEXT: s_or_b32 s6, s4, s6
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -177,44 +197,59 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s6, 1
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: ; implicit-def: $sgpr7
+; GFX10-NEXT: ; implicit-def: $sgpr8
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_xor_b32 s7, s7, 1
-; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
-; GFX10-NEXT: s_or_b32 s5, s6, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_xor_b32 s9, s8, -1
+; GFX10-NEXT: s_and_b32 s10, exec_lo, s7
+; GFX10-NEXT: s_or_b32 s5, s10, s5
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s9, exec_lo, s9
+; GFX10-NEXT: s_or_b32 s6, s6, s9
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB3_5
; GFX10-NEXT: .LBB3_3: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
+; GFX10-NEXT: s_and_b32 s9, exec_lo, -1
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT: s_or_b32 s8, s8, s9
; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT: s_or_b32 s7, s7, s9
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_2
; GFX10-NEXT: ; %bb.4: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
-; GFX10-NEXT: v_cmp_lt_i32_e64 s6, v5, v0
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
+; GFX10-NEXT: s_and_b32 s10, exec_lo, 0
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s8, s8, s10
+; GFX10-NEXT: s_or_b32 s7, s7, s11
; GFX10-NEXT: s_branch .LBB3_2
; GFX10-NEXT: .LBB3_5: ; %loop.exit.guard
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_and_b32 s5, 1, s7
-; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s5
+; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s6, s5, s6
; GFX10-NEXT: .LBB3_6: ; %Flow1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
@@ -266,21 +301,24 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: s_branch .LBB4_2
; GFX10-NEXT: .LBB4_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; GFX10-NEXT: s_and_b32 s4, 1, s6
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s7
; GFX10-NEXT: s_or_b32 s5, s4, s5
+; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s6, s4, s6
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB4_6
; GFX10-NEXT: .LBB4_2: ; %cond.block.0
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v4, v5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
-; GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_4
; GFX10-NEXT: ; %bb.3: ; %if.block.0
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
@@ -292,20 +330,22 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-NEXT: .LBB4_4: ; %loop.break.block
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
-; GFX10-NEXT: s_mov_b32 s6, 1
+; GFX10-NEXT: s_mov_b32 s7, 1
; GFX10-NEXT: ; implicit-def: $vgpr5
-; GFX10-NEXT: s_and_saveexec_b32 s7, s4
+; GFX10-NEXT: s_and_saveexec_b32 s8, s4
; GFX10-NEXT: s_cbranch_execz .LBB4_1
; GFX10-NEXT: ; %bb.5: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
-; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
+; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
+; GFX10-NEXT: s_or_b32 s7, s4, s7
; GFX10-NEXT: s_branch .LBB4_1
; GFX10-NEXT: .LBB4_6: ; %cond.block.1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s4, s6
; GFX10-NEXT: s_cbranch_execz .LBB4_8
; GFX10-NEXT: ; %bb.7: ; %if.block.1
; GFX10-NEXT: global_store_dword v[6:7], v4, off
@@ -370,34 +410,44 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
-; GFX10-NEXT: v_mov_b32_e32 v6, 1
+; GFX10-NEXT: s_mov_b32 s3, 1
; GFX10-NEXT: v_mov_b32_e32 v5, s0
+; GFX10-NEXT: ; implicit-def: $sgpr1
+; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1
-; GFX10-NEXT: v_mov_b32_e32 v5, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5
; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s4
+; GFX10-NEXT: s_or_b32 s1, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB5_4
; GFX10-NEXT: .LBB5_2: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6
-; GFX10-NEXT: s_and_saveexec_b32 s2, s1
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT: s_or_b32 s2, s2, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-NEXT: s_cbranch_execz .LBB5_1
; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: ; implicit-def: $sgpr3
; GFX10-NEXT: s_branch .LBB5_1
; GFX10-NEXT: .LBB5_4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -436,40 +486,52 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: ; implicit-def: $sgpr1
+; GFX10-NEXT: ; implicit-def: $sgpr2
+; GFX10-NEXT: ; implicit-def: $sgpr3
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_branch .LBB6_2
; GFX10-NEXT: .LBB6_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, 1, s3
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s4, s0
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT: s_or_b32 s1, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB6_4
; GFX10-NEXT: .LBB6_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_mov_b32 s3, 1
+; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s4
; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s4
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT: global_load_dword v9, v[9:10], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB6_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT: global_load_dword v9, v[7:8], off
+; GFX10-NEXT: s_and_b32 s5, exec_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: s_or_b32 s2, s2, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT: global_store_dword v[7:8], v9, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index d84ccece81c78..ace9bec6e1c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -1,10 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
-
---
name: divergent_i1_phi_used_outside_loop
legalized: true
@@ -24,8 +20,8 @@ body: |
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]]
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
@@ -34,14 +30,14 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]]
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -127,8 +123,8 @@ body: |
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]]
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
@@ -137,14 +133,14 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
@@ -166,8 +162,8 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
@@ -264,11 +260,11 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -369,8 +365,8 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.5
@@ -378,14 +374,14 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
@@ -437,11 +433,11 @@ body: |
; GFX10-NEXT: bb.7:
; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
@@ -578,10 +574,10 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -624,9 +620,9 @@ body: |
; GFX10-NEXT: bb.6:
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
@@ -761,14 +757,14 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
@@ -796,10 +792,10 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
@@ -912,14 +908,14 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
@@ -979,11 +975,11 @@ body: |
; GFX10-NEXT: bb.5:
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index fb1253c7a17d9..609fff51863a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
; Simples case, if - then, that requires lane mask merging,
; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
@@ -12,7 +11,10 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid,
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: ; %bb.1: ; %B
-; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
@@ -41,16 +43,23 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid,
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s0
+; GFX10-NEXT: ; implicit-def: $sgpr0
; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-NEXT: ; %bb.1: ; %B
-; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT: s_andn2_b32 s0, s2, exec_lo
; GFX10-NEXT: ; implicit-def: $vgpr2
+; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1
; GFX10-NEXT: ; %bb.3: ; %A
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: ; %bb.4: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0
@@ -98,36 +107,42 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: ; implicit-def: $sgpr1
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: s_branch .LBB2_2
; GFX10-NEXT: .LBB2_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT: s_or_b32 s0, s2, s0
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB2_4
; GFX10-NEXT: .LBB2_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5]
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
; GFX10-NEXT: global_load_dword v7, v[7:8], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB2_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4
-; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v4
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v4
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT: global_load_dword v7, v[5:6], off
; GFX10-NEXT: v_mov_b32_e32 v4, v8
+; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s1, s1, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7
; GFX10-NEXT: global_store_dword v[5:6], v7, off
@@ -161,35 +176,42 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; GFX10-LABEL: loop_with_2breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: ; implicit-def: $sgpr1
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_1: ; %Flow3
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s3, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s1, s1, s3
; GFX10-NEXT: .LBB3_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT: s_or_b32 s0, s2, s0
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_3: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT: global_load_dword v9, v[9:10], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_2
; GFX10-NEXT: ; %bb.4: ; %B
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: global_load_dword v9, v[9:10], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
@@ -200,9 +222,12 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
; GFX10-NEXT: global_load_dword v9, v[7:8], off
; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_and_b32 s5, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s4, s4, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT: global_store_dword v[7:8], v9, off
@@ -242,38 +267,48 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; GFX10-LABEL: loop_with_3breaks:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: ; implicit-def: $sgpr1
; GFX10-NEXT: v_mov_b32_e32 v8, s0
; GFX10-NEXT: s_branch .LBB4_4
; GFX10-NEXT: .LBB4_1: ; %Flow5
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s4, s4, s5
; GFX10-NEXT: .LBB4_2: ; %Flow4
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s3, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s1, s1, s3
; GFX10-NEXT: .LBB4_3: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT: s_or_b32 s0, s2, s0
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB4_8
; GFX10-NEXT: .LBB4_4: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT: s_or_b32 s1, s1, s2
; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9]
; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9
; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
; GFX10-NEXT: global_load_dword v11, v[11:12], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_3
; GFX10-NEXT: ; %bb.5: ; %B
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9
; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: global_load_dword v11, v[11:12], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -283,6 +318,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9
; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT: s_mov_b32 s5, 1
; GFX10-NEXT: global_load_dword v11, v[11:12], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -293,9 +329,12 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8
-; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v8
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v8
+; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
; GFX10-NEXT: global_load_dword v11, v[9:10], off
; GFX10-NEXT: v_mov_b32_e32 v8, v12
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s5, s5, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11
; GFX10-NEXT: global_store_dword v[9:10], v11, off
@@ -345,40 +384,52 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
; GFX10-LABEL: loop_with_div_break_with_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: ; implicit-def: $sgpr1
+; GFX10-NEXT: ; implicit-def: $sgpr2
+; GFX10-NEXT: ; implicit-def: $sgpr3
; GFX10-NEXT: v_mov_b32_e32 v6, s0
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, 1, s3
-; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT: s_or_b32 s0, s4, s0
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT: s_or_b32 s1, s1, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB5_4
; GFX10-NEXT: .LBB5_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: s_mov_b32 s3, 1
+; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s4
; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT: s_or_b32 s2, s2, s4
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
; GFX10-NEXT: global_load_dword v9, v[9:10], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB5_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
; GFX10-NEXT: global_load_dword v9, v[7:8], off
+; GFX10-NEXT: s_and_b32 s5, exec_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v6, v10
+; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: s_or_b32 s2, s2, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
; GFX10-NEXT: global_store_dword v[7:8], v9, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 0e6588b4593c9..df5505e1b28bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -1,10 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
-
---
name: divergent_i1_phi_if_then
legalized: true
@@ -39,8 +35,8 @@ body: |
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -104,8 +100,8 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.2
@@ -133,8 +129,8 @@ body: |
; GFX10-NEXT: G_BR %bb.1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
@@ -210,10 +206,10 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.3
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -251,9 +247,9 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -348,10 +344,10 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -385,9 +381,9 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -414,9 +410,9 @@ body: |
; GFX10-NEXT: bb.5:
; GFX10-NEXT: successors: %bb.3(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
@@ -540,10 +536,10 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %61(s1), %bb.3
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -577,9 +573,9 @@ body: |
; GFX10-NEXT: bb.3:
; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -602,9 +598,9 @@ body: |
; GFX10-NEXT: bb.5:
; GFX10-NEXT: successors: %bb.3(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
@@ -633,9 +629,9 @@ body: |
; GFX10-NEXT: bb.7:
; GFX10-NEXT: successors: %bb.5(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
@@ -782,14 +778,14 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
@@ -849,11 +845,11 @@ body: |
; GFX10-NEXT: bb.5:
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
@@ -986,12 +982,12 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %70(s1), %bb.7
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
@@ -1061,16 +1057,16 @@ body: |
; GFX10-NEXT: bb.7:
; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4
- ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4
; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
- ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
- ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]]
- ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]]
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 431da1de1fd48..312c6a3822ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -1,29 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
; GFX10-LABEL: temporal_divergent_i1_phi:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, 1
+; GFX10-NEXT: v_mov_b32_e32 v4, s5
+; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GFX10-NEXT: v_mov_b32_e32 v5, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
-; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v3
+; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s6, s6, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v5
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -49,23 +51,27 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
; GFX10-LABEL: temporal_divergent_i1_non_phi:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 1
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: v_mov_b32_e32 v4, s5
+; GFX10-NEXT: ; implicit-def: $sgpr6
; GFX10-NEXT: .LBB1_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT: v_and_b32_e32 v6, 1, v3
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s6, s6, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -96,30 +102,41 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s0
; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: ; implicit-def: $sgpr0
+; GFX10-NEXT: ; implicit-def: $sgpr1
; GFX10-NEXT: s_branch .LBB2_3
; GFX10-NEXT: .LBB2_1: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1
; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v5
-; GFX10-NEXT: v_cmp_lt_u32_e64 s0, v5, v2
-; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v5, v2
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX10-NEXT: global_load_dword v8, v[6:7], off
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v9
+; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s1, s1, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
; GFX10-NEXT: global_store_dword v[6:7], v8, off
; GFX10-NEXT: .LBB2_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1
-; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
-; GFX10-NEXT: s_or_b32 s4, s0, s4
-; GFX10-NEXT: s_and_b32 s0, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_and_b32 s5, 1, s5
+; GFX10-NEXT: s_and_b32 s6, exec_lo, s1
+; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5
+; GFX10-NEXT: s_or_b32 s4, s6, s4
+; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s0, s0, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB2_5
; GFX10-NEXT: .LBB2_3: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, -1
+; GFX10-NEXT: s_or_b32 s1, s1, s5
; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v3, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
@@ -128,8 +145,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: s_cbranch_vccnz .LBB2_1
; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_3 Depth=1
-; GFX10-NEXT: s_mov_b32 s0, -1
-; GFX10-NEXT: s_mov_b32 s1, 1
+; GFX10-NEXT: s_mov_b32 s5, 1
; GFX10-NEXT: ; implicit-def: $vgpr5
; GFX10-NEXT: s_branch .LBB2_2
; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index 64d740287d9fc..abb491f938e54 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -1,10 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
-
---
name: temporal_divergent_i1_phi
legalized: true
@@ -26,12 +22,12 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
@@ -113,11 +109,11 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -207,12 +203,12 @@ body: |
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32)
@@ -262,11 +258,11 @@ body: |
; GFX10-NEXT: bb.5:
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 6384c47398fce..c25b0f2128266 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; REQUIRES: do-not-run-me
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
; Make sure the branch targets are correct after lowering llvm.amdgcn.if
@@ -203,26 +202,34 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
; CHECK-NEXT: ; implicit-def: $vgpr1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_subrev_u32_e32 v0, s2, v0
-; CHECK-NEXT: s_branch .LBB5_2
-; CHECK-NEXT: .LBB5_1: ; %Flow
-; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
-; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT: s_branch .LBB5_3
+; CHECK-NEXT: .LBB5_1: ; %bb4
+; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT: global_load_dword v2, v[0:1], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2
+; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; CHECK-NEXT: .LBB5_2: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3]
+; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; CHECK-NEXT: s_cbranch_execz .LBB5_4
-; CHECK-NEXT: .LBB5_2: ; %bb1
+; CHECK-NEXT: s_cbranch_execz .LBB5_5
+; CHECK-NEXT: .LBB5_3: ; %bb1
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT: s_and_b64 s[4:5], exec, -1
; CHECK-NEXT: v_cmp_le_i32_e32 vcc, 0, v1
-; CHECK-NEXT: s_mov_b64 s[2:3], -1
-; CHECK-NEXT: s_cbranch_vccnz .LBB5_1
-; CHECK-NEXT: ; %bb.3: ; %bb4
-; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: global_load_dword v2, v[0:1], off glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2
-; CHECK-NEXT: s_branch .LBB5_1
-; CHECK-NEXT: .LBB5_4: ; %bb9
+; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; CHECK-NEXT: s_cbranch_vccz .LBB5_1
+; CHECK-NEXT: ; %bb.4: ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: s_branch .LBB5_2
+; CHECK-NEXT: .LBB5_5: ; %bb9
; CHECK-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits mailing list