[llvm] [AMDGPU] Fix VGPR to SGPR copy for inline asm with SGPR constraint (PR #176330)
Vigneshwar Jayakumar via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 22:04:29 PST 2026
https://github.com/VigneshwarJ updated https://github.com/llvm/llvm-project/pull/176330
>From 06f7f705a15ba0c149976e463e4acefd48ddabd2 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Fri, 16 Jan 2026 03:26:01 -0600
Subject: [PATCH 1/5] [AMDGPU] Fix VGPR to SGPR copy for inline asm with SGPR
constraint
SIFixSGPRCopies was incorrectly handling inline assembly operands with
SGPR ("s") constraints when the value came from a memory load (which
produces a VGPR). The pass would fail to insert the necessary
v_readfirstlane instruction and instead passed the VGPR value directly.
example:
asm sideeffect buffer_load_dwordx4 $0, $1, $2, 0 =v,v,s,n
the actual instruction expects an SGPR, but was emitted as:
buffer_load_dwordx4 v[0:3], v0, v[8:11], 0 offen
fix:
buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
tradeoff:
For instructions such as `callbr void asm "", "r,!i"`, the pass now
generates a v_readfirstlane and keeps the value in an SGPR, even though
it could have remained a VGPR when the value originated in a VGPR.
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 15 +-
llvm/test/CodeGen/AMDGPU/callbr.ll | 3 +-
llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 4 +-
.../AMDGPU/inline-asm-vgpr-sgpr-copy.ll | 39 ++++
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 106 +++++----
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 104 +++++----
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 205 +++++++++++-------
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 95 ++++----
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 104 +++++----
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 205 +++++++++++-------
.../si-annotate-nested-control-flows.ll | 1 +
.../si-unify-exit-multiple-unreachables.ll | 8 +-
.../AMDGPU/spill-offset-calculation.ll | 79 ++++---
13 files changed, 619 insertions(+), 349 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 5ae02d025989b..a7da4a49c45cc 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -100,6 +100,7 @@ class V2SCopyInfo {
unsigned NumReadfirstlanes = 0;
// Current score state. To speedup selection V2SCopyInfos for processing
bool NeedToBeConvertedToVALU = false;
+ bool HasMandatorySGPRUse = false;
// Unique ID. Used as a key for mapping to keep permanent order.
unsigned ID;
@@ -1003,8 +1004,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
} else if (Inst->getNumExplicitDefs() != 0) {
Register Reg = Inst->getOperand(0).getReg();
if (Reg.isVirtual() && TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
- for (auto &U : MRI->use_instructions(Reg))
+ for (auto &U : MRI->use_instructions(Reg)) {
+ // Inline assembly operands with SGPR constraints must not be
+ // converted to VALU. Without this check, the SGPR could be converted
+ // to VGPR, violating the inline asm constraint.
+ // This is conservative: we cannot distinguish "s" (strict SGPR) from
+ // "r" (any register) at this stage, as the constraint string is lost
+ // after SelectionDAG. Therefore, we treat all inline asm as mandatory
+ // SGPR use.
+ if (U.isInlineAsm())
+ Info.HasMandatorySGPRUse = true;
Users.push_back(&U);
+ }
}
}
for (auto *U : Users) {
@@ -1019,6 +1030,8 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// The main function that computes the VGPR to SGPR copy score
// and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
+ if (Info->HasMandatorySGPRUse)
+ return false;
if (Info->SChain.empty()) {
Info->Score = 0;
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
index 253a6ec100eae..deb557d0ffc3b 100644
--- a/llvm/test/CodeGen/AMDGPU/callbr.ll
+++ b/llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -6,8 +6,9 @@ define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_load_dword v0, v[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s4, v6
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: v_cmp_gt_i32 vcc s4, 42; s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.1: ; %fallthrough
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index df635925b87df..f6fe5bde73c32 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
@@ -120,6 +120,7 @@ define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: ; %bb.1: ; %loop.preheader
@@ -354,6 +355,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out)
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: ; %bb.1: ; %outer_loop.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
new file mode 100644
index 0000000000000..39d4416e2734d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s
+
+; This is a regression test for a bug where SIFixSGPRCopies would incorrectly
+; keep the value as VGPR when it should be converted to SGPR for inline asm.
+
+define <4 x float> @test_sgpr_constraint_bug(ptr addrspace(5) %buf_desc_ptr) {
+; CHECK-LABEL: test_sgpr_constraint_bug:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: scratch_load_dwordx4 v[4:7], v0, off
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: s_mov_b64 s[4:5], exec
+; CHECK-NEXT: v_mov_b32_e32 v8, 1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v4
+; CHECK-NEXT: v_readfirstlane_b32 s1, v5
+; CHECK-NEXT: v_readfirstlane_b32 s2, v6
+; CHECK-NEXT: v_readfirstlane_b32 s3, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_cmpx_le_u32 exec, 1, v8
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:0
+; CHECK-NEXT: s_mov_b64 exec s[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %rsrc = load <4 x i32>, ptr addrspace(5) %buf_desc_ptr, align 16
+
+ %exec = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+
+ %result = call <4 x float> asm sideeffect
+ "v_cmpx_le_u32 exec, 1, $4\0Abuffer_load_dwordx4 $0, $1, $2, 0 offen offset:$3\0As_mov_b64 exec $5",
+ "=v,v,s,n,v,s,0,~{memory}"
+ (i32 0, <4 x i32> %rsrc, i32 0, i32 1, i64 %exec, <4 x float> zeroinitializer)
+
+ ret <4 x float> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 7fd70de81af6f..3501a3d3bb157 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -522,8 +522,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v0
+; GFX7-NEXT: ; use s4
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -535,9 +536,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -549,9 +551,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-NEXT: s_and_b32 s4, 0xffff, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -561,9 +564,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -573,9 +577,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_max_f16_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -586,9 +591,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -599,9 +605,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -973,10 +980,12 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7-NEXT: s_lshl_b32 s4, s4, 16
+; GFX7-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7-NEXT: s_or_b32 s4, s5, s4
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v0
+; GFX7-NEXT: ; use s4
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -986,17 +995,21 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: s_lshr_b32 s4, s17, 16
; GFX8-NEXT: s_lshr_b32 s5, s16, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_max_f16_e32 v1, s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT: v_max_f16_e32 v0, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00
-; GFX8-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v3, s16, v2
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s17
+; GFX8-NEXT: v_max_f16_e32 v1, s16, v0
+; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_or_b32 s4, s5, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1014,10 +1027,11 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-NEXT: v_readfirstlane_b32 s5, v1
+; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1027,8 +1041,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -1039,14 +1054,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: s_lshr_b32 s5, s16, 16
; GFX10-NEXT: v_pk_max_f16 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v2
+; GFX10-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1061,9 +1077,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1073,17 +1093,18 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, s0, s1
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s3, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1095,9 +1116,12 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v0
+; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 97eafd07d4b37..2bec23c9e86ae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -379,8 +379,9 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v0
+; GFX7-NEXT: ; use s4
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -392,8 +393,9 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -405,8 +407,9 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -415,8 +418,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_maximum3_f32 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -426,8 +431,9 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX10-NEXT: v_max_f32_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -436,10 +442,11 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -727,13 +734,15 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX7-NEXT: v_max_f32_e32 v1, s17, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_max_f32_e32 v3, s16, v0
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_mov_b32_e32 v1, s18
+; GFX7-NEXT: v_max_f32_e32 v3, s16, v1
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:1]
+; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -744,13 +753,15 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX8-NEXT: v_max_f32_e32 v1, s17, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_max_f32_e32 v3, s16, v0
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s18
+; GFX8-NEXT: v_max_f32_e32 v3, s16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
+; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -761,13 +772,15 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX900-NEXT: v_max_f32_e32 v1, s17, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX900-NEXT: v_mov_b32_e32 v0, s18
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX900-NEXT: v_max_f32_e32 v3, s16, v0
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, s18
+; GFX900-NEXT: v_max_f32_e32 v3, s16, v1
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v1
+; GFX900-NEXT: v_readfirstlane_b32 s5, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:1]
+; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -775,40 +788,47 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_maximum3_f32 v1, v0, s3, s3
-; GFX950-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-NEXT: v_maximum3_f32 v0, v0, s2, s2
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: v_maximum3_f32 v0, v0, s3, s3
+; GFX950-NEXT: v_maximum3_f32 v1, v1, s2, s2
+; GFX950-NEXT: v_readfirstlane_b32 s1, v0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:1]
+; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_maximum_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e64 v0, s17, s19
-; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
-; GFX10-NEXT: v_max_f32_e64 v2, s16, s18
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e64 v0, s16, s18
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT: v_max_f32_e64 v1, s17, s19
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:1]
+; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_maximum_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e64 v0, s1, s3
-; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
-; GFX11-NEXT: v_max_f32_e64 v2, s0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s2
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT: v_max_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:1]
+; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 3280d7aa9ddfe..a48fe6a4728f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -418,13 +418,15 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX7-NEXT: v_readfirstlane_b32 s6, v3
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX7-NEXT: s_cselect_b32 s4, 0, s4
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:1]
+; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -433,13 +435,15 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX8-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8-NEXT: v_readfirstlane_b32 s4, v2
+; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX8-NEXT: s_cselect_b32 s4, 0, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
+; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -448,13 +452,15 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s18
; GFX900-NEXT: v_mov_b32_e32 v1, s19
-; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
+; GFX900-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-NEXT: v_readfirstlane_b32 s6, v3
+; GFX900-NEXT: v_readfirstlane_b32 s4, v2
+; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX900-NEXT: s_cselect_b32 s4, 0, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:1]
+; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -462,14 +468,15 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX950-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT: v_max_f64 v[0:1], s[0:1], v[0:1]
+; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s2, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s2
+; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:1]
+; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -478,10 +485,13 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v0
+; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s5
+; GFX10-NEXT: s_cselect_b32 s4, 0, s6
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:1]
+; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -490,11 +500,14 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s1
+; GFX11-NEXT: s_cselect_b32 s0, 0, s2
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:1]
+; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -506,9 +519,13 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v[0:1]
+; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -825,15 +842,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX7-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
+; GFX7-NEXT: v_readfirstlane_b32 s8, v3
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7-NEXT: v_readfirstlane_b32 s6, v2
+; GFX7-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX7-NEXT: s_cselect_b32 s6, 0, s6
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: v_readfirstlane_b32 s8, v5
+; GFX7-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX7-NEXT: s_cselect_b32 s4, 0, s4
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:3]
+; GFX7-NEXT: ; use s[4:7]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -846,15 +868,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s21
-; GFX8-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s8, v3
+; GFX8-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX8-NEXT: s_cselect_b32 s6, 0, s6
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT: v_readfirstlane_b32 s8, v5
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX8-NEXT: s_cselect_b32 s4, 0, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:3]
+; GFX8-NEXT: ; use s[4:7]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -867,15 +894,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v0, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s21
-; GFX900-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX900-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
+; GFX900-NEXT: v_readfirstlane_b32 s8, v3
+; GFX900-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX900-NEXT: v_readfirstlane_b32 s6, v2
+; GFX900-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX900-NEXT: s_cselect_b32 s6, 0, s6
+; GFX900-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX900-NEXT: v_readfirstlane_b32 s8, v5
+; GFX900-NEXT: v_readfirstlane_b32 s4, v4
+; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX900-NEXT: s_cselect_b32 s4, 0, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:3]
+; GFX900-NEXT: ; use s[4:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -883,19 +915,23 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
-; GFX950-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
+; GFX950-NEXT: v_max_f64 v[0:1], s[2:3], v[0:1]
+; GFX950-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GFX950-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1]
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT: s_cselect_b32 s3, 0x7ff80000, s4
+; GFX950-NEXT: s_cselect_b32 s2, 0, s2
+; GFX950-NEXT: v_max_f64 v[0:1], s[0:1], v[0:1]
+; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s4
+; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:3]
+; GFX950-NEXT: ; use s[0:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -904,14 +940,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[22:23]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
-; GFX10-NEXT: v_max_f64 v[4:5], s[16:17], s[20:21]
+; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[20:21]
; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: s_cselect_b32 s7, 0x7ff80000, s6
+; GFX10-NEXT: s_cselect_b32 s6, 0, s8
+; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s4
+; GFX10-NEXT: s_cselect_b32 s4, 0, s9
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:3]
+; GFX10-NEXT: ; use s[4:7]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -919,16 +961,22 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[18:19]
+; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
-; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, s1
+; GFX11-NEXT: s_cselect_b32 s2, 0, s4
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s5
+; GFX11-NEXT: s_cselect_b32 s0, 0, s6
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:3]
+; GFX11-NEXT: ; use s[0:3]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -939,11 +987,18 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[18:19]
-; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[16:17]
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[2:3], s[18:19]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[0:1], s[16:17]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v[0:3]
+; GFX12-NEXT: ; use s[0:3]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index b5dab396f0bf1..f386d257ed678 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -445,9 +445,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -459,9 +460,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-NEXT: s_and_b32 s4, 0xffff, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -471,9 +473,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -483,9 +486,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_min_f16_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -496,9 +500,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: v_min_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -509,9 +514,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -799,17 +805,21 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: s_lshr_b32 s4, s17, 16
; GFX8-NEXT: s_lshr_b32 s5, s16, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_min_f16_e32 v1, s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT: v_min_f16_e32 v0, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00
-; GFX8-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_min_f16_e32 v3, s16, v2
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s17
+; GFX8-NEXT: v_min_f16_e32 v1, s16, v0
+; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT: s_or_b32 s4, s5, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -827,10 +837,11 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
+; GFX900-NEXT: v_readfirstlane_b32 s5, v1
+; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -840,8 +851,9 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -852,14 +864,15 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: s_lshr_b32 s5, s16, 16
; GFX10-NEXT: v_pk_min_f16 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v2
+; GFX10-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -874,9 +887,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -886,17 +903,18 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: v_pk_min_f16 v0, s0, s1
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s3, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -908,9 +926,12 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v0
+; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 3e98599fc4c7f..860e0686a0928 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -379,8 +379,9 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v0
+; GFX7-NEXT: ; use s4
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -392,8 +393,9 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v0
+; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -405,8 +407,9 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v0
+; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -415,8 +418,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_minimum3_f32 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v0
+; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -426,8 +431,9 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX10-NEXT: v_min_f32_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v0
+; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -436,10 +442,11 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f32_e64 v0, s0, s1
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v0
+; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -727,13 +734,15 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX7-NEXT: v_min_f32_e32 v1, s17, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s18
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_min_f32_e32 v3, s16, v0
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_mov_b32_e32 v1, s18
+; GFX7-NEXT: v_min_f32_e32 v3, s16, v1
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:1]
+; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -744,13 +753,15 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX8-NEXT: v_min_f32_e32 v1, s17, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_min_f32_e32 v3, s16, v0
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v1, s18
+; GFX8-NEXT: v_min_f32_e32 v3, s16, v1
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
+; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -761,13 +772,15 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX900-NEXT: v_min_f32_e32 v1, s17, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX900-NEXT: v_mov_b32_e32 v0, s18
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX900-NEXT: v_min_f32_e32 v3, s16, v0
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v1, s18
+; GFX900-NEXT: v_min_f32_e32 v3, s16, v1
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v1
+; GFX900-NEXT: v_readfirstlane_b32 s5, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:1]
+; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -775,40 +788,47 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_minimum3_f32 v1, v0, s3, s3
-; GFX950-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-NEXT: v_minimum3_f32 v0, v0, s2, s2
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: v_minimum3_f32 v0, v0, s3, s3
+; GFX950-NEXT: v_minimum3_f32 v1, v1, s2, s2
+; GFX950-NEXT: v_readfirstlane_b32 s1, v0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:1]
+; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_minimum_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f32_e64 v0, s17, s19
-; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
-; GFX10-NEXT: v_min_f32_e64 v2, s16, s18
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_min_f32_e64 v0, s16, s18
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX10-NEXT: v_min_f32_e64 v1, s17, s19
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:1]
+; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_minimum_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f32_e64 v0, s1, s3
-; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
-; GFX11-NEXT: v_min_f32_e64 v2, s0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_min_f32_e64 v0, s0, s2
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
+; GFX11-NEXT: v_min_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:1]
+; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index d07bd6c8dd902..7d6e799d56908 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -418,13 +418,15 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX7-NEXT: v_readfirstlane_b32 s6, v3
+; GFX7-NEXT: v_readfirstlane_b32 s4, v2
+; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX7-NEXT: s_cselect_b32 s4, 0, s4
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:1]
+; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -433,13 +435,15 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX8-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8-NEXT: v_readfirstlane_b32 s4, v2
+; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX8-NEXT: s_cselect_b32 s4, 0, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:1]
+; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -448,13 +452,15 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s18
; GFX900-NEXT: v_mov_b32_e32 v1, s19
-; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
-; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
+; GFX900-NEXT: s_and_b64 s[4:5], vcc, exec
+; GFX900-NEXT: v_readfirstlane_b32 s6, v3
+; GFX900-NEXT: v_readfirstlane_b32 s4, v2
+; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
+; GFX900-NEXT: s_cselect_b32 s4, 0, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:1]
+; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -462,14 +468,15 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX950-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT: v_min_f64 v[0:1], s[0:1], v[0:1]
+; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s2, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s2
+; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:1]
+; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -478,10 +485,13 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v0
+; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s5
+; GFX10-NEXT: s_cselect_b32 s4, 0, s6
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:1]
+; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -490,11 +500,14 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f64 v[0:1], s[0:1], s[2:3]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s1
+; GFX11-NEXT: s_cselect_b32 s0, 0, s2
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:1]
+; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -506,9 +519,13 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v[0:1]
+; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -825,15 +842,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX7-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
+; GFX7-NEXT: v_readfirstlane_b32 s8, v3
+; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX7-NEXT: v_readfirstlane_b32 s6, v2
+; GFX7-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX7-NEXT: s_cselect_b32 s6, 0, s6
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: v_readfirstlane_b32 s8, v5
+; GFX7-NEXT: v_readfirstlane_b32 s4, v4
+; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX7-NEXT: s_cselect_b32 s4, 0, s4
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use v[0:3]
+; GFX7-NEXT: ; use s[4:7]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -846,15 +868,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s21
-; GFX8-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX8-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s8, v3
+; GFX8-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX8-NEXT: s_cselect_b32 s6, 0, s6
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX8-NEXT: v_readfirstlane_b32 s8, v5
+; GFX8-NEXT: v_readfirstlane_b32 s4, v4
+; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX8-NEXT: s_cselect_b32 s4, 0, s4
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use v[0:3]
+; GFX8-NEXT: ; use s[4:7]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -867,15 +894,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v0, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s21
-; GFX900-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
-; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
+; GFX900-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
+; GFX900-NEXT: v_readfirstlane_b32 s8, v3
+; GFX900-NEXT: s_and_b64 s[6:7], vcc, exec
+; GFX900-NEXT: v_readfirstlane_b32 s6, v2
+; GFX900-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
+; GFX900-NEXT: s_cselect_b32 s6, 0, s6
+; GFX900-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX900-NEXT: v_readfirstlane_b32 s8, v5
+; GFX900-NEXT: v_readfirstlane_b32 s4, v4
+; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
+; GFX900-NEXT: s_cselect_b32 s4, 0, s4
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use v[0:3]
+; GFX900-NEXT: ; use s[4:7]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -883,19 +915,23 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
-; GFX950-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1]
-; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
+; GFX950-NEXT: v_min_f64 v[0:1], s[2:3], v[0:1]
+; GFX950-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
-; GFX950-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1]
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT: s_cselect_b32 s3, 0x7ff80000, s4
+; GFX950-NEXT: s_cselect_b32 s2, 0, s2
+; GFX950-NEXT: v_min_f64 v[0:1], s[0:1], v[0:1]
+; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
+; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s4
+; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[0:3]
+; GFX950-NEXT: ; use s[0:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -904,14 +940,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_min_f64 v[0:1], s[18:19], s[22:23]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
-; GFX10-NEXT: v_min_f64 v[4:5], s[16:17], s[20:21]
+; GFX10-NEXT: v_min_f64 v[2:3], s[16:17], s[20:21]
; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5
+; GFX10-NEXT: v_readfirstlane_b32 s6, v1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s9, v2
+; GFX10-NEXT: s_cselect_b32 s7, 0x7ff80000, s6
+; GFX10-NEXT: s_cselect_b32 s6, 0, s8
+; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
+; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s4
+; GFX10-NEXT: s_cselect_b32 s4, 0, s9
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use v[0:3]
+; GFX10-NEXT: ; use s[4:7]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -919,16 +961,22 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[18:19]
+; GFX11-NEXT: v_min_f64 v[2:3], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
-; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, s1
+; GFX11-NEXT: s_cselect_b32 s2, 0, s4
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s5
+; GFX11-NEXT: s_cselect_b32 s0, 0, s6
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v[0:3]
+; GFX11-NEXT: ; use s[0:3]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -939,11 +987,18 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[18:19]
-; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[16:17]
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[2:3], s[18:19]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[0:1], s[16:17]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use v[0:3]
+; GFX12-NEXT: ; use s[0:3]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 01bcdad3fc220..a1c6a398ccf50 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -97,6 +97,7 @@ define void @nested_inf_loop_callbr(i32 %0, i32 %1) {
; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5
; ISA-NEXT: .LBB1_1: ; %BB1
; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
+; ISA-NEXT: v_readfirstlane_b32 s8, v0
; ISA-NEXT: ;;#ASMSTART
; ISA-NEXT: ;;#ASMEND
; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 004c27971131d..b6233c2563a68 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s
@@ -96,7 +96,6 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
; UNIFY-NEXT: br label [[IF_END6]]
; UNIFY: if.end6:
; UNIFY-NEXT: ret void
-;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %n, 256
@@ -140,12 +139,14 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.1: ; %if.then
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split
@@ -165,12 +166,14 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; CHECK-NEXT: ; Label of block must be emitted
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.5: ; %if.then3
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_branch .LBB1_2
@@ -218,7 +221,6 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; UNIFY-NEXT: to label [[IF_END6:%.*]] []
; UNIFY: if.end6:
; UNIFY-NEXT: ret void
-;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %n, 256
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 0452c3b89e9a9..e67cf4697beab 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -279,20 +279,22 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill
-; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
-; MUBUF-NEXT: ; v[0:1]
+; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_endpgm
;
@@ -313,8 +315,10 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
+; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; v[0:1]
+; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_endpgm
entry:
@@ -345,21 +349,24 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
+; MUBUF-NEXT: s_mov_b32 s4, 0x40000
+; MUBUF-NEXT: s_mov_b32 s5, 0x40000
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
-; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
-; MUBUF-NEXT: ; v[0:1]
+; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_endpgm
;
@@ -380,8 +387,10 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
+; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; v[0:1]
+; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_endpgm
entry:
@@ -512,20 +521,22 @@ define void @test_sgpr_offset_subregs_function() {
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
-; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
-; MUBUF-NEXT: ; v[0:1]
+; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
@@ -541,8 +552,10 @@ define void @test_sgpr_offset_subregs_function() {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
+; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; v[0:1]
+; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -576,22 +589,24 @@ define void @test_inst_offset_subregs_function() {
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
+; MUBUF-NEXT: s_add_i32 s4, s32, 0x40000
+; MUBUF-NEXT: s_add_i32 s5, s32, 0x40000
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4092 ; 4-byte Folded Spill
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
-; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4092 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
-; MUBUF-NEXT: ; v[0:1]
+; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
@@ -607,8 +622,10 @@ define void @test_inst_offset_subregs_function() {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
+; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; v[0:1]
+; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
>From bc03169ea37bcf84198d7e19f26c7c04e28b9928 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Fri, 16 Jan 2026 03:39:03 -0600
Subject: [PATCH 2/5] fix
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index a7da4a49c45cc..bc8453f742705 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1013,7 +1013,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// after SelectionDAG. Therefore, we treat all inline asm as mandatory
// SGPR use.
if (U.isInlineAsm())
- Info.HasMandatorySGPRUse = true;
+ Info.HasMandatorySGPRUse = true;
Users.push_back(&U);
}
}
>From 633a489333e3718e45779c43c3aa0635e040d613 Mon Sep 17 00:00:00 2001
From: vigneshwar jayakumar <vigneshwar.jayakumar at amd.com>
Date: Wed, 28 Jan 2026 09:48:33 -0600
Subject: [PATCH 3/5] changes
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 166 ++++++++------
.../AMDGPU/inline-asm-vgpr-sgpr-copy.ll | 106 +++++++++
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 69 +++---
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 10 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 212 +++++++++---------
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 64 +++---
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 10 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 212 +++++++++---------
8 files changed, 488 insertions(+), 361 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index bc8453f742705..660cbfec36412 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -100,7 +100,6 @@ class V2SCopyInfo {
unsigned NumReadfirstlanes = 0;
// Current score state. To speedup selection V2SCopyInfos for processing
bool NeedToBeConvertedToVALU = false;
- bool HasMandatorySGPRUse = false;
// Unique ID. Used as a key for mapping to keep permanent order.
unsigned ID;
@@ -160,6 +159,15 @@ class SIFixSGPRCopies {
MachineBasicBlock *BlockToInsertTo,
MachineBasicBlock::iterator PointToInsertTo,
const DebugLoc &DL);
+
+ // Insert V_READFIRSTLANE_B32 instructions to convert a VGPR to SGPR.
+ // Handles 16-bit, 32-bit, and larger register sizes.
+ void insertReadFirstLane(Register VGPRSrc, Register SGPRDst,
+ const TargetRegisterClass *RC,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &DL,
+ unsigned SubReg = AMDGPU::NoSubRegister);
};
class SIFixSGPRCopiesLegacy : public MachineFunctionPass {
@@ -894,6 +902,63 @@ bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
return true;
}
+void SIFixSGPRCopies::insertReadFirstLane(Register VGPRSrc, Register SGPRDst,
+ const TargetRegisterClass *RC,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt,
+ const DebugLoc &DL, unsigned SubReg) {
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ MRI->constrainRegClass(SGPRDst, &AMDGPU::SReg_32_XM0RegClass);
+ if (Size == 16) {
+ assert(MBB.getParent()->getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(SGPRDst) == &AMDGPU::SReg_32RegClass ||
+ MRI->getRegClass(SGPRDst) == &AMDGPU::SReg_32_XM0RegClass);
+ // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+ MRI->setRegClass(SGPRDst, &AMDGPU::SReg_32_XM0RegClass);
+ Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+ .addReg(VGPRSrc, 0, SubReg)
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SGPRDst)
+ .addReg(VReg32);
+ } else if (Size == 32) {
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ BuildMI(MBB, InsertPt, DL, ReadFirstLaneDesc, SGPRDst)
+ .addReg(VGPRSrc, 0, SubReg);
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(MRI->getRegClass(VGPRSrc), OpRC,
+ SubReg);
+
+ if (!MRI->constrainRegClass(VGPRSrc, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
+ } else {
+ auto Result =
+ BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::REG_SEQUENCE), SGPRDst);
+ int N = Size / 32;
+ for (int i = 0; i < N; i++) {
+ Register PartialSrc = TII->buildExtractSubReg(
+ Result, *MRI, MachineOperand::CreateReg(VGPRSrc, false), RC,
+ TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+ Register PartialDst =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(MBB, *Result, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ PartialDst)
+ .addReg(PartialSrc);
+ Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+ }
+ }
+}
+
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
MachineBasicBlock::iterator &I) {
Register DstReg = MI.getOperand(0).getReg();
@@ -1004,17 +1069,42 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
} else if (Inst->getNumExplicitDefs() != 0) {
Register Reg = Inst->getOperand(0).getReg();
if (Reg.isVirtual() && TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
+ SmallVector<MachineInstr *, 4> InlineAsmUsers;
for (auto &U : MRI->use_instructions(Reg)) {
- // Inline assembly operands with SGPR constraints must not be
- // converted to VALU. Without this check, the SGPR could be converted
- // to VGPR, violating the inline asm constraint.
- // This is conservative: we cannot distinguish "s" (strict SGPR) from
- // "r" (any register) at this stage, as the constraint string is lost
- // after SelectionDAG. Therefore, we treat all inline asm as mandatory
- // SGPR use.
if (U.isInlineAsm())
- Info.HasMandatorySGPRUse = true;
- Users.push_back(&U);
+ InlineAsmUsers.push_back(&U);
+ else
+ Users.push_back(&U);
+ }
+ for (auto *U : InlineAsmUsers) {
+ // Inline assembly operands with SGPR constraints cannot be handled by
+ // the VALU conversion. If we convert the definition to VALU, we must
+ // insert a readfirstlane to restore the SGPR for the inline asm use.
+ MachineBasicBlock *MBB = U->getParent();
+ const DebugLoc &DL = U->getDebugLoc();
+
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ Register VGPR =
+ MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(RC));
+ MachineInstr *NewCopy =
+ BuildMI(*MBB, *U, DL, TII->get(AMDGPU::COPY), VGPR).addReg(Reg);
+ Users.push_back(NewCopy);
+
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Register SGPR =
+ Size == 16
+ ? MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass)
+ : (Size == 32 ? MRI->createVirtualRegister(
+ &AMDGPU::SReg_32_XM0RegClass)
+ : MRI->createVirtualRegister(RC));
+
+ insertReadFirstLane(VGPR, SGPR, RC, *MBB, *U, DL);
+
+ for (unsigned i = 0; i < U->getNumOperands(); ++i) {
+ MachineOperand &MO = U->getOperand(i);
+ if (MO.isReg() && MO.getReg() == Reg && MO.isUse())
+ MO.setReg(SGPR);
+ }
}
}
}
@@ -1030,8 +1120,6 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
// The main function that computes the VGPR to SGPR copy score
// and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
- if (Info->HasMandatorySGPRUse)
- return false;
if (Info->SChain.empty()) {
Info->Score = 0;
return true;
@@ -1122,63 +1210,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
- MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
-
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC =
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
- size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
- if (SrcSize == 16) {
- assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
- "We do not expect to see 16-bit copies from VGPR to SGPR unless "
- "we have 16-bit VGPRs");
- assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
- MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
- // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
- MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
- Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- const DebugLoc &DL = MI->getDebugLoc();
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
- .addReg(SrcReg, 0, SubReg)
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(VReg32);
- } else if (SrcSize == 32) {
- const MCInstrDesc &ReadFirstLaneDesc =
- TII->get(AMDGPU::V_READFIRSTLANE_B32);
- const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
- BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
- .addReg(SrcReg, 0, SubReg);
-
- const TargetRegisterClass *ConstrainRC =
- SubReg == AMDGPU::NoSubRegister
- ? OpRC
- : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
- SubReg);
- if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
- llvm_unreachable("failed to constrain register");
- } else {
- auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::REG_SEQUENCE), DstReg);
- int N = TRI->getRegSizeInBits(*SrcRC) / 32;
- for (int i = 0; i < N; i++) {
- Register PartialSrc = TII->buildExtractSubReg(
- Result, *MRI, MI->getOperand(1), SrcRC,
- TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
- Register PartialDst =
- MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, *Result, Result->getDebugLoc(),
- TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
- .addReg(PartialSrc);
- Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
- }
- }
+ insertReadFirstLane(SrcReg, DstReg, SrcRC, *MBB, MI, MI->getDebugLoc(),
+ SubReg);
MI->eraseFromParent();
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
index 39d4416e2734d..ea6ed773ab00d 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
@@ -37,3 +37,109 @@ define <4 x float> @test_sgpr_constraint_bug(ptr addrspace(5) %buf_desc_ptr) {
ret <4 x float> %result
}
+
+; Inline asm with an "s" constraint does not get a waterfall loop; if the value is divergent, the user is responsible for handling it.
+define amdgpu_kernel void @inlineasm_sgpr_constraint_divergent_value(ptr addrspace(1) %out, i32 %uniform) {
+; CHECK-LABEL: inlineasm_sgpr_constraint_divergent_value:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_add_u32 s2, s2, 1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %result = call i32 asm sideeffect "s_add_u32 $0, $1, 1", "=s,s"(i32 %tid)
+ %gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ store i32 %result, ptr addrspace(1) %gep
+ ret void
+}
+
+define amdgpu_kernel void @inlineasm_with_salu_to_valu_uses(ptr addrspace(1) %out, i32 %base) {
+; CHECK-LABEL: inlineasm_with_salu_to_valu_uses:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_add_u32_e32 v1, 10, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_readfirstlane_b32 s2, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_add_u32 s2, s2, 5
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mad_u32_u24 v1, v1, 3, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %derived = add i32 %tid, 10
+
+ %asm_result = call i32 asm sideeffect "s_add_u32 $0, $1, 5", "=s,s"(i32 %derived)
+
+ %mul_result = mul i32 %derived, 3
+ %combined = add i32 %asm_result, %mul_result
+ %gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ store i32 %combined, ptr addrspace(1) %gep
+ ret void
+}
+
+; Test case with divergent value used in inline asm AND feeding into
+; a waterfall loop
+define amdgpu_kernel void @inlineasm_and_waterfall_same_value(ptr addrspace(1) %out, ptr addrspace(1) %descriptors) {
+; CHECK-LABEL: inlineasm_and_waterfall_same_value:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; CHECK-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v4
+; CHECK-NEXT: v_readfirstlane_b32 s0, v4
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_add_u32 s12, s0, 100
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[10:11]
+; CHECK-NEXT: s_mov_b64 s[2:3], exec
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v2
+; CHECK-NEXT: v_readfirstlane_b32 s7, v3
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; CHECK-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_execnz .LBB4_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b64 exec, s[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_add_u32_e32 v0, s12, v5
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v4
+; CHECK-NEXT: global_store_dword v1, v0, s[8:9]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+
+ %asm_result = call i32 asm sideeffect "s_add_u32 $0, $1, 100", "=s,s"(i32 %tid)
+
+ %desc_ptr = getelementptr <4 x i32>, ptr addrspace(1) %descriptors, i32 %tid
+ %desc = load <4 x i32>, ptr addrspace(1) %desc_ptr
+ %buffer_result = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %tid, i32 0, i32 0)
+
+ %combined = add i32 %asm_result, %buffer_result
+ %out_ptr = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ store i32 %combined, ptr addrspace(1) %out_ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 3501a3d3bb157..bde7e107f23a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -536,8 +536,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
@@ -551,8 +551,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_readfirstlane_b32 s4, v0
-; GFX900-NEXT: s_and_b32 s4, 0xffff, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
@@ -564,8 +564,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
@@ -577,8 +578,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_max_f16_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
@@ -591,8 +592,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
@@ -605,8 +607,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
@@ -980,10 +983,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7-NEXT: s_lshl_b32 s4, s4, 16
-; GFX7-NEXT: v_readfirstlane_b32 s5, v1
-; GFX7-NEXT: s_or_b32 s4, s5, s4
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s4
; GFX7-NEXT: ;;#ASMEND
@@ -995,19 +997,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: s_lshr_b32 s4, s17, 16
; GFX8-NEXT: s_lshr_b32 s5, s16, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_max_f16_e32 v1, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_max_f16_e32 v0, s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX8-NEXT: v_mov_b32_e32 v2, s17
+; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v3, s16, v2
+; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s17
-; GFX8-NEXT: v_max_f16_e32 v1, s16, v0
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX8-NEXT: s_or_b32 s4, s5, s4
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
@@ -1027,9 +1026,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: v_readfirstlane_b32 s4, v0
-; GFX900-NEXT: v_readfirstlane_b32 s5, v1
-; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
@@ -1054,13 +1053,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: s_lshr_b32 s5, s16, 16
; GFX10-NEXT: v_pk_max_f16 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v2
-; GFX10-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
@@ -1077,11 +1076,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
@@ -1093,16 +1090,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, s0, s1
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s3, s2
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 2bec23c9e86ae..dd7ab20d9f23f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -788,11 +788,11 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-NEXT: v_maximum3_f32 v0, v0, s3, s3
-; GFX950-NEXT: v_maximum3_f32 v1, v1, s2, s2
-; GFX950-NEXT: v_readfirstlane_b32 s1, v0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v1
+; GFX950-NEXT: v_maximum3_f32 v1, v0, s3, s3
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_maximum3_f32 v0, v0, s2, s2
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index a48fe6a4728f1..070e8746f97b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -418,13 +418,13 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX7-NEXT: v_readfirstlane_b32 s6, v3
-; GFX7-NEXT: v_readfirstlane_b32 s4, v2
-; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX7-NEXT: s_cselect_b32 s4, 0, s4
+; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
@@ -435,13 +435,13 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX8-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8-NEXT: v_readfirstlane_b32 s4, v2
-; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX8-NEXT: s_cselect_b32 s4, 0, s4
+; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
@@ -452,13 +452,13 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s18
; GFX900-NEXT: v_mov_b32_e32 v1, s19
-; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
-; GFX900-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX900-NEXT: v_readfirstlane_b32 s6, v3
-; GFX900-NEXT: v_readfirstlane_b32 s4, v2
-; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX900-NEXT: s_cselect_b32 s4, 0, s4
+; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v1
+; GFX900-NEXT: v_readfirstlane_b32 s5, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
@@ -468,13 +468,14 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: v_max_f64 v[0:1], s[0:1], v[0:1]
-; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s2, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s2
-; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -485,11 +486,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v0
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s5
-; GFX10-NEXT: s_cselect_b32 s4, 0, s6
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
@@ -500,12 +500,12 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s1
-; GFX11-NEXT: s_cselect_b32 s0, 0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
@@ -842,18 +842,17 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
-; GFX7-NEXT: v_readfirstlane_b32 s8, v3
-; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7-NEXT: v_readfirstlane_b32 s6, v2
-; GFX7-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX7-NEXT: s_cselect_b32 s6, 0, s6
-; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX7-NEXT: v_readfirstlane_b32 s8, v5
-; GFX7-NEXT: v_readfirstlane_b32 s4, v4
-; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX7-NEXT: s_cselect_b32 s4, 0, s4
+; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s6, v1
+; GFX7-NEXT: v_readfirstlane_b32 s7, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7-NEXT: v_readfirstlane_b32 s5, v2
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[4:7]
; GFX7-NEXT: ;;#ASMEND
@@ -868,18 +867,17 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s21
-; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX8-NEXT: v_readfirstlane_b32 s6, v2
-; GFX8-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX8-NEXT: s_cselect_b32 s6, 0, s6
-; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT: v_readfirstlane_b32 s8, v5
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX8-NEXT: s_cselect_b32 s4, 0, s4
+; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_readfirstlane_b32 s7, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8-NEXT: v_readfirstlane_b32 s5, v2
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[4:7]
; GFX8-NEXT: ;;#ASMEND
@@ -894,18 +892,17 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v0, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s21
-; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
-; GFX900-NEXT: v_readfirstlane_b32 s8, v3
-; GFX900-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX900-NEXT: v_readfirstlane_b32 s6, v2
-; GFX900-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX900-NEXT: s_cselect_b32 s6, 0, s6
-; GFX900-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX900-NEXT: v_readfirstlane_b32 s8, v5
-; GFX900-NEXT: v_readfirstlane_b32 s4, v4
-; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX900-NEXT: s_cselect_b32 s4, 0, s4
+; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s6, v1
+; GFX900-NEXT: v_readfirstlane_b32 s7, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX900-NEXT: v_readfirstlane_b32 s4, v3
+; GFX900-NEXT: v_readfirstlane_b32 s5, v2
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[4:7]
; GFX900-NEXT: ;;#ASMEND
@@ -915,21 +912,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
-; GFX950-NEXT: v_max_f64 v[0:1], s[2:3], v[0:1]
-; GFX950-NEXT: s_and_b64 s[2:3], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s4, v1
-; GFX950-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_cselect_b32 s3, 0x7ff80000, s4
-; GFX950-NEXT: s_cselect_b32 s2, 0, s2
-; GFX950-NEXT: v_max_f64 v[0:1], s[0:1], v[0:1]
-; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s2, v2
+; GFX950-NEXT: v_readfirstlane_b32 s3, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s4
-; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:3]
; GFX950-NEXT: ;;#ASMEND
@@ -938,20 +934,18 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX10-LABEL: s_maximum_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[22:23]
-; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
-; GFX10-NEXT: v_max_f64 v[2:3], s[16:17], s[20:21]
-; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
-; GFX10-NEXT: v_readfirstlane_b32 s6, v1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v3
-; GFX10-NEXT: v_readfirstlane_b32 s9, v2
-; GFX10-NEXT: s_cselect_b32 s7, 0x7ff80000, s6
-; GFX10-NEXT: s_cselect_b32 s6, 0, s8
-; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
-; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s4
-; GFX10-NEXT: s_cselect_b32 s4, 0, s9
+; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[20:21]
+; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21]
+; GFX10-NEXT: v_max_f64 v[2:3], s[18:19], s[22:23]
+; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[18:19], s[22:23]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s5
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s[4:7]
; GFX10-NEXT: ;;#ASMEND
@@ -960,21 +954,21 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX11-LABEL: s_maximum_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[18:19]
-; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[16:17]
-; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
+; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_max_f64 v[2:3], s[2:3], s[18:19]
+; GFX11-NEXT: v_cmp_u_f64_e64 s1, s[2:3], s[18:19]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v3
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
-; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, s1
-; GFX11-NEXT: s_cselect_b32 s2, 0, s4
-; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s5
-; GFX11-NEXT: s_cselect_b32 s0, 0, s6
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:3]
; GFX11-NEXT: ;;#ASMEND
@@ -987,14 +981,14 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_maximum_f64 v[0:1], s[2:3], s[18:19]
-; GFX12-NEXT: v_maximum_f64 v[2:3], s[0:1], s[16:17]
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[16:17]
+; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[18:19]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s3, v1
-; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s2, v2
+; GFX12-NEXT: v_readfirstlane_b32 s3, v3
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:3]
; GFX12-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index f386d257ed678..67660dac43e26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -445,8 +445,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
@@ -460,8 +460,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_readfirstlane_b32 s4, v0
-; GFX900-NEXT: s_and_b32 s4, 0xffff, s4
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
@@ -473,8 +473,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_and_b32 s0, s0, 0xffff
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s0
; GFX950-NEXT: ;;#ASMEND
@@ -486,8 +487,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_min_f16_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: s_and_b32 s4, 0xffff, s4
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
@@ -500,8 +501,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: v_min_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
@@ -514,8 +516,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
@@ -805,19 +808,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: s_lshr_b32 s4, s17, 16
; GFX8-NEXT: s_lshr_b32 s5, s16, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_min_f16_e32 v1, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_min_f16_e32 v0, s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00
+; GFX8-NEXT: v_mov_b32_e32 v2, s17
+; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_min_f16_e32 v3, s16, v2
+; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s17
-; GFX8-NEXT: v_min_f16_e32 v1, s16, v0
-; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
-; GFX8-NEXT: s_lshl_b32 s4, s4, 16
-; GFX8-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX8-NEXT: s_or_b32 s4, s5, s4
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s4
; GFX8-NEXT: ;;#ASMEND
@@ -837,9 +837,9 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: v_readfirstlane_b32 s4, v0
-; GFX900-NEXT: v_readfirstlane_b32 s5, v1
-; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s4
; GFX900-NEXT: ;;#ASMEND
@@ -864,13 +864,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: s_lshr_b32 s5, s16, 16
; GFX10-NEXT: v_pk_min_f16 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v2
-; GFX10-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
; GFX10-NEXT: ;;#ASMEND
@@ -887,11 +887,9 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-TRUE16-NEXT: ;;#ASMSTART
; GFX11-TRUE16-NEXT: ; use s0
; GFX11-TRUE16-NEXT: ;;#ASMEND
@@ -903,16 +901,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: v_pk_min_f16 v0, s0, s1
; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s3, s2
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
; GFX11-FAKE16-NEXT: ; use s0
; GFX11-FAKE16-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 860e0686a0928..0743fad299450 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -788,11 +788,11 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-NEXT: v_minimum3_f32 v0, v0, s3, s3
-; GFX950-NEXT: v_minimum3_f32 v1, v1, s2, s2
-; GFX950-NEXT: v_readfirstlane_b32 s1, v0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v1
+; GFX950-NEXT: v_minimum3_f32 v1, v0, s3, s3
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_minimum3_f32 v0, v0, s2, s2
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 7d6e799d56908..72d0c70c1a83c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -418,13 +418,13 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s18
; GFX7-NEXT: v_mov_b32_e32 v1, s19
-; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX7-NEXT: v_readfirstlane_b32 s6, v3
-; GFX7-NEXT: v_readfirstlane_b32 s4, v2
-; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX7-NEXT: s_cselect_b32 s4, 0, s4
+; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[4:5]
; GFX7-NEXT: ;;#ASMEND
@@ -435,13 +435,13 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX8-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8-NEXT: v_readfirstlane_b32 s4, v2
-; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX8-NEXT: s_cselect_b32 s4, 0, s4
+; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[4:5]
; GFX8-NEXT: ;;#ASMEND
@@ -452,13 +452,13 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, s18
; GFX900-NEXT: v_mov_b32_e32 v1, s19
-; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
-; GFX900-NEXT: s_and_b64 s[4:5], vcc, exec
-; GFX900-NEXT: v_readfirstlane_b32 s6, v3
-; GFX900-NEXT: v_readfirstlane_b32 s4, v2
-; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s6
-; GFX900-NEXT: s_cselect_b32 s4, 0, s4
+; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s4, v1
+; GFX900-NEXT: v_readfirstlane_b32 s5, v0
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[4:5]
; GFX900-NEXT: ;;#ASMEND
@@ -468,13 +468,14 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v4, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: v_min_f64 v[0:1], s[0:1], v[0:1]
-; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s2, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s2
-; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:1]
; GFX950-NEXT: ;;#ASMEND
@@ -485,11 +486,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[18:19]
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v0
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s5
-; GFX10-NEXT: s_cselect_b32 s4, 0, s6
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s[4:5]
; GFX10-NEXT: ;;#ASMEND
@@ -500,12 +500,12 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f64 v[0:1], s[0:1], s[2:3]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s1
-; GFX11-NEXT: s_cselect_b32 s0, 0, s2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:1]
; GFX11-NEXT: ;;#ASMEND
@@ -842,18 +842,17 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s20
; GFX7-NEXT: v_mov_b32_e32 v1, s21
-; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
-; GFX7-NEXT: v_readfirstlane_b32 s8, v3
-; GFX7-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX7-NEXT: v_readfirstlane_b32 s6, v2
-; GFX7-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX7-NEXT: s_cselect_b32 s6, 0, s6
-; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX7-NEXT: v_readfirstlane_b32 s8, v5
-; GFX7-NEXT: v_readfirstlane_b32 s4, v4
-; GFX7-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX7-NEXT: s_cselect_b32 s4, 0, s4
+; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX7-NEXT: v_readfirstlane_b32 s6, v1
+; GFX7-NEXT: v_readfirstlane_b32 s7, v0
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX7-NEXT: v_readfirstlane_b32 s4, v3
+; GFX7-NEXT: v_readfirstlane_b32 s5, v2
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[4:7]
; GFX7-NEXT: ;;#ASMEND
@@ -868,18 +867,17 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v1, s21
-; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX8-NEXT: v_readfirstlane_b32 s6, v2
-; GFX8-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX8-NEXT: s_cselect_b32 s6, 0, s6
-; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT: v_readfirstlane_b32 s8, v5
-; GFX8-NEXT: v_readfirstlane_b32 s4, v4
-; GFX8-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX8-NEXT: s_cselect_b32 s4, 0, s4
+; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_readfirstlane_b32 s7, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v3
+; GFX8-NEXT: v_readfirstlane_b32 s5, v2
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[4:7]
; GFX8-NEXT: ;;#ASMEND
@@ -894,18 +892,17 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[18:19], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v0, s20
; GFX900-NEXT: v_mov_b32_e32 v1, s21
-; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
-; GFX900-NEXT: v_readfirstlane_b32 s8, v3
-; GFX900-NEXT: s_and_b64 s[6:7], vcc, exec
-; GFX900-NEXT: v_readfirstlane_b32 s6, v2
-; GFX900-NEXT: s_cselect_b32 s7, 0x7ff80000, s8
-; GFX900-NEXT: s_cselect_b32 s6, 0, s6
-; GFX900-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX900-NEXT: v_readfirstlane_b32 s8, v5
-; GFX900-NEXT: v_readfirstlane_b32 s4, v4
-; GFX900-NEXT: s_cselect_b32 s5, 0x7ff80000, s8
-; GFX900-NEXT: s_cselect_b32 s4, 0, s4
+; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
+; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX900-NEXT: v_readfirstlane_b32 s6, v1
+; GFX900-NEXT: v_readfirstlane_b32 s7, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
+; GFX900-NEXT: v_readfirstlane_b32 s4, v3
+; GFX900-NEXT: v_readfirstlane_b32 s5, v2
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ; use s[4:7]
; GFX900-NEXT: ;;#ASMEND
@@ -915,21 +912,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1]
-; GFX950-NEXT: v_min_f64 v[0:1], s[2:3], v[0:1]
-; GFX950-NEXT: s_and_b64 s[2:3], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s4, v1
-; GFX950-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1]
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: s_cselect_b32 s3, 0x7ff80000, s4
-; GFX950-NEXT: s_cselect_b32 s2, 0, s2
-; GFX950-NEXT: v_min_f64 v[0:1], s[0:1], v[0:1]
-; GFX950-NEXT: s_and_b64 s[0:1], vcc, exec
-; GFX950-NEXT: v_readfirstlane_b32 s4, v1
+; GFX950-NEXT: v_readfirstlane_b32 s2, v2
+; GFX950-NEXT: v_readfirstlane_b32 s3, v3
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
; GFX950-NEXT: v_readfirstlane_b32 s0, v0
-; GFX950-NEXT: s_cselect_b32 s1, 0x7ff80000, s4
-; GFX950-NEXT: s_cselect_b32 s0, 0, s0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use s[0:3]
; GFX950-NEXT: ;;#ASMEND
@@ -938,20 +934,18 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX10-LABEL: s_minimum_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f64 v[0:1], s[18:19], s[22:23]
-; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
-; GFX10-NEXT: v_min_f64 v[2:3], s[16:17], s[20:21]
-; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
-; GFX10-NEXT: v_readfirstlane_b32 s6, v1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v3
-; GFX10-NEXT: v_readfirstlane_b32 s9, v2
-; GFX10-NEXT: s_cselect_b32 s7, 0x7ff80000, s6
-; GFX10-NEXT: s_cselect_b32 s6, 0, s8
-; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
-; GFX10-NEXT: s_cselect_b32 s5, 0x7ff80000, s4
-; GFX10-NEXT: s_cselect_b32 s4, 0, s9
+; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21]
+; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21]
+; GFX10-NEXT: v_min_f64 v[2:3], s[18:19], s[22:23]
+; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[18:19], s[22:23]
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s5
+; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s7, v3
+; GFX10-NEXT: v_readfirstlane_b32 s6, v2
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s[4:7]
; GFX10-NEXT: ;;#ASMEND
@@ -960,21 +954,21 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX11-LABEL: s_minimum_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[18:19]
-; GFX11-NEXT: v_min_f64 v[2:3], s[0:1], s[16:17]
-; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
+; GFX11-NEXT: v_min_f64 v[0:1], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_min_f64 v[2:3], s[2:3], s[18:19]
+; GFX11-NEXT: v_cmp_u_f64_e64 s1, s[2:3], s[18:19]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v3
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
-; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, s1
-; GFX11-NEXT: s_cselect_b32 s2, 0, s4
-; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
-; GFX11-NEXT: s_cselect_b32 s1, 0x7ff80000, s5
-; GFX11-NEXT: s_cselect_b32 s0, 0, s6
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
+; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s[0:3]
; GFX11-NEXT: ;;#ASMEND
@@ -987,14 +981,14 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_minimum_f64 v[0:1], s[2:3], s[18:19]
-; GFX12-NEXT: v_minimum_f64 v[2:3], s[0:1], s[16:17]
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[16:17]
+; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[18:19]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s3, v1
-; GFX12-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s2, v2
+; GFX12-NEXT: v_readfirstlane_b32 s3, v3
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:3]
; GFX12-NEXT: ;;#ASMEND
>From 2f748ee55775f775db5ad01ce7ff2710e9510813 Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vjayakum at amd.com>
Date: Thu, 29 Jan 2026 17:25:05 -0600
Subject: [PATCH 4/5] moved to SISelLowering
---
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 159 ++++++------------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 83 +++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
.../AMDGPU/inline-asm-vgpr-sgpr-copy.ll | 41 ++++-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 53 ++----
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 98 +++++------
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 147 ++++++----------
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 47 ++----
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 98 +++++------
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 147 ++++++----------
.../si-unify-exit-multiple-unreachables.ll | 3 -
.../AMDGPU/spill-offset-calculation.ll | 35 ++--
12 files changed, 397 insertions(+), 515 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 660cbfec36412..5ae02d025989b 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -159,15 +159,6 @@ class SIFixSGPRCopies {
MachineBasicBlock *BlockToInsertTo,
MachineBasicBlock::iterator PointToInsertTo,
const DebugLoc &DL);
-
- // Insert V_READFIRSTLANE_B32 instructions to convert a VGPR to SGPR.
- // Handles 16-bit, 32-bit, and larger register sizes.
- void insertReadFirstLane(Register VGPRSrc, Register SGPRDst,
- const TargetRegisterClass *RC,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt,
- const DebugLoc &DL,
- unsigned SubReg = AMDGPU::NoSubRegister);
};
class SIFixSGPRCopiesLegacy : public MachineFunctionPass {
@@ -902,63 +893,6 @@ bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
return true;
}
-void SIFixSGPRCopies::insertReadFirstLane(Register VGPRSrc, Register SGPRDst,
- const TargetRegisterClass *RC,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt,
- const DebugLoc &DL, unsigned SubReg) {
- unsigned Size = TRI->getRegSizeInBits(*RC);
- MRI->constrainRegClass(SGPRDst, &AMDGPU::SReg_32_XM0RegClass);
- if (Size == 16) {
- assert(MBB.getParent()->getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
- "We do not expect to see 16-bit copies from VGPR to SGPR unless "
- "we have 16-bit VGPRs");
- assert(MRI->getRegClass(SGPRDst) == &AMDGPU::SReg_32RegClass ||
- MRI->getRegClass(SGPRDst) == &AMDGPU::SReg_32_XM0RegClass);
- // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
- MRI->setRegClass(SGPRDst, &AMDGPU::SReg_32_XM0RegClass);
- Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
- BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
- BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
- .addReg(VGPRSrc, 0, SubReg)
- .addImm(AMDGPU::lo16)
- .addReg(Undef)
- .addImm(AMDGPU::hi16);
- BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SGPRDst)
- .addReg(VReg32);
- } else if (Size == 32) {
- const MCInstrDesc &ReadFirstLaneDesc =
- TII->get(AMDGPU::V_READFIRSTLANE_B32);
- const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
- BuildMI(MBB, InsertPt, DL, ReadFirstLaneDesc, SGPRDst)
- .addReg(VGPRSrc, 0, SubReg);
- const TargetRegisterClass *ConstrainRC =
- SubReg == AMDGPU::NoSubRegister
- ? OpRC
- : TRI->getMatchingSuperRegClass(MRI->getRegClass(VGPRSrc), OpRC,
- SubReg);
-
- if (!MRI->constrainRegClass(VGPRSrc, ConstrainRC))
- llvm_unreachable("failed to constrain register");
- } else {
- auto Result =
- BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::REG_SEQUENCE), SGPRDst);
- int N = Size / 32;
- for (int i = 0; i < N; i++) {
- Register PartialSrc = TII->buildExtractSubReg(
- Result, *MRI, MachineOperand::CreateReg(VGPRSrc, false), RC,
- TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
- Register PartialDst =
- MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(MBB, *Result, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- PartialDst)
- .addReg(PartialSrc);
- Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
- }
- }
-}
-
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
MachineBasicBlock::iterator &I) {
Register DstReg = MI.getOperand(0).getReg();
@@ -1069,43 +1003,8 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
} else if (Inst->getNumExplicitDefs() != 0) {
Register Reg = Inst->getOperand(0).getReg();
if (Reg.isVirtual() && TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
- SmallVector<MachineInstr *, 4> InlineAsmUsers;
- for (auto &U : MRI->use_instructions(Reg)) {
- if (U.isInlineAsm())
- InlineAsmUsers.push_back(&U);
- else
- Users.push_back(&U);
- }
- for (auto *U : InlineAsmUsers) {
- // Inline assembly operands with SGPR constraints cannot be handled by
- // the VALU conversion. If we convert the definition to VALU, we must
- // insert a readfirstlane to restore the SGPR for the inline asm use.
- MachineBasicBlock *MBB = U->getParent();
- const DebugLoc &DL = U->getDebugLoc();
-
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- Register VGPR =
- MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(RC));
- MachineInstr *NewCopy =
- BuildMI(*MBB, *U, DL, TII->get(AMDGPU::COPY), VGPR).addReg(Reg);
- Users.push_back(NewCopy);
-
- unsigned Size = TRI->getRegSizeInBits(*RC);
- Register SGPR =
- Size == 16
- ? MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass)
- : (Size == 32 ? MRI->createVirtualRegister(
- &AMDGPU::SReg_32_XM0RegClass)
- : MRI->createVirtualRegister(RC));
-
- insertReadFirstLane(VGPR, SGPR, RC, *MBB, *U, DL);
-
- for (unsigned i = 0; i < U->getNumOperands(); ++i) {
- MachineOperand &MO = U->getOperand(i);
- if (MO.isReg() && MO.getReg() == Reg && MO.isUse())
- MO.setReg(SGPR);
- }
- }
+ for (auto &U : MRI->use_instructions(Reg))
+ Users.push_back(&U);
}
}
for (auto *U : Users) {
@@ -1210,13 +1109,63 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
<< " is being turned to v_readfirstlane_b32"
<< " Score: " << C.second.Score << "\n");
Register DstReg = MI->getOperand(0).getReg();
+ MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+
Register SrcReg = MI->getOperand(1).getReg();
unsigned SubReg = MI->getOperand(1).getSubReg();
const TargetRegisterClass *SrcRC =
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
+ size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
+ if (SrcSize == 16) {
+ assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
+ "We do not expect to see 16-bit copies from VGPR to SGPR unless "
+ "we have 16-bit VGPRs");
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
+ MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
+ // There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
+ MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+ Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI->getDebugLoc();
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
+ .addReg(SrcReg, 0, SubReg)
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(VReg32);
+ } else if (SrcSize == 32) {
+ const MCInstrDesc &ReadFirstLaneDesc =
+ TII->get(AMDGPU::V_READFIRSTLANE_B32);
+ const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
+ .addReg(SrcReg, 0, SubReg);
+
+ const TargetRegisterClass *ConstrainRC =
+ SubReg == AMDGPU::NoSubRegister
+ ? OpRC
+ : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
+ SubReg);
- insertReadFirstLane(SrcReg, DstReg, SrcRC, *MBB, MI, MI->getDebugLoc(),
- SubReg);
+ if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
+ llvm_unreachable("failed to constrain register");
+ } else {
+ auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+ int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+ for (int i = 0; i < N; i++) {
+ Register PartialSrc = TII->buildExtractSubReg(
+ Result, *MRI, MI->getOperand(1), SrcRC,
+ TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+ Register PartialDst =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, *Result, Result->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+ .addReg(PartialSrc);
+ Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+ }
+ }
MI->eraseFromParent();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 17b64c0330318..055a27488d2ba 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -317,6 +317,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
Expand);
+ setOperationAction({ISD::INLINEASM, ISD::INLINEASM_BR}, MVT::Other, Custom);
+
#if 0
setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif
@@ -6932,6 +6934,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerSET_FPENV(Op, DAG);
case ISD::ROTR:
return lowerROTR(Op, DAG);
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
+ return LowerINLINEASM(Op, DAG);
}
return SDValue();
}
@@ -8305,6 +8310,84 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
+/// When a divergent value (in VGPR) is passed to an inline asm with an SGPR
+/// constraint ('s'), we need to insert v_readfirstlane to move the value from
+/// VGPR to SGPR. This is done by modifying the CopyToReg nodes in the glue
+/// chain that feed into the INLINEASM node.
+SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
+ unsigned NumOps = Op.getNumOperands();
+
+ if (Op.getOperand(NumOps - 1).getValueType() != MVT::Glue)
+ return Op;
+
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ DenseSet<Register> SGPRInputRegs;
+
+ for (unsigned I = InlineAsm::Op_FirstOperand; I < NumOps - 1;) {
+ const InlineAsm::Flag Flags(Op.getConstantOperandVal(I));
+ unsigned NumVals = Flags.getNumOperandRegisters();
+ ++I;
+
+ unsigned RCID;
+ bool IsSGPRInput = Flags.getKind() == InlineAsm::Kind::RegUse &&
+ NumVals > 0 && Flags.hasRegClassConstraint(RCID) &&
+ TRI->isSGPRClass(TRI->getRegClass(RCID));
+
+ for (unsigned J = 0; J < NumVals; ++J, ++I) {
+ if (!IsSGPRInput)
+ continue;
+
+ SDValue Val = Op.getOperand(I);
+ if (Val.getOpcode() != ISD::Register)
+ continue;
+
+ Register Reg = cast<RegisterSDNode>(Val.getNode())->getReg();
+ if (Reg.isVirtual())
+ SGPRInputRegs.insert(Reg);
+ }
+ }
+
+ if (SGPRInputRegs.empty())
+ return Op;
+
+ // Walk the glue chain and insert readfirstlane for divergent SGPR inputs.
+ SDLoc DL(Op);
+ SDNode *N = Op.getOperand(NumOps - 1).getNode();
+
+ while (N && N->getOpcode() == ISD::CopyToReg) {
+ Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ SDValue SrcVal = N->getOperand(2);
+
+ // Insert readfirstlane if copying a divergent value to an SGPR input.
+ if (SGPRInputRegs.count(Reg) && SrcVal->isDivergent()) {
+ SDValue ReadFirstLaneID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
+ SDValue ReadFirstLane =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, SrcVal.getValueType(),
+ ReadFirstLaneID, SrcVal);
+
+ SmallVector<SDValue, 4> Ops = {N->getOperand(0), N->getOperand(1),
+ ReadFirstLane};
+ if (N->getNumOperands() > 3)
+ Ops.push_back(N->getOperand(3)); // Glue input
+
+ DAG.UpdateNodeOperands(N, Ops);
+ }
+
+ // Follow glue chain to next CopyToReg.
+ SDNode *Next = nullptr;
+ for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
+ if (N->getOperand(I).getValueType() == MVT::Glue) {
+ Next = N->getOperand(I).getNode();
+ break;
+ }
+ }
+ N = Next;
+ }
+
+ return Op;
+}
+
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e82f4528fcd09..ff3e6150f537d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -188,6 +188,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
index ea6ed773ab00d..b76b79457d795 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-vgpr-sgpr-copy.ll
@@ -16,10 +16,10 @@ define <4 x float> @test_sgpr_constraint_bug(ptr addrspace(5) %buf_desc_ptr) {
; CHECK-NEXT: s_mov_b64 s[4:5], exec
; CHECK-NEXT: v_mov_b32_e32 v8, 1
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v4
-; CHECK-NEXT: v_readfirstlane_b32 s1, v5
-; CHECK-NEXT: v_readfirstlane_b32 s2, v6
; CHECK-NEXT: v_readfirstlane_b32 s3, v7
+; CHECK-NEXT: v_readfirstlane_b32 s2, v6
+; CHECK-NEXT: v_readfirstlane_b32 s1, v5
+; CHECK-NEXT: v_readfirstlane_b32 s0, v4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_cmpx_le_u32 exec, 1, v8
; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:0
@@ -108,7 +108,7 @@ define amdgpu_kernel void @inlineasm_and_waterfall_same_value(ptr addrspace(1) %
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[10:11]
; CHECK-NEXT: s_mov_b64 s[2:3], exec
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_readfirstlane_b32 s4, v0
; CHECK-NEXT: v_readfirstlane_b32 s5, v1
; CHECK-NEXT: v_readfirstlane_b32 s6, v2
@@ -121,7 +121,7 @@ define amdgpu_kernel void @inlineasm_and_waterfall_same_value(ptr addrspace(1) %
; CHECK-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1]
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
+; CHECK-NEXT: s_cbranch_execnz .LBB3_1
; CHECK-NEXT: ; %bb.2:
; CHECK-NEXT: s_mov_b64 exec, s[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -143,3 +143,34 @@ entry:
store i32 %combined, ptr addrspace(1) %out_ptr
ret void
}
+
+; Test case with multiple divergent SGPR inputs to verify each one gets its own v_readfirstlane
+define amdgpu_kernel void @multiple_divergent_sgpr_inputs(ptr addrspace(1) %out) {
+; CHECK-LABEL: multiple_divergent_sgpr_inputs:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_add_u32_e32 v1, 10, v0
+; CHECK-NEXT: v_add_u32_e32 v2, 20, v0
+; CHECK-NEXT: v_readfirstlane_b32 s2, v0
+; CHECK-NEXT: v_readfirstlane_b32 s3, v1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_add_u32 s2, s2, s3
+; CHECK-NEXT: s_add_u32 s2, s2, s4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %val1 = add i32 %tid, 10
+ %val2 = add i32 %tid, 20
+ ; Three separate divergent SGPR inputs
+ %result = call i32 asm sideeffect "s_add_u32 $0, $1, $2\0As_add_u32 $0, $0, $3", "=s,s,s,s"(i32 %tid, i32 %val1, i32 %val2)
+ %gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+ store i32 %result, ptr addrspace(1) %gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index bde7e107f23a5..7fd70de81af6f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -522,9 +522,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s4
+; GFX7-NEXT: ; use v0
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -537,9 +536,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -552,9 +550,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -565,10 +562,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -579,9 +574,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -593,10 +587,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s0
+; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -608,10 +600,8 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s0
+; GFX11-FAKE16-NEXT: ; use v0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -985,9 +975,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s4
+; GFX7-NEXT: ; use v0
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -1006,9 +995,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1028,9 +1016,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -1040,9 +1027,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -1059,9 +1045,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1077,10 +1062,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s0
+; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1097,11 +1080,10 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s0
+; GFX11-FAKE16-NEXT: ; use v0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1113,12 +1095,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index dd7ab20d9f23f..97eafd07d4b37 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -379,9 +379,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s4
+; GFX7-NEXT: ; use v0
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -393,9 +392,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -407,9 +405,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -418,10 +415,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_maximum3_f32 v0, v0, s1, s1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -431,9 +426,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX10-NEXT: v_max_f32_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -442,11 +436,10 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ; use v0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -734,15 +727,13 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX7-NEXT: v_max_f32_e32 v1, s17, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_mov_b32_e32 v1, s18
-; GFX7-NEXT: v_max_f32_e32 v3, s16, v1
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_max_f32_e32 v3, s16, v0
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:5]
+; GFX7-NEXT: ; use v[0:1]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -753,15 +744,13 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX8-NEXT: v_max_f32_e32 v1, s17, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s18
-; GFX8-NEXT: v_max_f32_e32 v3, s16, v1
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: v_max_f32_e32 v3, s16, v0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:5]
+; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -772,15 +761,13 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX900-NEXT: v_max_f32_e32 v1, s17, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_mov_b32_e32 v1, s18
-; GFX900-NEXT: v_max_f32_e32 v3, s16, v1
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v1
-; GFX900-NEXT: v_readfirstlane_b32 s5, v0
+; GFX900-NEXT: v_mov_b32_e32 v0, s18
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT: v_max_f32_e32 v3, s16, v0
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:5]
+; GFX900-NEXT: ; use v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -791,44 +778,37 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950-NEXT: v_maximum3_f32 v1, v0, s3, s3
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_maximum3_f32 v0, v0, s2, s2
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:1]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_maximum_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f32_e64 v0, s16, s18
-; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
-; GFX10-NEXT: v_max_f32_e64 v1, s17, s19
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_max_f32_e64 v0, s17, s19
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_max_f32_e64 v2, s16, s18
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:5]
+; GFX10-NEXT: ; use v[0:1]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_maximum_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f32_e64 v0, s0, s2
-; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
-; GFX11-NEXT: v_max_f32_e64 v1, s1, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_max_f32_e64 v0, s1, s3
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_max_f32_e64 v2, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:1]
+; GFX11-NEXT: ; use v[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 070e8746f97b7..3280d7aa9ddfe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -421,12 +421,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:5]
+; GFX7-NEXT: ; use v[0:1]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -438,12 +436,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:5]
+; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -455,12 +451,10 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: v_max_f64 v[2:3], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v1
-; GFX900-NEXT: v_readfirstlane_b32 s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:5]
+; GFX900-NEXT: ; use v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -474,10 +468,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:1]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -488,10 +480,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:5]
+; GFX10-NEXT: ; use v[0:1]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,11 +493,8 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:1]
+; GFX11-NEXT: ; use v[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -519,13 +506,9 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:1]
+; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -845,16 +828,12 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s6, v1
-; GFX7-NEXT: v_readfirstlane_b32 s7, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX7-NEXT: v_readfirstlane_b32 s4, v3
-; GFX7-NEXT: v_readfirstlane_b32 s5, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:7]
+; GFX7-NEXT: ; use v[0:3]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -870,16 +849,12 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: v_readfirstlane_b32 s7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8-NEXT: v_readfirstlane_b32 s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:7]
+; GFX8-NEXT: ; use v[0:3]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -895,16 +870,12 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_max_f64 v[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s6, v1
-; GFX900-NEXT: v_readfirstlane_b32 s7, v0
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX900-NEXT: v_readfirstlane_b32 s4, v3
-; GFX900-NEXT: v_readfirstlane_b32 s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ; use v[0:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -920,57 +891,44 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: v_readfirstlane_b32 s2, v2
-; GFX950-NEXT: v_readfirstlane_b32 s3, v3
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:3]
+; GFX950-NEXT: ; use v[0:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_maximum_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[20:21]
-; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21]
-; GFX10-NEXT: v_max_f64 v[2:3], s[18:19], s[22:23]
-; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[18:19], s[22:23]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s5
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_readfirstlane_b32 s7, v3
-; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_max_f64 v[0:1], s[18:19], s[22:23]
+; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
+; GFX10-NEXT: v_max_f64 v[4:5], s[16:17], s[20:21]
+; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:7]
+; GFX10-NEXT: ; use v[0:3]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_maximum_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[16:17]
+; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[18:19]
+; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
+; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: v_max_f64 v[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: v_cmp_u_f64_e64 s1, s[2:3], s[18:19]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:3]
+; GFX11-NEXT: ; use v[0:3]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -981,18 +939,11 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[16:17]
; GFX12-NEXT: v_maximum_f64 v[2:3], s[2:3], s[18:19]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_readfirstlane_b32 s2, v2
-; GFX12-NEXT: v_readfirstlane_b32 s3, v3
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[16:17]
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:3]
+; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 67660dac43e26..b5dab396f0bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -446,9 +446,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -461,9 +460,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -474,10 +472,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -488,9 +484,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -502,10 +497,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s0
+; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -517,10 +510,8 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s0
+; GFX11-FAKE16-NEXT: ; use v0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -817,9 +808,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -839,9 +829,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -851,9 +840,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -870,9 +858,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -888,10 +875,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s0
+; GFX11-TRUE16-NEXT: ; use v0
; GFX11-TRUE16-NEXT: ;;#ASMEND
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -908,11 +893,10 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s0
+; GFX11-FAKE16-NEXT: ; use v0
; GFX11-FAKE16-NEXT: ;;#ASMEND
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -924,12 +908,9 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 0743fad299450..3e98599fc4c7f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -379,9 +379,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v0
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s4
+; GFX7-NEXT: ; use v0
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -393,9 +392,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s4
+; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -407,9 +405,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v0
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s4
+; GFX900-NEXT: ; use v0
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -418,10 +415,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_minimum3_f32 v0, v0, s1, s1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s0
+; GFX950-NEXT: ; use v0
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -431,9 +426,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX10-NEXT: v_min_f32_e64 v0, s16, s17
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s17
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s4
+; GFX10-NEXT: ; use v0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -442,11 +436,10 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_min_f32_e64 v0, s0, s1
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ; use v0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -734,15 +727,13 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX7-NEXT: v_min_f32_e32 v1, s17, v0
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_mov_b32_e32 v1, s18
-; GFX7-NEXT: v_min_f32_e32 v3, s16, v1
-; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s18
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX7-NEXT: v_min_f32_e32 v3, s16, v0
+; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:5]
+; GFX7-NEXT: ; use v[0:1]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -753,15 +744,13 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX8-NEXT: v_min_f32_e32 v1, s17, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s18
-; GFX8-NEXT: v_min_f32_e32 v3, s16, v1
-; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s18
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: v_min_f32_e32 v3, s16, v0
+; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:5]
+; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -772,15 +761,13 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX900-NEXT: v_min_f32_e32 v1, s17, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s17, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT: v_mov_b32_e32 v1, s18
-; GFX900-NEXT: v_min_f32_e32 v3, s16, v1
-; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v1
-; GFX900-NEXT: v_readfirstlane_b32 s5, v0
+; GFX900-NEXT: v_mov_b32_e32 v0, s18
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT: v_min_f32_e32 v3, s16, v0
+; GFX900-NEXT: v_cmp_o_f32_e32 vcc, s16, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:5]
+; GFX900-NEXT: ; use v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -791,44 +778,37 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX950-NEXT: v_minimum3_f32 v1, v0, s3, s3
; GFX950-NEXT: v_mov_b32_e32 v0, s0
; GFX950-NEXT: v_minimum3_f32 v0, v0, s2, s2
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:1]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_minimum_v2f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f32_e64 v0, s16, s18
-; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
-; GFX10-NEXT: v_min_f32_e64 v1, s17, s19
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_min_f32_e64 v0, s17, s19
; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s17, s19
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
+; GFX10-NEXT: v_min_f32_e64 v2, s16, s18
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s16, s18
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:5]
+; GFX10-NEXT: ; use v[0:1]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_minimum_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f32_e64 v0, s0, s2
-; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
-; GFX11-NEXT: v_min_f32_e64 v1, s1, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_min_f32_e64 v0, s1, s3
; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_min_f32_e64 v2, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:1]
+; GFX11-NEXT: ; use v[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 72d0c70c1a83c..d07bd6c8dd902 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -421,12 +421,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX7-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7-NEXT: v_readfirstlane_b32 s5, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:5]
+; GFX7-NEXT: ; use v[0:1]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -438,12 +436,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX8-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:5]
+; GFX8-NEXT: ; use v[0:1]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -455,12 +451,10 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX900-NEXT: v_min_f64 v[2:3], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e32 vcc, s[16:17], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s4, v1
-; GFX900-NEXT: v_readfirstlane_b32 s5, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:5]
+; GFX900-NEXT: ; use v[0:1]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -474,10 +468,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:1]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
@@ -488,10 +480,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:5]
+; GFX10-NEXT: ; use v[0:1]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -503,11 +493,8 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:1]
+; GFX11-NEXT: ; use v[0:1]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -519,13 +506,9 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:1]
+; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -845,16 +828,12 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX7-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s6, v1
-; GFX7-NEXT: v_readfirstlane_b32 s7, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX7-NEXT: v_readfirstlane_b32 s4, v3
-; GFX7-NEXT: v_readfirstlane_b32 s5, v2
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX7-NEXT: ;;#ASMSTART
-; GFX7-NEXT: ; use s[4:7]
+; GFX7-NEXT: ; use v[0:3]
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -870,16 +849,12 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX8-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: v_readfirstlane_b32 s7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8-NEXT: v_readfirstlane_b32 s5, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX8-NEXT: ;;#ASMSTART
-; GFX8-NEXT: ; use s[4:7]
+; GFX8-NEXT: ; use v[0:3]
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -895,16 +870,12 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX900-NEXT: v_min_f64 v[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_cmp_u_f64_e64 s[4:5], s[16:17], v[0:1]
; GFX900-NEXT: v_mov_b32_e32 v6, 0x7ff80000
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
-; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX900-NEXT: v_readfirstlane_b32 s6, v1
-; GFX900-NEXT: v_readfirstlane_b32 s7, v0
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[4:5]
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v4, 0, s[4:5]
-; GFX900-NEXT: v_readfirstlane_b32 s4, v3
-; GFX900-NEXT: v_readfirstlane_b32 s5, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[4:5]
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX900-NEXT: ;;#ASMSTART
-; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ; use v[0:3]
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
@@ -920,57 +891,44 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; GFX950-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
-; GFX950-NEXT: v_readfirstlane_b32 s2, v2
-; GFX950-NEXT: v_readfirstlane_b32 s3, v3
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX950-NEXT: v_readfirstlane_b32 s1, v1
-; GFX950-NEXT: v_readfirstlane_b32 s0, v0
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use s[0:3]
+; GFX950-NEXT: ; use v[0:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_minimum_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_min_f64 v[0:1], s[16:17], s[20:21]
-; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[16:17], s[20:21]
-; GFX10-NEXT: v_min_f64 v[2:3], s[18:19], s[22:23]
-; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[18:19], s[22:23]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s5
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
-; GFX10-NEXT: v_readfirstlane_b32 s7, v3
-; GFX10-NEXT: v_readfirstlane_b32 s6, v2
+; GFX10-NEXT: v_min_f64 v[0:1], s[18:19], s[22:23]
+; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[18:19], s[22:23]
+; GFX10-NEXT: v_min_f64 v[4:5], s[16:17], s[20:21]
+; GFX10-NEXT: v_cmp_u_f64_e64 s5, s[16:17], s[20:21]
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s5
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s[4:7]
+; GFX10-NEXT: ; use v[0:3]
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_minimum_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f64 v[0:1], s[0:1], s[16:17]
+; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[18:19]
+; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[18:19]
+; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[16:17]
; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[16:17]
-; GFX11-NEXT: v_min_f64 v[2:3], s[2:3], s[18:19]
-; GFX11-NEXT: v_cmp_u_f64_e64 s1, s[2:3], s[18:19]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0x7ff80000, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s[0:3]
+; GFX11-NEXT: ; use v[0:3]
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -981,18 +939,11 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[16:17]
; GFX12-NEXT: v_minimum_f64 v[2:3], s[2:3], s[18:19]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_readfirstlane_b32 s2, v2
-; GFX12-NEXT: v_readfirstlane_b32 s3, v3
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[16:17]
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s[0:3]
+; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index b6233c2563a68..80813d6d81e3a 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -139,14 +139,12 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.1: ; %if.then
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split
@@ -173,7 +171,6 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_branch .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index e67cf4697beab..baa9c5843cc7f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -287,12 +287,12 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
@@ -315,8 +315,8 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
@@ -350,7 +350,6 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, 0x40000
-; MUBUF-NEXT: s_mov_b32 s5, 0x40000
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -359,12 +358,12 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
@@ -387,8 +386,8 @@ define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
@@ -529,12 +528,12 @@ define void @test_sgpr_offset_subregs_function() {
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
@@ -552,8 +551,8 @@ define void @test_sgpr_offset_subregs_function() {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
@@ -590,7 +589,6 @@ define void @test_inst_offset_subregs_function() {
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_add_i32 s4, s32, 0x40000
-; MUBUF-NEXT: s_add_i32 s5, s32, 0x40000
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -599,12 +597,13 @@ define void @test_inst_offset_subregs_function() {
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_add_i32 s4, s32, 0x40000
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_readfirstlane_b32 s4, v0
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_readfirstlane_b32 s5, v0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; s[4:5]
; MUBUF-NEXT: ;;#ASMEND
@@ -622,8 +621,8 @@ define void @test_inst_offset_subregs_function() {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: v_readfirstlane_b32 s1, v1
+; FLATSCR-NEXT: v_readfirstlane_b32 s0, v0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; s[0:1]
; FLATSCR-NEXT: ;;#ASMEND
>From 002bfde7e1989bd0a05cc60348f6df212e7e36a6 Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vjayakum at amd.com>
Date: Thu, 29 Jan 2026 23:44:05 -0600
Subject: [PATCH 5/5] removed inlineasm_br
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +--
llvm/test/CodeGen/AMDGPU/callbr.ll | 3 +--
llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 4 +---
llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll | 1 -
.../CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll | 5 +++--
5 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 055a27488d2ba..838ad1f89daa9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -317,7 +317,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
Expand);
- setOperationAction({ISD::INLINEASM, ISD::INLINEASM_BR}, MVT::Other, Custom);
+ setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
#if 0
setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
@@ -6935,7 +6935,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ROTR:
return lowerROTR(Op, DAG);
case ISD::INLINEASM:
- case ISD::INLINEASM_BR:
return LowerINLINEASM(Op, DAG);
}
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
index deb557d0ffc3b..253a6ec100eae 100644
--- a/llvm/test/CodeGen/AMDGPU/callbr.ll
+++ b/llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -6,9 +6,8 @@ define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_load_dword v0, v[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s4, v6
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_cmp_gt_i32 vcc s4, 42; s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.1: ; %fallthrough
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index f6fe5bde73c32..df635925b87df 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc -mtriple=amdgcn -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
@@ -120,7 +120,6 @@ define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: ; %bb.1: ; %loop.preheader
@@ -355,7 +354,6 @@ define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out)
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: ; %bb.1: ; %outer_loop.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index a1c6a398ccf50..01bcdad3fc220 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -97,7 +97,6 @@ define void @nested_inf_loop_callbr(i32 %0, i32 %1) {
; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5
; ISA-NEXT: .LBB1_1: ; %BB1
; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
-; ISA-NEXT: v_readfirstlane_b32 s8, v0
; ISA-NEXT: ;;#ASMSTART
; ISA-NEXT: ;;#ASMEND
; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 80813d6d81e3a..004c27971131d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s
@@ -96,6 +96,7 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
; UNIFY-NEXT: br label [[IF_END6]]
; UNIFY: if.end6:
; UNIFY-NEXT: ret void
+;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %n, 256
@@ -164,7 +165,6 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; CHECK-NEXT: ; Label of block must be emitted
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.5: ; %if.then3
@@ -218,6 +218,7 @@ define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 nounde
; UNIFY-NEXT: to label [[IF_END6:%.*]] []
; UNIFY: if.end6:
; UNIFY-NEXT: ret void
+;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %n, 256
More information about the llvm-commits
mailing list