[llvm] [AMDGPU] Insert readfirstlane for function returns in SGPRs. (PR #135326)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 21 04:39:53 PDT 2025
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/135326
>From 627f6d8d86ee99b902d01f4497b8a517dba0741b Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 11 Apr 2025 13:48:46 +0530
Subject: [PATCH 1/9] While lowering return, introduce readfirstlane to copy
 the intermediate result to the output register
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 ++++-
llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll | 21 ++++++++++++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bd95bcd89e183..bbef279dc478a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3251,6 +3251,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ SDValue ReadFirstLane =
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
// Copy the result values into the output registers.
for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
++I, ++RealRVLocIdx) {
@@ -3278,7 +3280,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
default:
llvm_unreachable("Unknown loc info!");
}
-
+ Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
+ ReadFirstLane, Arg);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
diff --git a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
new file mode 100644
index 0000000000000..34b5a3f358225
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
+ ; GFX11-LABEL: name: s_copysign_f32_bf16
+ ; GFX11: bb.0 (%ir-block.0):
+ ; GFX11-NEXT: liveins: $sgpr0, $sgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
+ ; GFX11-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_]], [[COPY1]], killed [[V_LSHLREV_B32_e64_]], implicit $exec
+ ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[V_BFI_B32_e64_]], implicit $exec
+ ; GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX11-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+ %sign = fpext bfloat %sign.bf16 to float
+ %op = call float @llvm.copysign.f32(float %mag, float %sign)
+ %cast = bitcast float %op to i32
+ ret i32 %cast
+}
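For context: VALU results live in VGPRs and may be divergent, while the
amdgpu_ps calling convention can return values in SGPRs. The only way to
move a value from a VGPR into an SGPR is a lane read, so a plain COPY into
an SGPR return register is illegal. The lowering above therefore wraps each
returned value in an amdgcn_readfirstlane node before the CopyToReg, which
later selects to V_READFIRSTLANE_B32. A minimal IR reproducer of the
pattern (a hypothetical example, not part of the patch):

; A VALU-produced value returned through an SGPR; with this change,
; llc -mtriple=amdgcn -mcpu=gfx1100 should emit a v_readfirstlane_b32
; into s0 before the shader epilog.
define amdgpu_ps i32 @return_valu_result_in_sgpr(float inreg %a, float %b) {
  %sum = fadd float %a, %b          ; %b arrives in a VGPR, so this is a VALU op
  %cast = bitcast float %sum to i32
  ret i32 %cast                     ; returned in $sgpr0
}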
>From 56b057167df1682df7ba7471481701fbae59ef12 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 11 Apr 2025 19:58:10 +0530
Subject: [PATCH 2/9] Update affected tests
---
.../AMDGPU/add64-low-32-bits-known-zero.ll | 4 +-
.../CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll | 48 +-
llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 30 +-
.../AMDGPU/dag-preserve-disjoint-flag.ll | 44 +-
llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll | 106 ++-
.../CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 18 +-
.../CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 18 +-
llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll | 836 +++++++++---------
.../AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll | 64 +-
llvm/test/CodeGen/AMDGPU/ptrmask.ll | 4 +-
.../CodeGen/AMDGPU/sdag-print-divergence.ll | 15 +-
.../AMDGPU/sub64-low-32-bits-known-zero.ll | 4 +-
.../Inputs/amdgpu_isel.ll.expected | 60 +-
13 files changed, 696 insertions(+), 555 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
index 52259c4c2e6e1..1d51b8a077566 100644
--- a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
+++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
@@ -148,8 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
ret <2 x i64> %add
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 2
+; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
ret <2 x i64> %add
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 6885657bbfa36..37928a78622a6 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -110,9 +110,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY8]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY9]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -136,9 +138,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -162,9 +166,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY9]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY10]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -190,9 +196,11 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY10]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY11]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -334,9 +342,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY12]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY13]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -366,9 +376,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
@@ -398,9 +410,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY13]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY14]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret double %ret
@@ -432,9 +446,11 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY14]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY15]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 661af021e8a84..af4ca2ad7120a 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -168,26 +168,26 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) {
; CHECK-LABEL: s_csh_v4i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s8, s0, s4
-; CHECK-NEXT: s_lshl_b32 s9, s1, s5
-; CHECK-NEXT: s_lshl_b32 s10, s2, s6
-; CHECK-NEXT: s_lshl_b32 s11, s3, s7
-; CHECK-NEXT: s_lshr_b32 s12, s0, s4
-; CHECK-NEXT: s_lshr_b32 s13, s1, s5
-; CHECK-NEXT: s_lshr_b32 s14, s2, s6
-; CHECK-NEXT: s_lshr_b32 s15, s3, s7
-; CHECK-NEXT: s_ashr_i32 s3, s3, s7
-; CHECK-NEXT: s_ashr_i32 s2, s2, s6
-; CHECK-NEXT: s_ashr_i32 s1, s1, s5
+; CHECK-NEXT: s_lshl_b32 s8, s3, s7
+; CHECK-NEXT: s_lshl_b32 s9, s2, s6
+; CHECK-NEXT: s_lshl_b32 s10, s1, s5
+; CHECK-NEXT: s_lshl_b32 s11, s0, s4
+; CHECK-NEXT: s_lshr_b32 s12, s3, s7
+; CHECK-NEXT: s_lshr_b32 s13, s2, s6
+; CHECK-NEXT: s_lshr_b32 s14, s1, s5
+; CHECK-NEXT: s_lshr_b32 s15, s0, s4
; CHECK-NEXT: s_ashr_i32 s0, s0, s4
+; CHECK-NEXT: s_ashr_i32 s1, s1, s5
+; CHECK-NEXT: s_ashr_i32 s2, s2, s6
+; CHECK-NEXT: s_ashr_i32 s3, s3, s7
; CHECK-NEXT: s_add_i32 s4, s11, s15
; CHECK-NEXT: s_add_i32 s5, s10, s14
; CHECK-NEXT: s_add_i32 s6, s9, s13
; CHECK-NEXT: s_add_i32 s7, s8, s12
-; CHECK-NEXT: s_add_i32 s0, s7, s0
-; CHECK-NEXT: s_add_i32 s1, s6, s1
-; CHECK-NEXT: s_add_i32 s2, s5, s2
-; CHECK-NEXT: s_add_i32 s3, s4, s3
+; CHECK-NEXT: s_add_i32 s3, s7, s3
+; CHECK-NEXT: s_add_i32 s2, s6, s2
+; CHECK-NEXT: s_add_i32 s1, s5, s1
+; CHECK-NEXT: s_add_i32 s0, s4, s0
; CHECK-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: s_csh_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index 4b4718a2acb80..d63a36c4b2958 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -10,8 +10,10 @@ define amdgpu_ps i32 @s_or_i32_disjoint(i32 inreg %a, i32 inreg %b) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: %3:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY %3
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0
%result = or disjoint i32 %a, %b
ret i32 %result
@@ -26,10 +28,14 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; CHECK-NEXT: %5:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
- ; CHECK-NEXT: %6:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY %5
- ; CHECK-NEXT: $sgpr1 = COPY %6
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
+ ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
@@ -42,8 +48,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY %10
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%result = or disjoint i32 %a, %b
ret i32 %result
@@ -58,10 +64,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
- ; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY %12
- ; CHECK-NEXT: $vgpr1 = COPY %13
+ ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
+ ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
+ ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%result = or disjoint <2 x i32> %a, %b
ret <2 x i32> %result
@@ -78,11 +84,15 @@ define amdgpu_ps i64 @s_or_i64_disjoint(i64 inreg %a, i64 inreg %b) {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; CHECK-NEXT: %7:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY %7.sub1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY %7.sub0
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY5]]
- ; CHECK-NEXT: $sgpr1 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%result = or disjoint i64 %a, %b
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
index 34b5a3f358225..e47888ed78b58 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
@@ -1,21 +1,91 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s -check-prefixes=GFX11
-
-define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
- ; GFX11-LABEL: name: s_copysign_f32_bf16
- ; GFX11: bb.0 (%ir-block.0):
- ; GFX11-NEXT: liveins: $sgpr0, $sgpr1
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
- ; GFX11-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_]], [[COPY1]], killed [[V_LSHLREV_B32_e64_]], implicit $exec
- ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[V_BFI_B32_e64_]], implicit $exec
- ; GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX11-NEXT: SI_RETURN_TO_EPILOG $sgpr0
- %sign = fpext bfloat %sign.bf16 to float
- %op = call float @llvm.copysign.f32(float %mag, float %sign)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps i32 @s_copysign_uniform(float inreg %x, float inreg %y) {
+; GFX11-LABEL: s_copysign_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %op = call float @llvm.copysign.f32(float %x, float %y)
%cast = bitcast float %op to i32
ret i32 %cast
}
+
+define amdgpu_ps i64 @uniform_vbfi_val_op(i32 inreg %a, i32 inreg %b) {
+; GFX11-LABEL: uniform_vbfi_val_op:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, llvm.amdgcn.bfi.i32@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, llvm.amdgcn.bfi.i32@gotpcrel32@hi+12
+; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: s_mov_b64 s[8:9], 36
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: ; return to shader part epilog
+ %mask = xor i32 -1, 0
+ %bfi = call i32 @llvm.amdgcn.bfi.i32(i32 %mask, i32 %a, i32 %b)
+ %ext = zext i32 %bfi to i64
+ ret i64 %ext
+}
+
+declare i32 @llvm.amdgcn.bfi.i32(i32, i32, i32)
+
+
+define amdgpu_ps <2 x i32> @s_uniform_val_v2i32(<2 x i32> inreg %x, <2 x i32> inreg %y) {
+; GFX11-LABEL: s_uniform_val_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_getpc_b64 s[4:5]
+; GFX11-NEXT: s_add_u32 s4, s4, llvm.amdgcn.bfi.v2i32@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s5, s5, llvm.amdgcn.bfi.v2i32@gotpcrel32@hi+12
+; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX11-NEXT: s_mov_b64 s[8:9], 36
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %mask = xor <2 x i32> <i32 -1, i32 -1>, zeroinitializer
+ %bfi = call <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32> %mask, <2 x i32> %x, <2 x i32> %y)
+ ret <2 x i32> %bfi
+}
+
+declare <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+
+define amdgpu_ps ptr @s_uniform_val_ptr(ptr inreg %base) {
+; GFX11-LABEL: s_uniform_val_ptr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, llvm.amdgcn.bfi.i32@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, llvm.amdgcn.bfi.i32@gotpcrel32@hi+12
+; GFX11-NEXT: s_add_i32 s0, s0, 16
+; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_mov_b64 s[8:9], 36
+; GFX11-NEXT: s_mov_b32 s32, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: ; return to shader part epilog
+ %gep = getelementptr i8, ptr %base, i32 16
+ %cast = ptrtoint ptr %gep to i32
+ %mask = xor i32 -1, 0
+ %val = call i32 @llvm.amdgcn.bfi.i32(i32 %mask, i32 %cast, i32 42)
+ %resptr = inttoptr i32 %val to ptr
+ ret ptr %resptr
+}
+
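The updated checks show the same uniformization at the assembly level: each
SGPR return component gets its own v_readfirstlane_b32 before the shader
part epilog. The same lane read can also be requested explicitly in IR; a
minimal sketch, assuming the overloaded llvm.amdgcn.readfirstlane.i32
intrinsic (the function name below is illustrative):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

define amdgpu_ps i32 @manual_readfirstlane(i32 %v) {
  ; Explicitly make the (potentially divergent) VGPR value uniform before
  ; returning it in an SGPR; this patch makes LowerReturn emit the
  ; readfirstlane automatically.
  %u = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v)
  ret i32 %u
}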
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index 36714b386e7e5..f2f8c0a5cfa8f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -44,9 +44,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
ret double %ret
@@ -117,9 +119,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %ret
@@ -144,9 +148,11 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr,
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index fa4e7f87853dd..682c1cd8060aa 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -62,9 +62,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
; GFX90A-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
; GFX90A-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub0
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
; GFX90A-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI2]].sub1
- ; GFX90A-NEXT: $sgpr0 = COPY [[COPY12]]
- ; GFX90A-NEXT: $sgpr1 = COPY [[COPY13]]
+ ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
+ ; GFX90A-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
;
; GFX942-LABEL: name: global_atomic_fadd_f64_rtn_atomicrmw
@@ -81,9 +83,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_rtn_atomicrmw(ptr addrspace(1) %
; GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
; GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
; GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
; GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_RTN]].sub1
- ; GFX942-NEXT: $sgpr0 = COPY [[COPY6]]
- ; GFX942-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec
+ ; GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic
ret double %ret
@@ -123,9 +127,11 @@ define amdgpu_ps double @global_atomic_fadd_f64_saddr_rtn_atomicrmw(ptr addrspac
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN:%[0-9]+]]:vreg_64_align2 = GLOBAL_ATOMIC_ADD_F64_SADDR_RTN killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 1, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub0
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_ADD_F64_SADDR_RTN]].sub1
- ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[COPY5]]
- ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[COPY6]]
+ ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec
+ ; GFX90A_GFX942-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX90A_GFX942-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret double %ret
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index c88113d62a887..e82801eadc936 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -358,8 +358,8 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX7-LABEL: s_uitofp_v2i1_to_v2bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s2, 1, s0
-; GFX7-NEXT: s_bitcmp1_b32 s1, 0
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
@@ -367,14 +367,14 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7-NEXT: v_readfirstlane_b32 s1, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uitofp_v2i1_to_v2bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s2, 1, s0
-; GFX9-NEXT: s_bitcmp1_b32 s1, 0
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
@@ -396,77 +396,75 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_uitofp_v2i1_to_v2bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: s_bitcmp1_b32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_uitofp_v2i1_to_v2bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
-; GFX12-NEXT: s_bitcmp1_b32 s1, 0
-; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s1, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
%op = uitofp <2 x i1> %num to <2 x bfloat>
@@ -706,11 +704,11 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX7-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s4, 1, s0
-; GFX7-NEXT: s_and_b32 s3, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s2, 0
+; GFX7-NEXT: s_and_b32 s4, 1, s2
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT: s_cmp_eq_u32 s3, 1
+; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -720,18 +718,18 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s4, 1, s0
-; GFX9-NEXT: s_and_b32 s3, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_and_b32 s4, 1, s2
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s3, 1
+; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -759,99 +757,96 @@ define amdgpu_ps <3 x i32> @s_uitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: s_and_b32 s2, 1, s2
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s2, 0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
+; GFX11-NEXT: s_cmp_eq_u32 s2, 1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_uitofp_v3i1_to_v3bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
+; GFX12-NEXT: s_and_b32 s2, 1, s2
; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s2, 0
-; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
+; GFX12-NEXT: s_cmp_eq_u32 s2, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
+; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_readfirstlane_b32 s1, v0
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1154,10 +1149,10 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX7-LABEL: s_uitofp_v4i1_to_v4bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s6, 1, s0
-; GFX7-NEXT: s_and_b32 s4, 1, s1
-; GFX7-NEXT: s_and_b32 s2, 1, s2
-; GFX7-NEXT: s_bitcmp1_b32 s3, 0
+; GFX7-NEXT: s_and_b32 s6, 1, s3
+; GFX7-NEXT: s_and_b32 s4, 1, s2
+; GFX7-NEXT: s_and_b32 s2, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -1173,18 +1168,18 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NEXT: v_readfirstlane_b32 s2, v2
-; GFX7-NEXT: v_readfirstlane_b32 s3, v3
+; GFX7-NEXT: v_readfirstlane_b32 s0, v3
+; GFX7-NEXT: v_readfirstlane_b32 s1, v2
+; GFX7-NEXT: v_readfirstlane_b32 s2, v1
+; GFX7-NEXT: v_readfirstlane_b32 s3, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uitofp_v4i1_to_v4bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s6, 1, s0
-; GFX9-NEXT: s_and_b32 s4, 1, s1
-; GFX9-NEXT: s_and_b32 s2, 1, s2
-; GFX9-NEXT: s_bitcmp1_b32 s3, 0
+; GFX9-NEXT: s_and_b32 s6, 1, s3
+; GFX9-NEXT: s_and_b32 s4, 1, s2
+; GFX9-NEXT: s_and_b32 s2, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -1216,7 +1211,7 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_readfirstlane_b32 s3, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
@@ -1225,133 +1220,126 @@ define amdgpu_ps <4 x i32> @s_uitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
+; GFX9-NEXT: v_readfirstlane_b32 s0, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_uitofp_v4i1_to_v4bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_and_b32 s3, 1, s3
; GFX11-NEXT: s_and_b32 s2, 1, s2
-; GFX11-NEXT: s_bitcmp1_b32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_and_b32 s1, 1, s1
+; GFX11-NEXT: s_bitcmp1_b32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s0, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s1
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s2, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s3, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_uitofp_v4i1_to_v4bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s0, 1, s0
-; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_and_b32 s3, 1, s3
; GFX12-NEXT: s_and_b32 s2, 1, s2
-; GFX12-NEXT: s_bitcmp1_b32 s3, 0
-; GFX12-NEXT: s_cselect_b32 s3, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s2, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_and_b32 s1, 1, s1
+; GFX12-NEXT: s_bitcmp1_b32 s0, 0
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s2
+; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s0, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s1
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s2, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s1
+; GFX12-NEXT: s_cselect_b32 s2, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s3, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s2
+; GFX12-NEXT: s_cselect_b32 s3, -1, 0
; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v4
; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
-; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
-; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s2, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_readfirstlane_b32 s2, v3
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_readfirstlane_b32 s3, v1
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12-NEXT: v_readfirstlane_b32 s3, v0
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
%op = uitofp <4 x i1> %num to <4 x bfloat>
@@ -1712,8 +1700,8 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX7-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s2, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s0, 0
+; GFX7-NEXT: s_and_b32 s2, 1, s0
+; GFX7-NEXT: s_bitcmp1_b32 s1, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
@@ -1721,14 +1709,14 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s[0:1]
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s0, v1
+; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s2, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s0, 0
+; GFX9-NEXT: s_and_b32 s2, 1, s0
+; GFX9-NEXT: s_bitcmp1_b32 s1, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1]
@@ -1750,75 +1738,77 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: s_bitcmp1_b32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_sitofp_v2i1_to_v2bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s0, 0
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX12-NEXT: s_and_b32 s0, 1, s0
+; GFX12-NEXT: s_bitcmp1_b32 s1, 0
+; GFX12-NEXT: s_cselect_b32 s1, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX12-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v0
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
%op = sitofp <2 x i1> %num to <2 x bfloat>
@@ -2058,11 +2048,11 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX7-LABEL: s_sitofp_v3i1_to_v3bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s4, 1, s2
-; GFX7-NEXT: s_and_b32 s2, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s0, 0
+; GFX7-NEXT: s_and_b32 s4, 1, s0
+; GFX7-NEXT: s_and_b32 s3, 1, s1
+; GFX7-NEXT: s_bitcmp1_b32 s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT: s_cmp_eq_u32 s2, 1
+; GFX7-NEXT: s_cmp_eq_u32 s3, 1
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s4, 1
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -2072,18 +2062,18 @@ define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v2
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
-; GFX7-NEXT: v_readfirstlane_b32 s2, v0
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_sitofp_v3i1_to_v3bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s4, 1, s2
-; GFX9-NEXT: s_and_b32 s2, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s0, 0
+; GFX9-NEXT: s_and_b32 s4, 1, s0
+; GFX9-NEXT: s_and_b32 s3, 1, s1
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 1
+; GFX9-NEXT: s_cmp_eq_u32 s3, 1
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
@@ -2111,96 +2101,99 @@ define amdgpu_ps <3 x i32> @s_sitofp_v3i1_to_v3bf16(<3 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v2
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_sitofp_v3i1_to_v3bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s2, 1, s2
+; GFX11-NEXT: s_and_b32 s0, 1, s0
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_bitcmp1_b32 s2, 0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
+; GFX11-NEXT: s_cmp_eq_u32 s0, 1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s0
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_sitofp_v3i1_to_v3bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s2, 1, s2
+; GFX12-NEXT: s_and_b32 s0, 1, s0
; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s0, 0
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; GFX12-NEXT: s_bitcmp1_b32 s2, 0
+; GFX12-NEXT: s_cselect_b32 s2, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
+; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s2
; GFX12-NEXT: s_cselect_b32 s1, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s2, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_eq_u32 s0, 1
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s1
; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, s0
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
+; GFX12-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v4, v4, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v3, 0x7fff, v3
-; GFX12-NEXT: v_dual_cndmask_b32 v0, v3, v7 :: v_dual_add_nc_u32 v5, v5, v2
+; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_readfirstlane_b32 s1, v0
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2
@@ -2503,10 +2496,10 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX7-LABEL: s_sitofp_v4i1_to_v4bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_and_b32 s6, 1, s3
-; GFX7-NEXT: s_and_b32 s4, 1, s2
-; GFX7-NEXT: s_and_b32 s2, 1, s1
-; GFX7-NEXT: s_bitcmp1_b32 s0, 0
+; GFX7-NEXT: s_and_b32 s6, 1, s0
+; GFX7-NEXT: s_and_b32 s4, 1, s1
+; GFX7-NEXT: s_and_b32 s2, 1, s2
+; GFX7-NEXT: s_bitcmp1_b32 s3, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_cmp_eq_u32 s2, 1
; GFX7-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -2522,18 +2515,18 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v3
-; GFX7-NEXT: v_readfirstlane_b32 s1, v2
-; GFX7-NEXT: v_readfirstlane_b32 s2, v1
-; GFX7-NEXT: v_readfirstlane_b32 s3, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s2, v2
+; GFX7-NEXT: v_readfirstlane_b32 s3, v3
; GFX7-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_sitofp_v4i1_to_v4bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s6, 1, s3
-; GFX9-NEXT: s_and_b32 s4, 1, s2
-; GFX9-NEXT: s_and_b32 s2, 1, s1
-; GFX9-NEXT: s_bitcmp1_b32 s0, 0
+; GFX9-NEXT: s_and_b32 s6, 1, s0
+; GFX9-NEXT: s_and_b32 s4, 1, s1
+; GFX9-NEXT: s_and_b32 s2, 1, s2
+; GFX9-NEXT: s_bitcmp1_b32 s3, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 1
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -2565,7 +2558,7 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v4, 0x7fff, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
+; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s[0:1]
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
@@ -2574,126 +2567,133 @@ define amdgpu_ps <4 x i32> @s_sitofp_v4i1_to_v4bf16(<4 x i1> inreg %num) {
; GFX9-NEXT: v_add_u32_e32 v5, 0x7fff, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 16, v2
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 16, v3
-; GFX9-NEXT: v_readfirstlane_b32 s1, v2
-; GFX9-NEXT: v_readfirstlane_b32 s0, v3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_sitofp_v4i1_to_v4bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s3, 1, s3
-; GFX11-NEXT: s_and_b32 s2, 1, s2
+; GFX11-NEXT: s_and_b32 s0, 1, s0
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_bitcmp1_b32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s1, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_and_b32 s2, 1, s2
+; GFX11-NEXT: s_bitcmp1_b32 s3, 0
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s3, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s2
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s1, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2
+; GFX11-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s0, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s3
-; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX11-NEXT: v_add_nc_u32_e32 v5, v5, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v0
; GFX11-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v3
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s1, v3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_sitofp_v4i1_to_v4bf16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_and_b32 s3, 1, s3
-; GFX12-NEXT: s_and_b32 s2, 1, s2
+; GFX12-NEXT: s_and_b32 s0, 1, s0
; GFX12-NEXT: s_and_b32 s1, 1, s1
-; GFX12-NEXT: s_bitcmp1_b32 s0, 0
-; GFX12-NEXT: s_cselect_b32 s0, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s1, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s0
-; GFX12-NEXT: s_cselect_b32 s1, -1, 0
+; GFX12-NEXT: s_and_b32 s2, 1, s2
+; GFX12-NEXT: s_bitcmp1_b32 s3, 0
+; GFX12-NEXT: s_cselect_b32 s3, -1, 0
; GFX12-NEXT: s_cmp_eq_u32 s2, 1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s1
+; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s3
; GFX12-NEXT: s_cselect_b32 s2, -1, 0
-; GFX12-NEXT: s_cmp_eq_u32 s3, 1
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s2
-; GFX12-NEXT: s_cselect_b32 s3, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s1, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, -1.0, s2
+; GFX12-NEXT: s_cselect_b32 s1, -1, 0
+; GFX12-NEXT: s_cmp_eq_u32 s0, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, s1
+; GFX12-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s3
-; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX12-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v7, v7, v4
+; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
; GFX12-NEXT: v_add_nc_u32_e32 v6, v6, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x7fff, v7
-; GFX12-NEXT: v_add_nc_u32_e32 v5, v5, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x7fff, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v2, v2, v0
; GFX12-NEXT: v_add_nc_u32_e32 v5, 0x7fff, v5
-; GFX12-NEXT: v_dual_cndmask_b32 v1, v5, v9 :: v_dual_add_nc_u32 v2, v2, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v2
-; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v9, vcc_lo
+; GFX12-NEXT: v_add_nc_u32_e32 v2, 0x7fff, v6
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v6, v11, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v3
-; GFX12-NEXT: v_ashrrev_i32_e32 v3, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX12-NEXT: v_readfirstlane_b32 s0, v2
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: v_ashrrev_i32_e32 v2, 16, v2
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12-NEXT: v_readfirstlane_b32 s3, v0
+; GFX12-NEXT: v_readfirstlane_b32 s1, v2
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 16, v3
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s2, v3
+; GFX12-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s3, v1
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
%op = sitofp <4 x i1> %num to <4 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
index 3aa5ea995559f..dfde10329fe80 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
@@ -11,12 +11,18 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234
- ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]]
- ; CHECK-NEXT: $sgpr1 = COPY [[S_AND_B32_]]
- ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]]
- ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]]
+ ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_1]]
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_3]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_4]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
%rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678)
ret ptr addrspace(8) %rsrc
@@ -52,12 +58,18 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) {
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY2]], implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1234
- ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]]
- ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]]
- ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]]
- ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]]
+ ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_2]]
+ ; CHECK-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 5678
+ ; CHECK-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_4]]
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_3]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_5]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
%rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678)
ret ptr addrspace(8) %rsrc
@@ -76,10 +88,18 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], killed [[S_MOV_B32_]], implicit-def dead $scc
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY3]]
- ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]]
- ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]]
- ; CHECK-NEXT: $sgpr3 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
%rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags)
ret ptr addrspace(8) %rsrc
@@ -99,10 +119,18 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride,
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc
- ; CHECK-NEXT: $sgpr0 = COPY [[COPY4]]
- ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]]
- ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]]
- ; CHECK-NEXT: $sgpr3 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]]
+ ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]]
+ ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
%rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags)
ret ptr addrspace(8) %rsrc
diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
index 8594549318dda..9ad9c80d82ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
@@ -186,8 +186,10 @@ define ptr addrspace(8) @v_ptrmask_buffer_resource_variable_i128_neg8(ptr addrsp
define amdgpu_ps ptr addrspace(8) @s_ptrmask_buffer_resource_variable_i128(ptr addrspace(8) inreg %ptr, i128 inreg %mask) {
; GCN-LABEL: s_ptrmask_buffer_resource_variable_i128:
; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7]
-; GCN-NEXT: s_and_b64 s[2:3], s[4:5], s[8:9]
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ptrmask_buffer_resource_variable_i128:
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index 695d5225421de..822ee94fdeebb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel 2>&1 | FileCheck --check-prefixes=GCN,GCN-DEFAULT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -verify-machineinstrs < %s -debug-only=isel -dag-dump-verbose 2>&1 | FileCheck --check-prefixes=GCN,GCN-VERBOSE %s
@@ -5,23 +6,25 @@
; GCN-LABEL: === test_sdag_dump
; GCN: Initial selection DAG: %bb.0 'test_sdag_dump:entry'
-; GCN: SelectionDAG has 10 nodes:
+; GCN: SelectionDAG has 11 nodes:
; GCN-DEFAULT: t0: ch,glue = EntryToken
; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0
+; GCN-DEFAULT: t7: i32 = TargetConstant<3139>
; GCN-DEFAULT: t5: f32 = fadd t2, t2
; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
-; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
-; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
-; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG t8, Register:f32 $vgpr0, t8:1
+; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
+; GCN-DEFAULT: t9: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6
+; GCN-DEFAULT: t10: ch = RETURN_TO_EPILOG t9, Register:f32 $vgpr0, t9:1
; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
+; GCN-VERBOSE: t7: i32 = TargetConstant<3139>
; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2
; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4
-; GCN-VERBOSE: t8: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
-; GCN-VERBOSE: t9: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t8, Register:f32 $vgpr0 # D:0, t8:1
+; GCN-VERBOSE: t9: ch,glue = CopyToReg [ORD=4] # D:1 t0, Register:f32 $vgpr0 # D:0, t6
+; GCN-VERBOSE: t10: ch = RETURN_TO_EPILOG [ORD=4] # D:0 t9, Register:f32 $vgpr0 # D:0, t9:1
define amdgpu_ps float @test_sdag_dump(float inreg %scalar, float %vector) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
index f52f1164f2ba2..6e4391a5ecaab 100644
--- a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll
@@ -148,8 +148,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, -1
; GFX9-NEXT: s_add_i32 s3, s3, -1
+; GFX9-NEXT: s_add_i32 s1, s1, -1
; GFX9-NEXT: ; return to shader part epilog
%sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
ret <2 x i64> %sub
@@ -158,8 +158,8 @@ define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64>
define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_i32 s1, s1, -1
; GFX9-NEXT: s_add_i32 s3, s3, -2
+; GFX9-NEXT: s_add_i32 s1, s1, -1
; GFX9-NEXT: ; return to shader part epilog
%sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
ret <2 x i64> %sub
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
index 2202b6446fd15..f590324f1120d 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected
@@ -7,16 +7,16 @@ define i64 @i64_test(i64 %i) nounwind readnone {
; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
; CHECK-NEXT: t4: i32,ch = CopyFromReg # D:1 t0, Register:i32 %9
-; CHECK-NEXT: t49: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
-; CHECK-NEXT: t26: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t29: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t32: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t26, TargetConstant:i32<3>, t29, TargetConstant:i32<11>
-; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t49, t32
-; CHECK-NEXT: t23: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
-; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t23
-; CHECK-NEXT: t38: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
-; CHECK-NEXT: t18: ch,glue = CopyToReg # D:1 t16, Register:i32 $vgpr1, t38, t16:1
-; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
+; CHECK-NEXT: t50: i64 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t2, TargetConstant:i32<3>, t4, TargetConstant:i32<11>
+; CHECK-NEXT: t27: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, align 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t30: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc + 4, basealign 8, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<4>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t33: v2i32 = REG_SEQUENCE # D:1 TargetConstant:i32<60>, t27, TargetConstant:i32<3>, t30, TargetConstant:i32<11>
+; CHECK-NEXT: t10: i64 = V_ADD_U64_PSEUDO # D:1 t50, t33
+; CHECK-NEXT: t24: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<3>
+; CHECK-NEXT: t17: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t24
+; CHECK-NEXT: t39: i32 = EXTRACT_SUBREG # D:1 t10, TargetConstant:i32<11>
+; CHECK-NEXT: t19: ch,glue = CopyToReg # D:1 t17, Register:i32 $vgpr1, t39, t17:1
+; CHECK-NEXT: t20: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t19, t19:1
; CHECK-EMPTY:
%loc = alloca i64, addrspace(5)
%j = load i64, ptr addrspace(5) %loc
@@ -31,10 +31,10 @@ define i64 @i32_test(i32 %i) nounwind readnone {
; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
; CHECK-NEXT: t6: i32,ch = BUFFER_LOAD_DWORD_OFFEN<Mem:(dereferenceable load (s32) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
; CHECK-NEXT: t7: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t6, TargetConstant:i1<0>
-; CHECK-NEXT: t14: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
-; CHECK-NEXT: t22: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t16: ch,glue = CopyToReg t14, Register:i32 $vgpr1, t22, t14:1
-; CHECK-NEXT: t17: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t16, t16:1
+; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t7
+; CHECK-NEXT: t23: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t23, t15:1
+; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
; CHECK-EMPTY:
%loc = alloca i32, addrspace(5)
%j = load i32, ptr addrspace(5) %loc
@@ -48,14 +48,14 @@ define i64 @i16_test(i16 %i) nounwind readnone {
; CHECK: SelectionDAG has 18 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
-; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_USHORT_OFFEN<Mem:(dereferenceable load (s16) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
-; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<65535>
-; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
-; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
-; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t20: i32,ch = BUFFER_LOAD_USHORT_OFFEN<Mem:(dereferenceable load (s16) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t21: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t20, TargetConstant:i1<0>
+; CHECK-NEXT: t25: i32 = S_MOV_B32 TargetConstant:i32<65535>
+; CHECK-NEXT: t26: i32 = V_AND_B32_e64 # D:1 t21, t25
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t26
+; CHECK-NEXT: t32: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t18: ch,glue = CopyToReg t16, Register:i32 $vgpr1, t32, t16:1
+; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
; CHECK-EMPTY:
%loc = alloca i16, addrspace(5)
%j = load i16, ptr addrspace(5) %loc
@@ -69,14 +69,14 @@ define i64 @i8_test(i8 %i) nounwind readnone {
; CHECK: SelectionDAG has 18 nodes:
; CHECK-NEXT: t0: ch,glue = EntryToken
; CHECK-NEXT: t2: i32,ch = CopyFromReg # D:1 t0, Register:i32 %8
-; CHECK-NEXT: t19: i32,ch = BUFFER_LOAD_UBYTE_OFFEN<Mem:(dereferenceable load (s8) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
-; CHECK-NEXT: t20: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t19, TargetConstant:i1<0>
-; CHECK-NEXT: t24: i32 = S_MOV_B32 TargetConstant:i32<255>
-; CHECK-NEXT: t25: i32 = V_AND_B32_e64 # D:1 t20, t24
-; CHECK-NEXT: t15: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t25
-; CHECK-NEXT: t31: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
-; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $vgpr1, t31, t15:1
-; CHECK-NEXT: t18: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t17, t17:1
+; CHECK-NEXT: t20: i32,ch = BUFFER_LOAD_UBYTE_OFFEN<Mem:(dereferenceable load (s8) from %ir.loc, addrspace 5)> TargetFrameIndex:i32<0>, Register:v4i32 $sgpr0_sgpr1_sgpr2_sgpr3, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, TargetConstant:i1<0>, t0
+; CHECK-NEXT: t21: i32,i1 = V_ADD_CO_U32_e64 # D:1 t2, t20, TargetConstant:i1<0>
+; CHECK-NEXT: t25: i32 = S_MOV_B32 TargetConstant:i32<255>
+; CHECK-NEXT: t26: i32 = V_AND_B32_e64 # D:1 t21, t25
+; CHECK-NEXT: t16: ch,glue = CopyToReg # D:1 t0, Register:i32 $vgpr0, t26
+; CHECK-NEXT: t32: i32 = V_MOV_B32_e32 TargetConstant:i32<0>
+; CHECK-NEXT: t18: ch,glue = CopyToReg t16, Register:i32 $vgpr1, t32, t16:1
+; CHECK-NEXT: t19: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t18, t18:1
; CHECK-EMPTY:
%loc = alloca i8, addrspace(5)
%j = load i8, ptr addrspace(5) %loc
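
The sweeping t-number churn in the two DAG-dump tests above is mechanical: one extra node, the readfirstlane intrinsic ID (the t7: i32 = TargetConstant line above), now appears in the initial DAG, so every later node is renumbered by one, even for results that stay in VGPRs. A minimal reproducer for the renumbering, assuming gfx900 with -debug-only=isel and not itself a test from this series:

; Hypothetical reproducer; the extra TargetConstant carrying the
; readfirstlane intrinsic ID shows up in the initial selection DAG dump.
define amdgpu_ps i32 @renumber_repro(i32 inreg %x) {
  ret i32 %x
}
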
>From eea0c50eb1b7c7c84427a6c24331811ea79bedc0 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 11 Apr 2025 19:58:47 +0530
Subject: [PATCH 3/9] Introduce readfirstlane if dst reg is sgpr
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bbef279dc478a..e22498589d8c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3225,6 +3225,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
@@ -3280,8 +3281,9 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
default:
llvm_unreachable("Unknown loc info!");
}
- Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
- ReadFirstLane, Arg);
+ if (TRI->isSGPRPhysReg(VA.getLocReg()))
+ Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Arg.getValueType(),
+ ReadFirstLane, Arg);
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
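
With the isSGPRPhysReg guard above, only return values whose calling-convention location is a physical SGPR are wrapped in the readfirstlane intrinsic; results returned in VGPRs are copied out unchanged. A sketch of the two cases, illustration only and not a test from this series:

; The i32 result is assigned to $sgpr0, so the guarded path fires and a
; v_readfirstlane_b32 reaches the epilog; the float result is assigned to
; $vgpr0 and takes the unmodified path.
define amdgpu_ps i32 @ret_in_sgpr(i32 inreg %x) {
  ret i32 %x
}

define amdgpu_ps float @ret_in_vgpr(float %v) {
  ret float %v
}
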
>From d9ab3b9bf2f53721637aa7796bc5e28b24febb54 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 15 Apr 2025 13:38:01 +0530
Subject: [PATCH 4/9] address review
---
llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
index e47888ed78b58..a1c42c43d9fc4 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
@@ -1,17 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
-define amdgpu_ps i32 @s_copysign_uniform(float inreg %x, float inreg %y) {
-; GFX11-LABEL: s_copysign_uniform:
+define amdgpu_ps float @uniform_fpext(half inreg %x) {
+; GFX11-LABEL: uniform_fpext:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-NEXT: ; return to shader part epilog
- %op = call float @llvm.copysign.f32(float %x, float %y)
- %cast = bitcast float %op to i32
- ret i32 %cast
+ %f = fpext half %x to float
+ ret float %f
}
define amdgpu_ps i64 @uniform_vbfi_val_op(i32 inreg %a, i32 inreg %b) {
From ec5c6be449cf337238e4cb1ad630bff2663b9b11 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Tue, 15 Apr 2025 14:15:46 +0530
Subject: [PATCH 5/9] update test
---
llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
index 822ee94fdeebb..482f78889ff4e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll
@@ -10,7 +10,7 @@
; GCN-DEFAULT: t0: ch,glue = EntryToken
; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0
-; GCN-DEFAULT: t7: i32 = TargetConstant<3139>
+; GCN-DEFAULT: t7: i32 = TargetConstant<3222>
; GCN-DEFAULT: t5: f32 = fadd t2, t2
; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1
; GCN-DEFAULT: t6: f32 = fadd # D:1 t5, t4
@@ -19,7 +19,7 @@
; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0
; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0
-; GCN-VERBOSE: t7: i32 = TargetConstant<3139>
+; GCN-VERBOSE: t7: i32 = TargetConstant<3222>
; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2
; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0
; GCN-VERBOSE: t6: f32 = fadd [ORD=3] # D:1 t5, t4
From 0943e9836b1f65819290c823ad3c63794d5363ae Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 18 Apr 2025 14:03:06 +0530
Subject: [PATCH 6/9] update test
---
llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll | 87 -------------------
.../AMDGPU/uniform-vgpr-to-sgpr-return.ll | 74 ++++++++++++++++
2 files changed, 74 insertions(+), 87 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
diff --git a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
deleted file mode 100644
index a1c42c43d9fc4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/fix-illegal-copy.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
-
-define amdgpu_ps float @uniform_fpext(half inreg %x) {
-; GFX11-LABEL: uniform_fpext:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX11-NEXT: ; return to shader part epilog
- %f = fpext half %x to float
- ret float %f
-}
-
-define amdgpu_ps i64 @uniform_vbfi_val_op(i32 inreg %a, i32 inreg %b) {
-; GFX11-LABEL: uniform_vbfi_val_op:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, llvm.amdgcn.bfi.i32@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, llvm.amdgcn.bfi.i32@gotpcrel32@hi+12
-; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_mov_b64 s[8:9], 36
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: ; return to shader part epilog
- %mask = xor i32 -1, 0
- %bfi = call i32 @llvm.amdgcn.bfi.i32(i32 %mask, i32 %a, i32 %b)
- %ext = zext i32 %bfi to i64
- ret i64 %ext
-}
-
-declare i32 @llvm.amdgcn.bfi.i32(i32, i32, i32)
-
-
-define amdgpu_ps <2 x i32> @s_uniform_val_v2i32(<2 x i32> inreg %x, <2 x i32> inreg %y) {
-; GFX11-LABEL: s_uniform_val_v2i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_getpc_b64 s[4:5]
-; GFX11-NEXT: s_add_u32 s4, s4, llvm.amdgcn.bfi.v2i32@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s5, s5, llvm.amdgcn.bfi.v2i32@gotpcrel32@hi+12
-; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
-; GFX11-NEXT: s_mov_b64 s[8:9], 36
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: ; return to shader part epilog
- %mask = xor <2 x i32> <i32 -1, i32 -1>, zeroinitializer
- %bfi = call <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32> %mask, <2 x i32> %x, <2 x i32> %y)
- ret <2 x i32> %bfi
-}
-
-declare <2 x i32> @llvm.amdgcn.bfi.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
-
-
-define amdgpu_ps ptr @s_uniform_val_ptr(ptr inreg %base) {
-; GFX11-LABEL: s_uniform_val_ptr:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, llvm.amdgcn.bfi.i32@gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, llvm.amdgcn.bfi.i32@gotpcrel32@hi+12
-; GFX11-NEXT: s_add_i32 s0, s0, 16
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_mov_b64 s[8:9], 36
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: ; return to shader part epilog
- %gep = getelementptr i8, ptr %base, i32 16
- %cast = ptrtoint ptr %gep to i32
- %mask = xor i32 -1, 0
- %val = call i32 @llvm.amdgcn.bfi.i32(i32 %mask, i32 %cast, i32 42)
- %resptr = inttoptr i32 %val to ptr
- ret ptr %resptr
-}
-
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
new file mode 100644
index 0000000000000..041368c05db13
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefixes=GFX11
+
+define amdgpu_ps i32 @uniform_v_to_s_i32(float inreg %a, float inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %cast = bitcast float %max0 to i32
+ ret i32 %cast
+}
+
+define amdgpu_ps i64 @uniform_v_to_s_i64(double inreg %a, double inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %cast = bitcast double %max0 to i64
+ ret i64 %cast
+}
+
+define amdgpu_ps <2 x i32> @uniform_v_to_s_2_i32(<2 x float> inreg %a, <2 x float> inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_2_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s2
+; GFX11-NEXT: v_max_f32_e64 v1, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call <2 x float> @llvm.maximum.f32(<2 x float> %a, <2 x float> %b)
+ %cast = bitcast <2 x float> %max0 to <2 x i32>
+ ret <2 x i32> %cast
+}
+
+define amdgpu_ps ptr @uniform_v_to_s_ptr(ptr inreg %x) {
+; GFX11-LABEL: uniform_v_to_s_ptr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: flat_load_b32 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f32_e32 v1, 1.0, v0
+; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo
+; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %val = load float, ptr %x, align 4
+ %max = call float @llvm.maximum.f32(float %val, float 1.0)
+ %int = fptoui float %max to i32
+ %ptr = inttoptr i32 %int to ptr
+ ret ptr %ptr
+}
From c7201407dbfdd63ceea6e0c71dd1c2d9a88f7f78 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 21 Apr 2025 16:38:13 +0530
Subject: [PATCH 7/9] add more tests
---
.../AMDGPU/uniform-vgpr-to-sgpr-return.ll | 73 +++++++++++++++++++
1 file changed, 73 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
index 041368c05db13..61b583526b2ba 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -72,3 +72,76 @@ define amdgpu_ps ptr @uniform_v_to_s_ptr(ptr inreg %x) {
%ptr = inttoptr i32 %int to ptr
ret ptr %ptr
}
+
+define amdgpu_ps half @uniform_v_to_s_f16(half inreg %a, half inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f16_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call half @llvm.maximum.f16(half %a, half %b)
+ ret half %max
+}
+
+define amdgpu_ps float @uniform_v_to_s_v2f16(<2 x half> inreg %a, <2 x half> inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b)
+ %cast = bitcast <2 x half> %max to float
+ ret float %cast
+}
+
+define amdgpu_ps float @uniform_v_s_float(i32 inreg %a, i32 inreg %b) {
+; GFX11-LABEL: uniform_v_s_float:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %and = and i32 %a, %b
+ %cast = bitcast i32 %and to float
+ ret float %cast
+}
+
+define amdgpu_ps double @uniform_v_to_s_double(double inreg %a, double inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_double:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f64 v[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x7ff80000, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ ret double %max0
+}
+
+define amdgpu_ps float @uniform_v_to_s_f32(float inreg %a, float inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %max0
+}
From b1575e5801fee7886803121e19d618f5022ce353 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 21 Apr 2025 16:49:48 +0530
Subject: [PATCH 8/9] add more tests
---
.../AMDGPU/uniform-vgpr-to-sgpr-return.ll | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
index 61b583526b2ba..a10751939e284 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -145,3 +145,31 @@ define amdgpu_ps float @uniform_v_to_s_f32(float inreg %a, float inreg %b) {
%max0 = call float @llvm.maximum.f32(float %a, float %b)
ret float %max0
}
+
+define amdgpu_ps <2 x i16> @uniform_v_to_s_2_i16(float inreg %a, float inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_2_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %cast = bitcast float %max0 to <2 x i16>
+ ret <2 x i16> %cast
+}
+
+define amdgpu_ps i16 @uniform_v_to_s_i16(half inreg %a, half inreg %b) {
+; GFX11-LABEL: uniform_v_to_s_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_max_f16_e64 v0, s0, s1
+; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %max = call half @llvm.maximum.f16(half %a, half %b)
+ %cast = bitcast half %max to i16
+ ret i16 %cast
+}
From 4d4f8834aa94a5124a5ff85077e91440992007a5 Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Mon, 21 Apr 2025 17:09:36 +0530
Subject: [PATCH 9/9] add more tests
---
.../AMDGPU/uniform-vgpr-to-sgpr-return.ll | 81 ++++++-------------
1 file changed, 24 insertions(+), 57 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
index a10751939e284..7d58c8402ca3c 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-vgpr-to-sgpr-return.ll
@@ -73,51 +73,6 @@ define amdgpu_ps ptr @uniform_v_to_s_ptr(ptr inreg %x) {
ret ptr %ptr
}
-define amdgpu_ps half @uniform_v_to_s_f16(half inreg %a, half inreg %b) {
-; GFX11-LABEL: uniform_v_to_s_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_max_f16_e64 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT: ; return to shader part epilog
- %max = call half @llvm.maximum.f16(half %a, half %b)
- ret half %max
-}
-
-define amdgpu_ps float @uniform_v_to_s_v2f16(<2 x half> inreg %a, <2 x half> inreg %b) {
-; GFX11-LABEL: uniform_v_to_s_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_pk_max_f16 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: ; return to shader part epilog
- %max = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b)
- %cast = bitcast <2 x half> %max to float
- ret float %cast
-}
-
-define amdgpu_ps float @uniform_v_s_float(i32 inreg %a, i32 inreg %b) {
-; GFX11-LABEL: uniform_v_s_float:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: ; return to shader part epilog
- %and = and i32 %a, %b
- %cast = bitcast i32 %and to float
- ret float %cast
-}
-
define amdgpu_ps double @uniform_v_to_s_double(double inreg %a, double inreg %b) {
; GFX11-LABEL: uniform_v_to_s_double:
; GFX11: ; %bb.0:
@@ -134,18 +89,6 @@ define amdgpu_ps double @uniform_v_to_s_double(double inreg %a, double inreg %b)
ret double %max0
}
-define amdgpu_ps float @uniform_v_to_s_f32(float inreg %a, float inreg %b) {
-; GFX11-LABEL: uniform_v_to_s_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_max_f32_e64 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX11-NEXT: ; return to shader part epilog
- %max0 = call float @llvm.maximum.f32(float %a, float %b)
- ret float %max0
-}
-
define amdgpu_ps <2 x i16> @uniform_v_to_s_2_i16(float inreg %a, float inreg %b) {
; GFX11-LABEL: uniform_v_to_s_2_i16:
; GFX11: ; %bb.0:
@@ -173,3 +116,27 @@ define amdgpu_ps i16 @uniform_v_to_s_i16(half inreg %a, half inreg %b) {
%cast = bitcast half %max to i16
ret i16 %cast
}
+
+define amdgpu_ps half @uniform_add_i16_cast_to_f16(i16 inreg %a, i16 inreg %b) {
+; GFX11-LABEL: uniform_add_i16_cast_to_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %add = add i16 %a, %b
+ %cast = bitcast i16 %add to half
+ ret half %cast
+}
+
+define amdgpu_ps float @uniform_mul_i32_cast_to_float(i32 inreg %a, i32 inreg %b) {
+; GFX11-LABEL: uniform_mul_i32_cast_to_float:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: ; return to shader part epilog
+ %mul = mul i32 %a, %b
+ %cast = bitcast i32 %mul to float
+ ret float %cast
+}