[llvm] [AMDGPU] Enable "amdgpu-uniform-intrinsic-combine" pass in pipeline. (PR #162819)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 10 02:57:49 PDT 2025
https://github.com/PankajDwivedi-25 created https://github.com/llvm/llvm-project/pull/162819
This PR enables the AMDGPUUniformIntrinsicCombine pass in the llc pipeline.
It also introduces the "amdgpu-uniform-intrinsic-combine" command-line flag to enable/disable the pass.
See the related PR: https://github.com/llvm/llvm-project/pull/116953
The original PR, https://github.com/llvm/llvm-project/pull/128687, was clobbered too much, so this is another, cleaner attempt.
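
For illustration, a minimal sketch of the kind of folding the pass performs, taken from the readfirstlane_with_readfirstlane case in the new amdgpu-miscellaneous-uniform-intrinsic.ll test added by this patch:

  ; Input: a chain of readfirstlane calls on an already-uniform (constant) value.
  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
  store i32 %v2, ptr addrspace(1) %out
  ; With the combine enabled, the stored value folds to the constant 5 and the
  ; generated code stores it directly (see the CHECK lines in the new test).

The pass can be toggled from the llc command line; assuming the new flag uses the usual cl::opt boolean syntax, disabling it would look like (input.ll is a placeholder file name):

  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-uniform-intrinsic-combine=0 input.ll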
>From b6aba3ee451af2411462f0fdadba501f3975bcfd Mon Sep 17 00:00:00 2001
From: Pankaj kumar divedi <Pankajkumar.divedi at amd.com>
Date: Fri, 10 Oct 2025 15:19:18 +0530
Subject: [PATCH] [pre-commit] Update the test checks affected after adding the pass
 to llc
---
.../GlobalISel/llvm.amdgcn.ballot.i32.ll | 214 ++---
.../GlobalISel/llvm.amdgcn.ballot.i64.ll | 152 ++--
llvm/test/CodeGen/AMDGPU/always-uniform.ll | 16 +-
.../amdgpu-miscellaneous-uniform-intrinsic.ll | 157 ++++
llvm/test/CodeGen/AMDGPU/bf16.ll | 48 +-
.../CodeGen/AMDGPU/convergence-laneops.ll | 1 +
.../test/CodeGen/AMDGPU/convergence-tokens.ll | 1 +
.../CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll | 18 +-
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 162 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 162 ++--
.../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 91 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 211 +----
.../AMDGPU/llvm.amdgcn.permlane64.ptr.ll | 77 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 857 ++----------------
.../AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll | 36 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 406 ++-------
.../AMDGPU/llvm.amdgcn.readlane.ptr.ll | 52 +-
.../spill-vgpr-to-agpr-update-regscavenger.ll | 23 +-
.../AMDGPU/splitkit-getsubrangeformask.ll | 198 ++--
llvm/test/CodeGen/AMDGPU/wqm.ll | 57 +-
20 files changed, 962 insertions(+), 1977 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 51714035352a3..8e8d9afaee4b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -89,17 +89,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_xor_b32 s2, vcc_lo, -1
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -113,9 +111,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -135,20 +133,29 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_non_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -162,16 +169,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -184,18 +192,27 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; GFX10-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_ne_zero_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_le_u32_e32 12, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -209,11 +226,7 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -233,18 +246,27 @@ false:
}
define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; GFX10-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: s_mov_b32 s0, 42
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT: ; %bb.1: ; %false
+; GFX10-NEXT: s_mov_b32 s0, 33
+; GFX10-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: branch_divergent_ballot_eq_zero_compare:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_mov_b32 s0, 42
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: v_cmpx_gt_u32_e32 12, v0
+; GFX11-NEXT: ; %bb.1: ; %false
+; GFX11-NEXT: s_mov_b32 s0, 33
+; GFX11-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -259,17 +281,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -284,18 +302,16 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
+; CHECK-NEXT: s_or_b32 s2, vcc_lo, s0
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -311,14 +327,12 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cmp_le_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -344,16 +358,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b32 s2, vcc_lo, s0
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s1, s2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -374,16 +386,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13b9ef1c..24b6250094c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -93,16 +93,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: s_xor_b64 s[4:5], vcc, -1
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -116,9 +114,9 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -142,16 +140,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -165,16 +160,17 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -189,16 +185,14 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -212,11 +206,7 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -239,15 +229,13 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -262,17 +250,13 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_cmp_lt_u32 s0, 12
-; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -287,18 +271,16 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 12, v0
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[0:1]
; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_2: ; %false
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -314,14 +296,12 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cmp_ge_u32 s0, 12
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
-; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cmp_le_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -347,16 +327,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[0:1]
+; CHECK-NEXT: s_mov_b32 s0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[2:3]
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -377,16 +355,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, 1, s0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 689b306518c9b..f7d293ddd9927 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,22 +8,20 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-LABEL: readfirstlane_uniform:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_add_i32 s12, s12, s17
-; GCN-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s0, s0, s4
-; GCN-NEXT: s_addc_u32 s1, s1, s5
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_add_u32 s0, s2, 40
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll
new file mode 100644
index 0000000000000..33c6fe4c09f1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck %s
+
+define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readfirstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readfirstlane_with_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_with_firstlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
+; CHECK-LABEL: readlane_readlane:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
+ store i32 %v2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: permlane64_uniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b32 s2, s[4:5], 0x8
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_nonuniform_expression(i32 addrspace(1)* %out) {
+; CHECK-LABEL: permlane64_nonuniform_expression:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 1, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) {
+; CHECK-LABEL: trivial_waterfall_eq_zero:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_branch .LBB7_2
+; CHECK-NEXT: .LBB7_1: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_cbranch_vccz .LBB7_4
+; CHECK-NEXT: .LBB7_2: ; %while
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s2
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_cbranch_vccnz .LBB7_1
+; CHECK-NEXT: ; %bb.3: ; %if
+; CHECK-NEXT: ; in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; CHECK-NEXT: s_branch .LBB7_1
+; CHECK-NEXT: .LBB7_4: ; %exit
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %while
+
+while:
+ %done = phi i1 [ 0, %entry ], [ 1, %if ]
+ %not_done = xor i1 %done, true
+ %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
+ %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done
+ br i1 %is_done, label %exit, label %if
+
+if:
+ store i32 5, ptr addrspace(1) %out
+ br label %while
+
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4b14dc63eeb84..1a382e75d973d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -46073,44 +46073,44 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
; GCN-LABEL: s_select_v3bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
-; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
-; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
-; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
+; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2
+; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s5
+; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s1
+; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s4
+; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT: v_readfirstlane_b32 s0, v1
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
+; GCN-NEXT: v_readfirstlane_b32 s1, v1
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_v3bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
-; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
-; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
+; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s1
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s0
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
+; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s4
+; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2
+; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7-NEXT: v_readfirstlane_b32 s1, v0
+; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_select_v3bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
index 57ab371d5b6fc..0cbfc092dc2ae 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-laneops.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 61d102d2222bd..da5451544c187 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
; RUN: llc -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
; RUN: llc -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index db32135939a5d..b8f084d5f82ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,24 +4,14 @@
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_bitcmp1_b32 0, 0
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_cselect_b32 s1, -1, 0
-; CHECK-NEXT: s_or_saveexec_b32 s2, -1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s1, v0
-; CHECK-NEXT: s_mov_b32 exec_lo, s2
-; CHECK-NEXT: s_or_b32 s0, s0, s1
-; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
-; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_xor_b32 s0, s0, -1
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_wait_alu 0xf1ff
; CHECK-NEXT: ; return to shader part epilog
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
index e00e1f13b2b77..79b7ce39bc867 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -88,15 +88,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -110,9 +110,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB8_2
+; CHECK-NEXT: s_bitcmp0_b32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -134,15 +133,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB9_2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -156,15 +155,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc_lo, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -179,15 +179,15 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_ne_zero = icmp ne i32 %ballot, 0
@@ -201,8 +201,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB12_2
+; CHECK-NEXT: s_cmp_gt_u32 s0, 11
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -224,14 +224,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB13_2
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b32 s0, vcc_lo
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
%ballot_eq_zero = icmp eq i32 %ballot, 0
@@ -245,14 +245,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc_lo, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c)
@@ -267,17 +267,17 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
-; CHECK-NEXT: s_cbranch_vccz .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0
+; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 35, v1
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_or_b32 s1, vcc_lo, s0
+; CHECK-NEXT: s_and_saveexec_b32 s0, s1
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -293,13 +293,13 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cmp_gt_u32 s0, 11
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
-; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cmp_lt_u32 s1, 35
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
-; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -324,15 +324,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; CHECK-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
-; CHECK-NEXT: s_cbranch_vccz .LBB17_2
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_b32 s1, vcc_lo, s0
+; CHECK-NEXT: s_and_saveexec_b32 s0, s1
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -353,14 +353,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
index b4adf7f641550..e9359e9adf6af 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -91,15 +91,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -113,9 +113,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB8_2
+; CHECK-NEXT: s_bitcmp0_b32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB8_3
@@ -137,15 +136,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB9_2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB9_3
-; CHECK-NEXT: .LBB9_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -159,15 +158,16 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_non_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_and_b32 s0, s0, 1
-; CHECK-NEXT: v_cmp_ne_u32_e64 vcc, s0, 0
-; CHECK-NEXT: s_cbranch_vccz .LBB10_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB10_3
-; CHECK-NEXT: .LBB10_2: ; %true
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB10_3
+; CHECK-NEXT: .LBB10_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB10_3
; CHECK-NEXT: .LBB10_3:
%c = trunc i32 %v to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -182,15 +182,15 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB11_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB11_3
-; CHECK-NEXT: .LBB11_3:
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 11, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -204,8 +204,8 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB12_2
+; CHECK-NEXT: s_cmp_gt_u32 s0, 11
+; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB12_3
@@ -227,14 +227,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cbranch_vccz .LBB13_2
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB13_3
-; CHECK-NEXT: .LBB13_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_eq_zero = icmp eq i64 %ballot, 0
@@ -248,14 +248,14 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_lt_u32_e64 vcc, s0, 12
-; CHECK-NEXT: s_cbranch_vccz .LBB14_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB14_3
-; CHECK-NEXT: .LBB14_2: ; %true
+; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cbranch_scc1 .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB14_3
+; CHECK-NEXT: .LBB14_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB14_3
; CHECK-NEXT: .LBB14_3:
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
@@ -270,17 +270,17 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccz .LBB15_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB15_3
-; CHECK-NEXT: .LBB15_3:
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 11, v0
+; CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 35, v1
+; CHECK-NEXT: s_or_b64 s[2:3], vcc, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
+; CHECK-NEXT: ; %bb.1: ; %false
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -296,13 +296,13 @@ false:
define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
; CHECK-LABEL: branch_uniform_ballot_ne_zero_and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_cmp_lt_u32 s0, 12
+; CHECK-NEXT: s_cmp_gt_u32 s0, 11
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_gt_u32 s1, 34
+; CHECK-NEXT: s_cmp_lt_u32 s1, 35
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB16_2
+; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB16_3
@@ -327,15 +327,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
-; CHECK-NEXT: s_and_b64 vcc, vcc, s[0:1]
-; CHECK-NEXT: s_cbranch_vccz .LBB17_2
+; CHECK-NEXT: s_and_b64 s[2:3], vcc, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 42
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_2: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB17_3
-; CHECK-NEXT: .LBB17_3:
+; CHECK-NEXT: v_mov_b32_e32 v0, 33
+; CHECK-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
@@ -356,14 +356,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_cmp_gt_u32 s1, 34
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
-; CHECK-NEXT: ; %bb.1: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB18_3
-; CHECK-NEXT: .LBB18_2: ; %true
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
; CHECK-NEXT: s_branch .LBB18_3
+; CHECK-NEXT: .LBB18_2: ; %false
+; CHECK-NEXT: s_mov_b32 s0, 33
+; CHECK-NEXT: s_branch .LBB18_3
; CHECK-NEXT: .LBB18_3:
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
index 91aba09e942f0..ceb4a90d232f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -124,19 +124,39 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
}
define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_compare(i32 %v) {
-; CHECK-LABEL: branch_divergent_ballot64_ne_zero_compare:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_u32_e64 s0, 12, v0
-; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
-; CHECK-NEXT: ; %bb.1: ; %true
-; CHECK-NEXT: s_mov_b32 s0, 42
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_2: ; %false
-; CHECK-NEXT: s_mov_b32 s0, 33
-; CHECK-NEXT: s_branch .LBB7_3
-; CHECK-NEXT: .LBB7_3:
+; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_compare:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 42
+; DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; DAGISEL-NEXT: ; %bb.1: ; %false
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 33
+; DAGISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
+; DAGISEL-NEXT: ; return to shader part epilog
+;
+; GISEL-TRUE16-LABEL: branch_divergent_ballot64_ne_zero_compare:
+; GISEL-TRUE16: ; %bb.0:
+; GISEL-TRUE16-NEXT: s_mov_b32 s0, 42
+; GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GISEL-TRUE16-NEXT: v_cmpx_le_u32_e32 12, v0
+; GISEL-TRUE16-NEXT: ; %bb.1: ; %false
+; GISEL-TRUE16-NEXT: s_mov_b32 s0, 33
+; GISEL-TRUE16-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GISEL-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: branch_divergent_ballot64_ne_zero_compare:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_mov_b32 s0, 42
+; GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GISEL-FAKE16-NEXT: v_cmpx_le_u32_e32 12, v0
+; GISEL-FAKE16-NEXT: ; %bb.1: ; %false
+; GISEL-FAKE16-NEXT: s_mov_b32 s0, 33
+; GISEL-FAKE16-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GISEL-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-FAKE16-NEXT: ; return to shader part epilog
%c = icmp ult i32 %v, 12
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c)
%ballot_ne_zero = icmp ne i64 %ballot, 0
@@ -150,37 +170,30 @@ false:
define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) {
; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_and:
; DAGISEL: ; %bb.0:
-; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; DAGISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; DAGISEL-NEXT: s_mov_b32 s1, 0
-; DAGISEL-NEXT: s_and_b32 s0, vcc_lo, s0
-; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
-; DAGISEL-NEXT: s_cmp_eq_u64 s[0:1], 0
-; DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2
-; DAGISEL-NEXT: ; %bb.1: ; %true
-; DAGISEL-NEXT: s_mov_b32 s0, 42
-; DAGISEL-NEXT: s_branch .LBB8_3
-; DAGISEL-NEXT: .LBB8_2: ; %false
-; DAGISEL-NEXT: s_mov_b32 s0, 33
-; DAGISEL-NEXT: s_branch .LBB8_3
-; DAGISEL-NEXT: .LBB8_3:
+; DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 11, v0
+; DAGISEL-NEXT: v_cmp_gt_u32_e64 s0, 35, v1
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 42
+; DAGISEL-NEXT: s_or_b32 s1, vcc_lo, s0
+; DAGISEL-NEXT: s_and_saveexec_b32 s0, s1
+; DAGISEL-NEXT: ; %bb.1: ; %false
+; DAGISEL-NEXT: v_mov_b32_e32 v0, 33
+; DAGISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
+; DAGISEL-NEXT: ; return to shader part epilog
;
; GISEL-LABEL: branch_divergent_ballot64_ne_zero_and:
; GISEL: ; %bb.0:
-; GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; GISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
-; GISEL-NEXT: s_mov_b32 s1, 0
-; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0
-; GISEL-NEXT: s_cmp_eq_u64 s[0:1], 0
-; GISEL-NEXT: s_cbranch_scc1 .LBB8_2
-; GISEL-NEXT: ; %bb.1: ; %true
+; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 12, v0
+; GISEL-NEXT: v_cmp_ge_u32_e64 s0, 34, v1
+; GISEL-NEXT: s_or_b32 s2, vcc_lo, s0
; GISEL-NEXT: s_mov_b32 s0, 42
-; GISEL-NEXT: s_branch .LBB8_3
-; GISEL-NEXT: .LBB8_2: ; %false
+; GISEL-NEXT: s_and_saveexec_b32 s1, s2
+; GISEL-NEXT: ; %bb.1: ; %false
; GISEL-NEXT: s_mov_b32 s0, 33
-; GISEL-NEXT: s_branch .LBB8_3
-; GISEL-NEXT: .LBB8_3:
+; GISEL-NEXT: ; %bb.2: ; %UnifiedReturnBlock
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT: ; return to shader part epilog
%v1c = icmp ult i32 %v1, 12
%v2c = icmp ugt i32 %v2, 34
%c = and i1 %v1c, %v2c
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 6dd2258420998..9d088db43c277 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -23,10 +23,8 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_s_i32:
@@ -36,8 +34,6 @@ define amdgpu_kernel void @test_s_i32(ptr addrspace(1) %out, i32 %src0) {
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane64.i32(i32 %src0)
@@ -50,12 +46,9 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_s_i64:
@@ -64,9 +57,6 @@ define amdgpu_kernel void @test_s_i64(ptr addrspace(1) %out, i64 %src0) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%v = call i64 @llvm.amdgcn.permlane64.i64(i64 %src0)
@@ -79,12 +69,9 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_s_f64:
@@ -93,9 +80,6 @@ define amdgpu_kernel void @test_s_f64(ptr addrspace(1) %out, double %src0) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%v = call double @llvm.amdgcn.permlane64.f64(double %src0)
@@ -116,19 +100,15 @@ define amdgpu_kernel void @test_i_i32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_i_i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x63
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_i_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -141,19 +121,15 @@ define amdgpu_kernel void @test_i_f32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_i_f32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x449a5000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_i_f32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0x449a5000 :: v_dual_mov_b32 v1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -166,23 +142,16 @@ define amdgpu_kernel void @test_i_i64(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_i_i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x63
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_i_i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x63
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -195,22 +164,16 @@ define amdgpu_kernel void @test_i_f64(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_i_f64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x40934a00
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40934a00
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_i_f64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x40934a00
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x40934a00 :: v_dual_mov_b32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -235,8 +198,6 @@ define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -245,8 +206,6 @@ define amdgpu_kernel void @test_v_i32(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -261,8 +220,6 @@ define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -271,8 +228,6 @@ define amdgpu_kernel void @test_v_f32(ptr addrspace(1) %out, float %src0) #1 {
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -287,25 +242,17 @@ define amdgpu_kernel void @test_v_i64(ptr addrspace(1) %out, i64 %src0) #1 {
; GFX11-SDAG-LABEL: test_v_i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v2
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v_i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v2
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidx_i64 = zext i32 %tidx to i64
@@ -320,11 +267,8 @@ define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 {
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
@@ -334,11 +278,8 @@ define amdgpu_kernel void @test_v_f64(ptr addrspace(1) %out, double %src0) #1 {
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v1, v1
+; GFX11-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
@@ -354,14 +295,12 @@ define void @test_half(ptr addrspace(1) %out, half %src0) {
; GFX11-SDAG-LABEL: test_half:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_half:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call half @llvm.amdgcn.permlane64.f16(half %src0)
@@ -373,14 +312,12 @@ define void @test_bfloat(ptr addrspace(1) %out, bfloat %src0) {
; GFX11-SDAG-LABEL: test_bfloat:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_bfloat:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call bfloat @llvm.amdgcn.permlane64.bf16(bfloat %src0)
@@ -392,14 +329,12 @@ define void @test_i16(ptr addrspace(1) %out, i16 %src0) {
; GFX11-SDAG-LABEL: test_i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0)
@@ -411,14 +346,12 @@ define void @test_v2f16(ptr addrspace(1) %out, <2 x half> %src0) {
; GFX11-SDAG-LABEL: test_v2f16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_v2f16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x half> @llvm.amdgcn.permlane64.v2f16(<2 x half> %src0)
@@ -430,16 +363,12 @@ define void @test_v2f32(ptr addrspace(1) %out, <2 x float> %src0) {
; GFX11-SDAG-LABEL: test_v2f32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_v2f32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x float> @llvm.amdgcn.permlane64.v2f32(<2 x float> %src0)
@@ -451,13 +380,6 @@ define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) {
; GFX11-SDAG-LABEL: test_v7i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
-; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
@@ -466,13 +388,6 @@ define void @test_v7i32(ptr addrspace(1) %out, <7 x i32> %src0) {
; GFX11-GISEL-LABEL: test_v7i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
-; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
-; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
-; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
@@ -486,20 +401,12 @@ define void @test_v8i16(ptr addrspace(1) %out, <8 x i16> %src0) {
; GFX11-SDAG-LABEL: test_v8i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_v8i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <8 x i16> @llvm.amdgcn.permlane64.v8i16(<8 x i16> %src0)
@@ -511,20 +418,12 @@ define void @test_v2i64(ptr addrspace(1) %out, <2 x i64> %src0) {
; GFX11-SDAG-LABEL: test_v2i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: test_v2i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x i64> @llvm.amdgcn.permlane64.v2i64(<2 x i64> %src0)
@@ -536,12 +435,6 @@ define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) {
; GFX11-SDAG-LABEL: test_v3i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
-; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
@@ -550,12 +443,6 @@ define void @test_v3i64(ptr addrspace(1) %out, <3 x i64> %src0) {
; GFX11-GISEL-LABEL: test_v3i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
-; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
-; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
@@ -569,14 +456,6 @@ define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) {
; GFX11-SDAG-LABEL: test_v4f64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9
-; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
-; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
@@ -585,14 +464,6 @@ define void @test_v4f64(ptr addrspace(1) %out, <4 x double> %src0) {
; GFX11-GISEL-LABEL: test_v4f64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
-; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
-; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
-; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
-; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
@@ -606,22 +477,6 @@ define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) {
; GFX11-SDAG-LABEL: test_v8f64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v17, v17
-; GFX11-SDAG-NEXT: v_permlane64_b32 v16, v16
-; GFX11-SDAG-NEXT: v_permlane64_b32 v15, v15
-; GFX11-SDAG-NEXT: v_permlane64_b32 v14, v14
-; GFX11-SDAG-NEXT: v_permlane64_b32 v13, v13
-; GFX11-SDAG-NEXT: v_permlane64_b32 v12, v12
-; GFX11-SDAG-NEXT: v_permlane64_b32 v11, v11
-; GFX11-SDAG-NEXT: v_permlane64_b32 v10, v10
-; GFX11-SDAG-NEXT: v_permlane64_b32 v9, v9
-; GFX11-SDAG-NEXT: v_permlane64_b32 v8, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v7, v7
-; GFX11-SDAG-NEXT: v_permlane64_b32 v6, v6
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v3
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v2
; GFX11-SDAG-NEXT: s_clause 0x3
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48
; GFX11-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32
@@ -632,22 +487,6 @@ define void @test_v8f64(ptr addrspace(1) %out, <8 x double> %src0) {
; GFX11-GISEL-LABEL: test_v8f64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_permlane64_b32 v2, v2
-; GFX11-GISEL-NEXT: v_permlane64_b32 v3, v3
-; GFX11-GISEL-NEXT: v_permlane64_b32 v4, v4
-; GFX11-GISEL-NEXT: v_permlane64_b32 v5, v5
-; GFX11-GISEL-NEXT: v_permlane64_b32 v6, v6
-; GFX11-GISEL-NEXT: v_permlane64_b32 v7, v7
-; GFX11-GISEL-NEXT: v_permlane64_b32 v8, v8
-; GFX11-GISEL-NEXT: v_permlane64_b32 v9, v9
-; GFX11-GISEL-NEXT: v_permlane64_b32 v10, v10
-; GFX11-GISEL-NEXT: v_permlane64_b32 v11, v11
-; GFX11-GISEL-NEXT: v_permlane64_b32 v12, v12
-; GFX11-GISEL-NEXT: v_permlane64_b32 v13, v13
-; GFX11-GISEL-NEXT: v_permlane64_b32 v14, v14
-; GFX11-GISEL-NEXT: v_permlane64_b32 v15, v15
-; GFX11-GISEL-NEXT: v_permlane64_b32 v16, v16
-; GFX11-GISEL-NEXT: v_permlane64_b32 v17, v17
; GFX11-GISEL-NEXT: s_clause 0x3
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
index b0149f7de5e85..672b658659824 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ptr.ll
@@ -6,12 +6,9 @@ define amdgpu_kernel void @test_p0(ptr addrspace(1) %out, ptr %src0) {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v2
-; GFX11-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr @llvm.amdgcn.permlane64.p0(ptr %src0)
store ptr %v, ptr addrspace(1) %out
@@ -22,21 +19,14 @@ define amdgpu_kernel void @test_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0) {
; GFX11-SDAG-LABEL: test_v3p0:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x2
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s7
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v8, s6
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v7, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v1
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v4
-; GFX11-SDAG-NEXT: v_permlane64_b32 v5, v5
-; GFX11-SDAG-NEXT: v_permlane64_b32 v4, v8
-; GFX11-SDAG-NEXT: v_permlane64_b32 v3, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s7
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s2
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
; GFX11-SDAG-NEXT: global_store_b128 v6, v[0:3], s[4:5]
@@ -53,10 +43,8 @@ define amdgpu_kernel void @test_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(3) @llvm.amdgcn.permlane64.v3p0(ptr addrspace(3) %src0)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -70,14 +58,9 @@ define amdgpu_kernel void @test_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane64.v3p3(<3 x ptr addrspace(3)> %src0)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -91,10 +74,8 @@ define amdgpu_kernel void @test_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(5) @llvm.amdgcn.permlane64.p5(ptr addrspace(5) %src0)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -108,14 +89,9 @@ define amdgpu_kernel void @test_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane64.v3p5(<3 x ptr addrspace(5)> %src0)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -129,10 +105,8 @@ define amdgpu_kernel void @test_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
%v = call ptr addrspace(6) @llvm.amdgcn.permlane64.p6(ptr addrspace(6) %src0)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -146,14 +120,9 @@ define amdgpu_kernel void @test_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s0
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v2, v0
-; GFX11-SDAG-NEXT: v_permlane64_b32 v1, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v3
-; GFX11-SDAG-NEXT: global_store_b96 v4, v[0:2], s[4:5]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-SDAG-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX11-SDAG-NEXT: s_endpgm
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane64.v3p6(<3 x ptr addrspace(6)> %src0)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index d1ba892d7f7e1..2067f9a133aa6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -6,9 +6,7 @@ define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i1:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: v_and_b32_e32 v2, 1, v2
; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -16,9 +14,7 @@ define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
; CHECK-GISEL-LABEL: test_readfirstlane_i1:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_and_b32_e32 v2, 1, v2
; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -55,10 +51,6 @@ define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2
-; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; CHECK-SDAG-NEXT: s_bitcmp1_b32 s4, 0
-; CHECK-SDAG-NEXT: s_cselect_b64 vcc, -1, 0
; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -68,10 +60,6 @@ define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-GISEL-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2
-; CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v4
-; CHECK-GISEL-NEXT: s_and_b32 s4, 1, s4
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -89,9 +77,7 @@ define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1)
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-SDAG-NEXT: flat_load_ubyte v2, v[2:3]
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: v_and_b32_e32 v2, 1, v2
; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -101,9 +87,7 @@ define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1)
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-GISEL-NEXT: flat_load_ubyte v2, v[2:3]
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: v_and_b32_e32 v2, 1, v2
; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -117,8 +101,6 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -126,8 +108,6 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) {
; CHECK-GISEL-LABEL: test_readfirstlane_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -140,10 +120,6 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -151,10 +127,6 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
; CHECK-GISEL-LABEL: test_readfirstlane_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -167,24 +139,16 @@ define void @test_readfirstlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v2i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v2i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x i64> @llvm.amdgcn.readfirstlane.v2i64(<2 x i64> %src)
@@ -196,28 +160,16 @@ define void @test_readfirstlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v3i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ; use v[2:7]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v3i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ; use v[2:7]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x i64> @llvm.amdgcn.readfirstlane.v3i64(<3 x i64> %src)
@@ -229,32 +181,16 @@ define void @test_readfirstlane_v4i64(ptr addrspace(1) %out, <4 x i64> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v4i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ; use v[2:9]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v4i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ; use v[2:9]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <4 x i64> @llvm.amdgcn.readfirstlane.v4i64(<4 x i64> %src)
@@ -266,48 +202,16 @@ define void @test_readfirstlane_v8i64(ptr addrspace(1) %out, <8 x i64> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v8i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x i64> @llvm.amdgcn.readfirstlane.v8i64(<8 x i64> %src)
@@ -319,10 +223,6 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s5
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s4
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -330,10 +230,6 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
; CHECK-GISEL-LABEL: test_readfirstlane_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s5
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -396,8 +292,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
;
; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
-; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
+; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -456,14 +351,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -490,15 +384,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -588,17 +480,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -628,17 +520,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
@@ -694,18 +586,16 @@ define void @test_readfirstlane_half(ptr addrspace(1) %out, half %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_half:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_half:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call half @llvm.amdgcn.readfirstlane.f16(half %src)
@@ -717,18 +607,16 @@ define void @test_readfirstlane_float(ptr addrspace(1) %out, float %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_float:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_float:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call float @llvm.amdgcn.readfirstlane.f32(float %src)
@@ -740,18 +628,16 @@ define void @test_readfirstlane_bfloat(ptr addrspace(1) %out, bfloat %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_bfloat:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_bfloat:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call bfloat @llvm.amdgcn.readfirstlane.bf16(bfloat %src)
@@ -763,19 +649,18 @@ define void @test_readfirstlane_i16(ptr addrspace(1) %out, i16 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
+; CHECK-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v0
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call i16 @llvm.amdgcn.readfirstlane.i16(i16 %src)
@@ -787,18 +672,16 @@ define void @test_readfirstlane_v2f16(ptr addrspace(1) %out, <2 x half> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v2f16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v2f16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x half> @llvm.amdgcn.readfirstlane.v2f16(<2 x half> %src)
@@ -810,20 +693,16 @@ define void @test_readfirstlane_v2f32(ptr addrspace(1) %out, <2 x float> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v2f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ; use v[2:3]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v2f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ; use v[2:3]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x float> @llvm.amdgcn.readfirstlane.v2f32(<2 x float> %src)
@@ -835,22 +714,16 @@ define void @test_readfirstlane_v3f32(ptr addrspace(1) %out, <3 x float> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v3f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v3f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:6]
+; CHECK-GISEL-NEXT: ; use v[2:4]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x float> @llvm.amdgcn.readfirstlane.v3f32(<3 x float> %src)
@@ -862,24 +735,16 @@ define void @test_readfirstlane_v4f32(ptr addrspace(1) %out, <4 x float> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v4f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v4f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <4 x float> @llvm.amdgcn.readfirstlane.v4f32(<4 x float> %src)
@@ -891,32 +756,16 @@ define void @test_readfirstlane_v8f32(ptr addrspace(1) %out, <8 x float> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ; use v[2:9]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v8f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ; use v[2:9]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x float> @llvm.amdgcn.readfirstlane.v8f32(<8 x float> %src)
@@ -928,48 +777,16 @@ define void @test_readfirstlane_v16f32(ptr addrspace(1) %out, <16 x float> %src)
; CHECK-SDAG-LABEL: test_readfirstlane_v16f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v16f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <16 x float> @llvm.amdgcn.readfirstlane.v16f32(<16 x float> %src)
@@ -981,171 +798,25 @@ define void @test_readfirstlane_v32f32(ptr addrspace(1) %out, <32 x float> %src)
; CHECK-SDAG-LABEL: test_readfirstlane_v32f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
-; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-SDAG-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[36:67]
+; CHECK-SDAG-NEXT: ; use v[2:33]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15
-; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14
-; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13
-; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12
-; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11
-; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10
-; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9
-; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8
-; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7
-; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6
-; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5
-; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4
-; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2
-; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1
-; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0
-; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v32f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-GISEL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[36:67]
+; CHECK-GISEL-NEXT: ; use v[2:33]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15
-; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14
-; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13
-; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12
-; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11
-; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10
-; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9
-; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8
-; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7
-; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6
-; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5
-; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4
-; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2
-; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1
-; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0
-; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <32 x float> @llvm.amdgcn.readfirstlane.v32f32(<32 x float> %src)
call void asm sideeffect "; use $0", "s"(<32 x float> %x)
@@ -1156,20 +827,16 @@ define void @test_readfirstlane_v2i32(ptr addrspace(1) %out, <2 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v2i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ; use v[2:3]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v2i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ; use v[2:3]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x i32> @llvm.amdgcn.readfirstlane.v2i32(<2 x i32> %src)
@@ -1181,22 +848,16 @@ define void @test_readfirstlane_v3i32(ptr addrspace(1) %out, <3 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v3i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v3i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:6]
+; CHECK-GISEL-NEXT: ; use v[2:4]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x i32> @llvm.amdgcn.readfirstlane.v3i32(<3 x i32> %src)
@@ -1208,24 +869,16 @@ define void @test_readfirstlane_v4i32(ptr addrspace(1) %out, <4 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v4i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v4i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <4 x i32> @llvm.amdgcn.readfirstlane.v4i32(<4 x i32> %src)
@@ -1237,26 +890,16 @@ define void @test_readfirstlane_v5i32(ptr addrspace(1) %out, <5 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v5i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:8]
+; CHECK-SDAG-NEXT: ; use v[2:6]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v5i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:8]
+; CHECK-GISEL-NEXT: ; use v[2:6]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <5 x i32> @llvm.amdgcn.readfirstlane.v5i32(<5 x i32> %src)
@@ -1268,28 +911,16 @@ define void @test_readfirstlane_v6i32(ptr addrspace(1) %out, <6 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v6i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ; use v[2:7]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v6i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ; use v[2:7]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <6 x i32> @llvm.amdgcn.readfirstlane.v6i32(<6 x i32> %src)
@@ -1301,30 +932,16 @@ define void @test_readfirstlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v7i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ; use v[2:8]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v7i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ; use v[2:8]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <7 x i32> @llvm.amdgcn.readfirstlane.v7i32(<7 x i32> %src)
@@ -1336,32 +953,16 @@ define void @test_readfirstlane_v8i32(ptr addrspace(1) %out, <8 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ; use v[2:9]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v8i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ; use v[2:9]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x i32> @llvm.amdgcn.readfirstlane.v8i32(<8 x i32> %src)
@@ -1373,48 +974,16 @@ define void @test_readfirstlane_v16i32(ptr addrspace(1) %out, <16 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v16i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v16i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <16 x i32> @llvm.amdgcn.readfirstlane.v16i32(<16 x i32> %src)
@@ -1426,171 +995,25 @@ define void @test_readfirstlane_v32i32(ptr addrspace(1) %out, <32 x i32> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v32i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
-; CHECK-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-SDAG-NEXT: buffer_load_dword v27, off, s[0:3], s32
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s37, 1
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s38, 2
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s39, 3
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s48, 4
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s49, 5
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s50, 6
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s51, 7
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s52, 8
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s53, 9
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s54, 10
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s55, 11
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s64, 12
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s65, 13
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s66, 14
-; CHECK-SDAG-NEXT: v_writelane_b32 v31, s67, 15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s64, v30
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s55, v21
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s54, v20
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s53, v19
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s52, v18
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s51, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s50, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s49, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s48, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s39, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s38, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s37, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s63, v29
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s62, v28
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s60, v26
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s59, v25
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s58, v24
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s57, v23
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s56, v22
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s47, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s46, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s45, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s44, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s43, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s42, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s41, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s40, v6
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(2)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s67, v0
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(1)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-SDAG-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s65, v27
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[36:67]
+; CHECK-SDAG-NEXT: ; use v[2:33]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_readlane_b32 s67, v31, 15
-; CHECK-SDAG-NEXT: v_readlane_b32 s66, v31, 14
-; CHECK-SDAG-NEXT: v_readlane_b32 s65, v31, 13
-; CHECK-SDAG-NEXT: v_readlane_b32 s64, v31, 12
-; CHECK-SDAG-NEXT: v_readlane_b32 s55, v31, 11
-; CHECK-SDAG-NEXT: v_readlane_b32 s54, v31, 10
-; CHECK-SDAG-NEXT: v_readlane_b32 s53, v31, 9
-; CHECK-SDAG-NEXT: v_readlane_b32 s52, v31, 8
-; CHECK-SDAG-NEXT: v_readlane_b32 s51, v31, 7
-; CHECK-SDAG-NEXT: v_readlane_b32 s50, v31, 6
-; CHECK-SDAG-NEXT: v_readlane_b32 s49, v31, 5
-; CHECK-SDAG-NEXT: v_readlane_b32 s48, v31, 4
-; CHECK-SDAG-NEXT: v_readlane_b32 s39, v31, 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s38, v31, 2
-; CHECK-SDAG-NEXT: v_readlane_b32 s37, v31, 1
-; CHECK-SDAG-NEXT: v_readlane_b32 s36, v31, 0
-; CHECK-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-SDAG-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v32i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s36, 0
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s36, v2
-; CHECK-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s32
-; CHECK-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; CHECK-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s37, 1
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s38, 2
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s39, 3
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s48, 4
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s49, 5
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s50, 6
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s51, 7
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s52, 8
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s53, 9
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s54, 10
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s55, 11
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s64, 12
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s65, 13
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s66, 14
-; CHECK-GISEL-NEXT: v_writelane_b32 v31, s67, 15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s37, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s38, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s39, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s48, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s49, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s50, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s51, v17
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s52, v18
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s53, v19
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s54, v20
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s55, v21
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s64, v30
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s40, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s41, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s42, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s43, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s44, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s45, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s46, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s47, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s56, v22
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s57, v23
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s58, v24
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s59, v25
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s60, v26
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s61, v27
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s62, v28
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s63, v29
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(2)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s65, v0
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(1)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s66, v1
+; CHECK-GISEL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-GISEL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s67, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[36:67]
+; CHECK-GISEL-NEXT: ; use v[2:33]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_readlane_b32 s67, v31, 15
-; CHECK-GISEL-NEXT: v_readlane_b32 s66, v31, 14
-; CHECK-GISEL-NEXT: v_readlane_b32 s65, v31, 13
-; CHECK-GISEL-NEXT: v_readlane_b32 s64, v31, 12
-; CHECK-GISEL-NEXT: v_readlane_b32 s55, v31, 11
-; CHECK-GISEL-NEXT: v_readlane_b32 s54, v31, 10
-; CHECK-GISEL-NEXT: v_readlane_b32 s53, v31, 9
-; CHECK-GISEL-NEXT: v_readlane_b32 s52, v31, 8
-; CHECK-GISEL-NEXT: v_readlane_b32 s51, v31, 7
-; CHECK-GISEL-NEXT: v_readlane_b32 s50, v31, 6
-; CHECK-GISEL-NEXT: v_readlane_b32 s49, v31, 5
-; CHECK-GISEL-NEXT: v_readlane_b32 s48, v31, 4
-; CHECK-GISEL-NEXT: v_readlane_b32 s39, v31, 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s38, v31, 2
-; CHECK-GISEL-NEXT: v_readlane_b32 s37, v31, 1
-; CHECK-GISEL-NEXT: v_readlane_b32 s36, v31, 0
-; CHECK-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; CHECK-GISEL-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <32 x i32> @llvm.amdgcn.readfirstlane.v32i32(<32 x i32> %src)
call void asm sideeffect "; use $0", "s"(<32 x i32> %x)
@@ -1601,24 +1024,16 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v8i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v8i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x i16> @llvm.amdgcn.readfirstlane.v8i16(<8 x i16> %src)
@@ -1630,32 +1045,16 @@ define void @test_readfirstlane_v16i16(ptr addrspace(1) %out, <16 x i16> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v16i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ; use v[2:9]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v16i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ; use v[2:9]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <16 x i16> @llvm.amdgcn.readfirstlane.v16i16(<16 x i16> %src)
@@ -1667,48 +1066,16 @@ define void @test_readfirstlane_v32i16(ptr addrspace(1) %out, <32 x i16> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v32i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v32i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <32 x i16> @llvm.amdgcn.readfirstlane.v32i16(<32 x i16> %src)
@@ -1721,48 +1088,16 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src)
; CHECK-SDAG-LABEL: test_readfirstlane_v32f16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s19, v17
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readfirstlane_v32f16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v8
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v9
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s12, v10
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s13, v11
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s14, v12
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s15, v13
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s16, v14
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s17, v15
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s18, v16
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v17
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <32 x half> @llvm.amdgcn.readfirstlane.v32f16(<32 x half> %src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
index 395abf0fca461..dc738253eb848 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ptr.ll
@@ -5,10 +5,8 @@ define void @test_readfirstlane_p0(ptr addrspace(1) %out, ptr %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_p0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ; use v[2:3]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr @llvm.amdgcn.readfirstlane.p0(ptr %src)
@@ -20,14 +18,8 @@ define void @test_readfirstlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_v3p0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s9, v7
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s8, v6
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s7, v5
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ; use v[2:7]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr> @llvm.amdgcn.readfirstlane.v3p0(<3 x ptr> %src)
@@ -39,9 +31,8 @@ define void @test_readfirstlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src)
; CHECK-SDAG-LABEL: test_readfirstlane_p3:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %src)
@@ -53,11 +44,8 @@ define void @test_readfirstlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3
; CHECK-SDAG-LABEL: test_readfirstlane_v3p3:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readfirstlane.v3p3(<3 x ptr addrspace(3)> %src)
@@ -69,9 +57,8 @@ define void @test_readfirstlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src)
; CHECK-SDAG-LABEL: test_readfirstlane_p5:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(5) @llvm.amdgcn.readfirstlane.p5(ptr addrspace(5) %src)
@@ -83,11 +70,8 @@ define void @test_readfirstlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5
; CHECK-SDAG-LABEL: test_readfirstlane_v3p5:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readfirstlane.v3p5(<3 x ptr addrspace(5)> %src)
@@ -99,9 +83,8 @@ define void @test_readfirstlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src)
; CHECK-SDAG-LABEL: test_readfirstlane_p6:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(6) @llvm.amdgcn.readfirstlane.p6(ptr addrspace(6) %src)
@@ -113,11 +96,8 @@ define void @test_readfirstlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6
; CHECK-SDAG-LABEL: test_readfirstlane_v3p6:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s6, v4
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s5, v3
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readfirstlane.v3p6(<3 x ptr addrspace(6)> %src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 7ff5eb46def38..ee5ab7ade99b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.readlane.f64(double, i32) #0
define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; use s0
@@ -18,7 +18,7 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_i32(i32 %src0, i32 %src1) #1
;
; CHECK-GISEL-LABEL: test_readlane_sreg_sreg_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s0
@@ -78,27 +78,21 @@ define amdgpu_kernel void @test_readlane_sreg_sreg_f64(double %src0, i32 %src1)
define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i32:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x4
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s0
+; CHECK-SDAG-NEXT: ; use v0
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dword s0, s[8:9], 0x4
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s0
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s0
+; CHECK-GISEL-NEXT: ; use v0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i32 asm sideeffect "; def $0", "=v"()
@@ -110,29 +104,21 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i32(i32 %src0, i32 %src1) #1
define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_i64:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ; use v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_i64:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
-; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ; use v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call i64 asm sideeffect "; def $0", "=v"()
@@ -144,29 +130,21 @@ define amdgpu_kernel void @test_readlane_vreg_sreg_i64(i64 %src0, i32 %src1) #1
define amdgpu_kernel void @test_readlane_vreg_sreg_f64(double %src0, i32 %src1) #1 {
; CHECK-SDAG-LABEL: test_readlane_vreg_sreg_f64:
; CHECK-SDAG: ; %bb.0:
-; CHECK-SDAG-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[0:1]
+; CHECK-SDAG-NEXT: ; use v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vreg_sreg_f64:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_load_dword s1, s[8:9], 0x8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readlane_b32 s0, v0, s1
-; CHECK-GISEL-NEXT: v_readlane_b32 s1, v1, s1
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[0:1]
+; CHECK-GISEL-NEXT: ; use v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_endpgm
%vgpr = call double asm sideeffect "; def $0", "=v"()
@@ -224,14 +202,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -258,15 +235,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -287,15 +262,11 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
+; CHECK-SDAG-NEXT: flat_load_dword v2, v[0:1]
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: flat_store_dword v[2:3], v0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vregs_i32:
@@ -310,14 +281,10 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s2, v1
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s2
+; CHECK-GISEL-NEXT: flat_load_dword v2, v[0:1]
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -344,15 +311,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vregs_i64:
@@ -367,16 +328,10 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
-; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -404,15 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-SDAG-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v4, s1
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s0, v2
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s1, v1, s0
-; CHECK-SDAG-NEXT: v_readlane_b32 s0, v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vregs_f64:
@@ -427,16 +376,10 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; CHECK-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s3, v2
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, s3
-; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, s3
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -493,33 +436,29 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; def v0
-; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: ;;#ASMSTART
+; CHECK-SDAG-NEXT: ; def v2
+; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; def v0
-; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: ;;#ASMSTART
+; CHECK-GISEL-NEXT: ; def v2
+; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -534,17 +473,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
-; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -552,18 +487,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -578,17 +509,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
-; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -596,18 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-LABEL: test_readlane_vgpr_imm_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -660,17 +583,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
@@ -700,17 +623,17 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
@@ -739,22 +662,16 @@ define void @test_readlane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_half:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_half:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call half @llvm.amdgcn.readlane.f16(half %src, i32 %src1)
@@ -766,22 +683,16 @@ define void @test_readlane_float(ptr addrspace(1) %out, float %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_float:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_float:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call float @llvm.amdgcn.readlane.f32(float %src, i32 %src1)
@@ -793,22 +704,16 @@ define void @test_readlane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1)
; CHECK-SDAG-LABEL: test_readlane_bfloat:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_bfloat:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call bfloat @llvm.amdgcn.readlane.bf16(bfloat %src, i32 %src1)
@@ -820,23 +725,18 @@ define void @test_readlane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
-; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 0xffff
+; CHECK-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v2
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v0
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
+; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call i16 @llvm.amdgcn.readlane.i16(i16 %src, i32 %src1)
@@ -848,22 +748,16 @@ define void @test_readlane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %sr
; CHECK-SDAG-LABEL: test_readlane_v2f16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v2f16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s4
+; CHECK-GISEL-NEXT: ; use v2
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x half> @llvm.amdgcn.readlane.v2f16(<2 x half> %src, i32 %src1)
@@ -875,24 +769,16 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s
; CHECK-SDAG-LABEL: test_readlane_v2f32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ; use v[2:3]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v2f32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v4
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s5
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s5
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:5]
+; CHECK-GISEL-NEXT: ; use v[2:3]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x float> @llvm.amdgcn.readlane.v2f32(<2 x float> %src, i32 %src1)
@@ -904,34 +790,16 @@ define void @test_readlane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src
; CHECK-SDAG-LABEL: test_readlane_v7i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v9
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:10]
+; CHECK-SDAG-NEXT: ; use v[2:8]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v7i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s10, v9
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s10
-; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s10
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:10]
+; CHECK-GISEL-NEXT: ; use v[2:8]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <7 x i32> @llvm.amdgcn.readlane.v7i32(<7 x i32> %src, i32 %src1)
@@ -943,28 +811,16 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
; CHECK-SDAG-LABEL: test_readlane_v8i16:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v8i16:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x i16> @llvm.amdgcn.readlane.v8i16(<8 x i16> %src, i32 %src1)
@@ -976,28 +832,16 @@ define void @test_readlane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src
; CHECK-SDAG-LABEL: test_readlane_v2i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v6
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:7]
+; CHECK-SDAG-NEXT: ; use v[2:5]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v2i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s7, v6
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s7
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s7
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:7]
+; CHECK-GISEL-NEXT: ; use v[2:5]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <2 x i64> @llvm.amdgcn.readlane.v2i64(<2 x i64> %src, i32 %src1)
@@ -1009,32 +853,16 @@ define void @test_readlane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src
; CHECK-SDAG-LABEL: test_readlane_v3i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ; use v[2:7]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v3i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s9, v8
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s9
-; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s9
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:9]
+; CHECK-GISEL-NEXT: ; use v[2:7]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x i64> @llvm.amdgcn.readlane.v3i64(<3 x i64> %src, i32 %src1)
@@ -1046,36 +874,16 @@ define void @test_readlane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %
; CHECK-SDAG-LABEL: test_readlane_v4f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v10
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:11]
+; CHECK-SDAG-NEXT: ; use v[2:9]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v4f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s11, v10
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s11
-; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s11
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:11]
+; CHECK-GISEL-NEXT: ; use v[2:9]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <4 x double> @llvm.amdgcn.readlane.v4f64(<4 x double> %src, i32 %src1)
@@ -1087,52 +895,16 @@ define void @test_readlane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %
; CHECK-SDAG-LABEL: test_readlane_v8f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v18
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s19, v17, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s18, v16, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s17, v15, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s16, v14, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s15, v13, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s14, v12, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s13, v11, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s12, v10, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s11, v9, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s10, v8, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:19]
+; CHECK-SDAG-NEXT: ; use v[2:17]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CHECK-GISEL-LABEL: test_readlane_v8f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s19, v18
-; CHECK-GISEL-NEXT: s_nop 3
-; CHECK-GISEL-NEXT: v_readlane_b32 s4, v2, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s5, v3, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s6, v4, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s7, v5, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s8, v6, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s9, v7, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s10, v8, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s11, v9, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s12, v10, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s13, v11, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s14, v12, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s15, v13, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s16, v14, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s17, v15, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s18, v16, s19
-; CHECK-GISEL-NEXT: v_readlane_b32 s19, v17, s19
; CHECK-GISEL-NEXT: ;;#ASMSTART
-; CHECK-GISEL-NEXT: ; use s[4:19]
+; CHECK-GISEL-NEXT: ; use v[2:17]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
%x = call <8 x double> @llvm.amdgcn.readlane.v4f64(<8 x double> %src, i32 %src1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
index ce3459506d8be..373c9dce72e20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ptr.ll
@@ -5,12 +5,8 @@ define void @test_readlane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) {
; CHECK-SDAG-LABEL: test_readlane_p0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:5]
+; CHECK-SDAG-NEXT: ; use v[2:3]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr @llvm.amdgcn.readlane.p0(ptr %src, i32 %src1)
@@ -22,16 +18,8 @@ define void @test_readlane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src1
; CHECK-SDAG-LABEL: test_readlane_v3p0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v8
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s9, v7, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s8, v6, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s7, v5, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:9]
+; CHECK-SDAG-NEXT: ; use v[2:7]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr> @llvm.amdgcn.readlane.v3p0(<3 x ptr> %src, i32 %src1)
@@ -43,11 +31,8 @@ define void @test_readlane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32
; CHECK-SDAG-LABEL: test_readlane_p3:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(3) @llvm.amdgcn.readlane.p3(ptr addrspace(3) %src, i32 %src1)
@@ -59,13 +44,8 @@ define void @test_readlane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s
; CHECK-SDAG-LABEL: test_readlane_v3p3:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(3)> @llvm.amdgcn.readlane.v3p3(<3 x ptr addrspace(3)> %src, i32 %src1)
@@ -77,11 +57,8 @@ define void @test_readlane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32
; CHECK-SDAG-LABEL: test_readlane_p5:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(5) @llvm.amdgcn.readlane.p5(ptr addrspace(5) %src, i32 %src1)
@@ -93,13 +70,8 @@ define void @test_readlane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s
; CHECK-SDAG-LABEL: test_readlane_v3p5:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(5)> @llvm.amdgcn.readlane.v3p5(<3 x ptr addrspace(5)> %src, i32 %src1)
@@ -111,11 +83,8 @@ define void @test_readlane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32
; CHECK-SDAG-LABEL: test_readlane_p6:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v3
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s4
+; CHECK-SDAG-NEXT: ; use v2
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call ptr addrspace(6) @llvm.amdgcn.readlane.p6(ptr addrspace(6) %src, i32 %src1)
@@ -127,13 +96,8 @@ define void @test_readlane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s
; CHECK-SDAG-LABEL: test_readlane_v3p6:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v5
-; CHECK-SDAG-NEXT: s_nop 3
-; CHECK-SDAG-NEXT: v_readlane_b32 s6, v4, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s5, v3, s4
-; CHECK-SDAG-NEXT: v_readlane_b32 s4, v2, s4
; CHECK-SDAG-NEXT: ;;#ASMSTART
-; CHECK-SDAG-NEXT: ; use s[4:6]
+; CHECK-SDAG-NEXT: ; use v[2:4]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
%x = call <3 x ptr addrspace(6)> @llvm.amdgcn.readlane.v3p6(<3 x ptr addrspace(6)> %src, i32 %src1)
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 586579fcaeb93..ef96944abef0e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,38 +20,33 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: ; implicit-def: $sgpr4
-; CHECK-NEXT: v_mov_b32_e32 v0, s4
-; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
-; CHECK-NEXT: s_mov_b32 s7, 0
-; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_mov_b64 s[10:11], exec
-; CHECK-NEXT: s_mov_b64 exec, -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[10:11]
+; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 5aafb0f576fb4..364598f7cf6c0 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %125:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %117:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -44,87 +44,85 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4)
; CHECK-NEXT: KILL undef %74:sreg_64
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL undef %89:sgpr_128
- ; CHECK-NEXT: KILL undef %118:sgpr_128
+ ; CHECK-NEXT: KILL undef %112:sgpr_128
+ ; CHECK-NEXT: KILL undef %87:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
- ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
@@ -135,49 +133,49 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4)
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
@@ -189,30 +187,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %470:sreg_64
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+ ; CHECK-NEXT: KILL undef %443:sreg_64
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
@@ -224,22 +222,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
- ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -351,13 +349,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index ad8dcd3888e9f..c9128984504b2 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -3477,13 +3477,10 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: s_mov_b32 s1, 0
; GFX9-W64-NEXT: s_mov_b32 s0, s1
-; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0
-; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-W64-NEXT: s_cmp_eq_u64 s[0:1], 0
; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
-; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX9-W64-NEXT: exp mrt0 off, off, off, off
; GFX9-W64-NEXT: s_endpgm
;
@@ -3491,14 +3488,11 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
; GFX10-W32: ; %bb.0:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: s_mov_b32 s1, 0
-; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0
+; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s0, s1
-; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-W32-NEXT: s_cmp_eq_u64 s[0:1], 0
; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0
-; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0
-; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX10-W32-NEXT: exp mrt0 off, off, off, off
; GFX10-W32-NEXT: s_endpgm
call void @llvm.amdgcn.init.exec(i64 0)
@@ -3527,13 +3521,11 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
+; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0
-; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0
-; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
+; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB59_2: ; %endif
@@ -3557,13 +3549,11 @@ define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
+; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0
-; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0
-; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14
-; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
+; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB59_2: ; %endif
@@ -3613,16 +3603,14 @@ define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
-; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
-; GFX9-W64-NEXT: ; kill: killed $vgpr3
; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
-; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
-; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
+; GFX9-W64-NEXT: ; kill: killed $vgpr3
+; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
-; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX9-W64-NEXT: v_add_f32_e32 v1, v4, v5
+; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: short_exact_regions_2:
@@ -3635,12 +3623,11 @@ define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
-; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
+; GFX10-W32-NEXT: v_add_f32_e32 v1, v4, v1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
-; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
-; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0