[llvm] [AMDGPU] promote i1 arg type for amdgpu_cs (PR #82971)

Nick Anderson via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 1 00:13:23 PST 2024


https://github.com/nickleus27 updated https://github.com/llvm/llvm-project/pull/82971

>From 6b4caa808702e59b65af4da9a3e7e3a455bf1495 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Mon, 26 Feb 2024 01:08:58 -0800
Subject: [PATCH 1/4] [AMDGPU] promote i1 arg type for amdgpu_cs

---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td     | 2 ++
 llvm/test/CodeGen/AMDGPU/calling-conventions.ll | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c5207228dc913f..be6bef0bd1c9e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -66,6 +66,8 @@ def RetCC_SI_Gfx : CallingConv<[
 
 def CC_SI_SHADER : CallingConv<[
 
+  CCIfType<[i1], CCPromoteToType<i16>>,
+  
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index ce1ce649c227d2..139116604d9c79 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2078,4 +2078,13 @@ entry:
   ret void
 }
 
+; test that i1 argument for amdgpu_cs is working
+define amdgpu_cs void @amdgpu_cs_i1_arg(i1 %arg0) {
+; GCN-LABEL: amdgpu_cs_i1_arg:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_endpgm
+bb:
+  ret void
+}
+
 attributes #0 = { nounwind noinline }

>From 6c7aee0ba51a45dc387d7da6723857bba0c5587c Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Tue, 27 Feb 2024 22:51:20 -0800
Subject: [PATCH 2/4] fixup! [AMDGPU] promote i1 arg type for amdgpu_cs

---
 .../CodeGen/AMDGPU/calling-conventions.ll     | 219 +++++++++++++++++-
 1 file changed, 213 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 139116604d9c79..3e636bc2e143a6 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2078,12 +2078,219 @@ entry:
   ret void
 }
 
-; test that i1 argument for amdgpu_cs is working
-define amdgpu_cs void @amdgpu_cs_i1_arg(i1 %arg0) {
-; GCN-LABEL: amdgpu_cs_i1_arg:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_endpgm
-bb:
+define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
+; SI-LABEL: amdgpu_cs_i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add i1 %arg0, %arg0
+  store i1 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
+; SI-LABEL: amdgpu_cs_v8i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_v8i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_v8i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <8 x i1> %arg0, %arg0
+  store <8 x i1> %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
+; SI-LABEL: amdgpu_cs_v16i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_v16i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_short v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_v16i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <16 x i1> %arg0, %arg0
+  store <16 x i1> %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
+; SI-LABEL: amdgpu_cs_v32i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_v32i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_v32i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <32 x i1> %arg0, %arg0
+  store <32 x i1> %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
+; SI-LABEL: amdgpu_cs_inreg_i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_inreg_i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_inreg_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add i1 %arg0, %arg0
+  store i1 %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
+; SI-LABEL: amdgpu_cs_inreg_v8i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_inreg_v8i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_inreg_v8i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <8 x i1> %arg0, %arg0
+  store <8 x i1> %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
+; SI-LABEL: amdgpu_cs_inreg_v16i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_inreg_v16i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_short v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_inreg_v16i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <16 x i1> %arg0, %arg0
+  store <16 x i1> %add, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
+; SI-LABEL: amdgpu_cs_inreg_v32i1:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_inreg_v32i1:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_inreg_v32i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %add = add <32 x i1> %arg0, %arg0
+  store <32 x i1> %add, ptr addrspace(1) undef
   ret void
 }
 

>From db357f1574321d8313b0668f9631c13c8d195b90 Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Thu, 29 Feb 2024 00:39:28 -0800
Subject: [PATCH 3/4] fixup! [AMDGPU] promote i1 arg type for amdgpu_cs

---
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |    2 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     | 1029 ++++++++++++++++-
 2 files changed, 990 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index be6bef0bd1c9e0..4be64629ddac82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[
 
 def CC_SI_SHADER : CallingConv<[
 
-  CCIfType<[i1], CCPromoteToType<i16>>,
+  CCIfType<[i1], CCPromoteToType<i32>>,
   
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 3e636bc2e143a6..0f947d82fd0ac7 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2081,216 +2081,1165 @@ entry:
 define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
 ; SI-LABEL: amdgpu_cs_i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add i1 %arg0, %arg0
-  store i1 %add, ptr addrspace(1) undef
+  store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ; SI-LABEL: amdgpu_cs_v8i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
+; SI-NEXT:    v_and_b32_e32 v6, 1, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; SI-NEXT:    v_and_b32_e32 v4, 1, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; SI-NEXT:    v_and_b32_e32 v2, 1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v1, v7, v6
+; SI-NEXT:    v_and_b32_e32 v4, 3, v4
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v0, 3, v0
+; SI-NEXT:    v_or_b32_e32 v1, v4, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; SI-NEXT:    v_and_b32_e32 v0, 15, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_v8i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e32 v6, 1, v6
+; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
+; VI-NEXT:    v_and_b32_e32 v4, 1, v4
+; VI-NEXT:    v_and_b32_e32 v2, 1, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
+; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v6, v7, v6
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    v_or_b32_e32 v4, v4, v6
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 4, v4
+; VI-NEXT:    v_and_b32_e32 v0, 15, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v4
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_v8i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 4, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <8 x i1> %arg0, %arg0
-  store <8 x i1> %add, ptr addrspace(1) undef
+  store <8 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ; SI-LABEL: amdgpu_cs_v16i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 3, v15
+; SI-NEXT:    v_and_b32_e32 v14, 1, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 1, v13
+; SI-NEXT:    v_and_b32_e32 v12, 1, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 3, v11
+; SI-NEXT:    v_and_b32_e32 v10, 1, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 1, v9
+; SI-NEXT:    v_and_b32_e32 v8, 1, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
+; SI-NEXT:    v_and_b32_e32 v6, 1, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; SI-NEXT:    v_and_b32_e32 v4, 1, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; SI-NEXT:    v_and_b32_e32 v2, 1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 2, v14
+; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 2, v10
+; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v1, v15, v14
+; SI-NEXT:    v_and_b32_e32 v5, 3, v12
+; SI-NEXT:    v_or_b32_e32 v9, v11, v10
+; SI-NEXT:    v_and_b32_e32 v8, 3, v8
+; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_and_b32_e32 v4, 3, v4
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v0, 3, v0
+; SI-NEXT:    v_or_b32_e32 v1, v5, v1
+; SI-NEXT:    v_or_b32_e32 v3, v8, v9
+; SI-NEXT:    v_or_b32_e32 v4, v4, v6
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
+; SI-NEXT:    v_and_b32_e32 v2, 15, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 4, v4
+; SI-NEXT:    v_and_b32_e32 v0, 15, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_v16i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e32 v14, 1, v14
+; VI-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
+; VI-NEXT:    v_and_b32_e32 v12, 1, v12
+; VI-NEXT:    v_and_b32_e32 v10, 1, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v9, 1, v9
+; VI-NEXT:    v_and_b32_e32 v8, 1, v8
+; VI-NEXT:    v_and_b32_e32 v6, 1, v6
+; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
+; VI-NEXT:    v_and_b32_e32 v4, 1, v4
+; VI-NEXT:    v_and_b32_e32 v2, 1, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; VI-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
+; VI-NEXT:    v_or_b32_e32 v12, v12, v13
+; VI-NEXT:    v_lshlrev_b16_e32 v11, 3, v11
+; VI-NEXT:    v_lshlrev_b16_e32 v10, 2, v10
+; VI-NEXT:    v_or_b32_e32 v8, v8, v9
+; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
+; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v14, v15, v14
+; VI-NEXT:    v_and_b32_e32 v12, 3, v12
+; VI-NEXT:    v_or_b32_e32 v10, v11, v10
+; VI-NEXT:    v_and_b32_e32 v8, 3, v8
+; VI-NEXT:    v_or_b32_e32 v6, v7, v6
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    v_or_b32_e32 v12, v12, v14
+; VI-NEXT:    v_or_b32_e32 v8, v8, v10
+; VI-NEXT:    v_mov_b32_e32 v9, 15
+; VI-NEXT:    v_or_b32_e32 v4, v4, v6
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
+; VI-NEXT:    v_and_b32_sdwa v8, v8, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 4, v4
+; VI-NEXT:    v_and_b32_e32 v0, 15, v0
+; VI-NEXT:    v_or_b32_e32 v8, v12, v8
+; VI-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_v16i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
+; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
+; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
+; GFX11-NEXT:    v_and_b32_e32 v1, 3, v8
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v15, v14
+; GFX11-NEXT:    v_and_b32_e32 v6, 3, v12
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 4, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v2, 12, v2
+; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <16 x i1> %arg0, %arg0
-  store <16 x i1> %add, ptr addrspace(1) undef
+  store <16 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ; SI-LABEL: amdgpu_cs_v32i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 1, v29
+; SI-NEXT:    v_and_b32_e32 v28, 1, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 1, v25
+; SI-NEXT:    v_and_b32_e32 v24, 1, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 1, v21
+; SI-NEXT:    v_and_b32_e32 v20, 1, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 1, v17
+; SI-NEXT:    v_and_b32_e32 v16, 1, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 1, v13
+; SI-NEXT:    v_and_b32_e32 v12, 1, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 1, v9
+; SI-NEXT:    v_and_b32_e32 v8, 1, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
+; SI-NEXT:    v_and_b32_e32 v4, 1, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 3, v31
+; SI-NEXT:    v_and_b32_e32 v30, 1, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 3, v27
+; SI-NEXT:    v_and_b32_e32 v26, 1, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 3, v23
+; SI-NEXT:    v_and_b32_e32 v22, 1, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 3, v19
+; SI-NEXT:    v_and_b32_e32 v18, 1, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 3, v15
+; SI-NEXT:    v_and_b32_e32 v14, 1, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 3, v11
+; SI-NEXT:    v_and_b32_e32 v10, 1, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 3, v7
+; SI-NEXT:    v_and_b32_e32 v6, 1, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; SI-NEXT:    v_and_b32_e32 v2, 1, v2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_or_b32_e32 v28, v28, v29
+; SI-NEXT:    v_or_b32_e32 v24, v24, v25
+; SI-NEXT:    v_or_b32_e32 v20, v20, v21
+; SI-NEXT:    v_or_b32_e32 v16, v16, v17
+; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 2, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 2, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 2, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 2, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
+; SI-NEXT:    v_or_b32_e32 v1, v31, v1
+; SI-NEXT:    v_or_b32_e32 v5, v27, v5
+; SI-NEXT:    v_or_b32_e32 v9, v23, v9
+; SI-NEXT:    v_and_b32_e32 v17, 3, v28
+; SI-NEXT:    v_and_b32_e32 v18, 3, v24
+; SI-NEXT:    v_and_b32_e32 v20, 3, v20
+; SI-NEXT:    v_or_b32_e32 v13, v19, v13
+; SI-NEXT:    v_and_b32_e32 v16, 3, v16
+; SI-NEXT:    v_or_b32_e32 v14, v15, v14
+; SI-NEXT:    v_and_b32_e32 v12, 3, v12
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    v_and_b32_e32 v8, 3, v8
+; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_and_b32_e32 v4, 3, v4
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v0, 3, v0
+; SI-NEXT:    v_or_b32_e32 v1, v17, v1
+; SI-NEXT:    v_or_b32_e32 v3, v18, v5
+; SI-NEXT:    v_or_b32_e32 v5, v20, v9
+; SI-NEXT:    v_or_b32_e32 v7, v16, v13
+; SI-NEXT:    v_or_b32_e32 v9, v12, v14
+; SI-NEXT:    v_or_b32_e32 v8, v8, v10
+; SI-NEXT:    v_or_b32_e32 v4, v4, v6
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 12, v1
+; SI-NEXT:    v_and_b32_e32 v2, 15, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 4, v5
+; SI-NEXT:    v_and_b32_e32 v5, 15, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 12, v9
+; SI-NEXT:    v_and_b32_e32 v7, 15, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
+; SI-NEXT:    v_and_b32_e32 v0, 15, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_or_b32_e32 v3, v5, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
+; SI-NEXT:    v_or_b32_e32 v0, v0, v4
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; SI-NEXT:    v_or_b32_e32 v3, v6, v5
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_v32i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e32 v6, 1, v6
+; VI-NEXT:    v_lshlrev_b16_e32 v5, 1, v5
+; VI-NEXT:    v_and_b32_e32 v4, 1, v4
+; VI-NEXT:    v_and_b32_e32 v2, 1, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v7, 3, v7
+; VI-NEXT:    v_lshlrev_b16_e32 v6, 2, v6
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v6, v7, v6
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    v_or_b32_e32 v4, v4, v6
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v4
+; VI-NEXT:    v_and_b32_e32 v0, 15, v0
+; VI-NEXT:    v_and_b32_e32 v2, 1, v30
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 3, v31
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 1, v29
+; VI-NEXT:    v_and_b32_e32 v3, 1, v28
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 3, v2
+; VI-NEXT:    v_and_b32_e32 v3, 1, v26
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 3, v27
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_and_b32_e32 v10, 1, v10
+; VI-NEXT:    v_lshlrev_b16_e32 v9, 1, v9
+; VI-NEXT:    v_and_b32_e32 v8, 1, v8
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 1, v25
+; VI-NEXT:    v_and_b32_e32 v4, 1, v24
+; VI-NEXT:    v_lshlrev_b16_e32 v11, 3, v11
+; VI-NEXT:    v_lshlrev_b16_e32 v10, 2, v10
+; VI-NEXT:    v_or_b32_e32 v8, v8, v9
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_or_b32_e32 v10, v11, v10
+; VI-NEXT:    v_and_b32_e32 v8, 3, v8
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_or_b32_e32 v8, v8, v10
+; VI-NEXT:    v_mov_b32_e32 v10, 15
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
+; VI-NEXT:    v_and_b32_sdwa v2, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v3, 1, v22
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 3, v23
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 1, v21
+; VI-NEXT:    v_and_b32_e32 v4, 1, v20
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_and_b32_e32 v4, 1, v18
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 3, v19
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
+; VI-NEXT:    v_and_b32_e32 v14, 1, v14
+; VI-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
+; VI-NEXT:    v_and_b32_e32 v12, 1, v12
+; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 1, v17
+; VI-NEXT:    v_and_b32_e32 v5, 1, v16
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
+; VI-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
+; VI-NEXT:    v_or_b32_e32 v12, v12, v13
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_or_b32_e32 v14, v15, v14
+; VI-NEXT:    v_and_b32_e32 v12, 3, v12
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v12, v12, v14
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v9, 12, v12
+; VI-NEXT:    v_and_b32_sdwa v8, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 4, v2
+; VI-NEXT:    v_and_b32_e32 v3, 15, v3
+; VI-NEXT:    v_or_b32_e32 v8, v9, v8
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_v32i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
+; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
+; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-NEXT:    v_and_b32_e32 v8, 3, v8
+; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
+; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
+; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT:    v_or_b32_e32 v13, v15, v14
+; GFX11-NEXT:    v_and_b32_e32 v12, 3, v12
+; GFX11-NEXT:    v_and_b32_e32 v3, 15, v6
+; GFX11-NEXT:    v_lshlrev_b16 v6, 1, v29
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v26
+; GFX11-NEXT:    v_and_b32_e32 v7, 1, v28
+; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v25
+; GFX11-NEXT:    v_and_b32_e32 v10, 1, v24
+; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT:    v_or_b32_e32 v1, v12, v13
+; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v27
+; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v10, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 1, v22
+; GFX11-NEXT:    v_lshlrev_b16 v10, 1, v21
+; GFX11-NEXT:    v_and_b32_e32 v12, 1, v20
+; GFX11-NEXT:    v_and_b32_e32 v13, 1, v18
+; GFX11-NEXT:    v_lshlrev_b16 v14, 1, v17
+; GFX11-NEXT:    v_and_b32_e32 v15, 1, v16
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 1, v30
+; GFX11-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v23
+; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v12, v10
+; GFX11-NEXT:    v_lshlrev_b16 v12, 3, v19
+; GFX11-NEXT:    v_lshlrev_b16 v13, 2, v13
+; GFX11-NEXT:    v_or_b32_e32 v14, v15, v14
+; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v31
+; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 3, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 3, v10
+; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-NEXT:    v_and_b32_e32 v12, 3, v14
+; GFX11-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 3, v6
+; GFX11-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
+; GFX11-NEXT:    v_or_b32_e32 v8, v12, v10
+; GFX11-NEXT:    v_lshlrev_b16 v4, 4, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX11-NEXT:    v_lshlrev_b16 v6, 4, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX11-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-NEXT:    v_lshlrev_b16 v4, 12, v5
+; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <32 x i1> %arg0, %arg0
-  store <32 x i1> %add, ptr addrspace(1) undef
+  store <32 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
 ; SI-LABEL: amdgpu_cs_inreg_i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_and_b32 s0, s0, 1
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_inreg_i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add i1 %arg0, %arg0
-  store i1 %add, ptr addrspace(1) undef
+  store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ; SI-LABEL: amdgpu_cs_inreg_v8i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s7, s7, 3
+; SI-NEXT:    s_and_b32 s6, s6, 1
+; SI-NEXT:    s_lshl_b32 s5, s5, 1
+; SI-NEXT:    s_and_b32 s4, s4, 1
+; SI-NEXT:    s_lshl_b32 s8, s3, 3
+; SI-NEXT:    s_and_b32 s2, s2, 1
+; SI-NEXT:    s_lshl_b32 s1, s1, 1
+; SI-NEXT:    s_and_b32 s0, s0, 1
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_lshl_b32 s6, s6, 2
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s2, s2, 2
+; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_or_b32 s1, s7, s6
+; SI-NEXT:    s_and_b32 s4, s4, 3
+; SI-NEXT:    s_or_b32 s2, s8, s2
+; SI-NEXT:    s_and_b32 s0, s0, 3
+; SI-NEXT:    s_or_b32 s1, s4, s1
+; SI-NEXT:    s_or_b32 s0, s0, s2
+; SI-NEXT:    s_lshl_b32 s1, s1, 4
+; SI-NEXT:    s_and_b32 s0, s0, 15
+; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v8i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e64 v1, s6, 1
+; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s7
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s5
+; VI-NEXT:    v_and_b32_e64 v2, s4, 1
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_and_b32_e32 v1, 3, v1
+; VI-NEXT:    v_and_b32_e64 v2, s2, 1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s3
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s1
+; VI-NEXT:    v_and_b32_e64 v3, s0, 1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 3, v2
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
+; VI-NEXT:    v_and_b32_e32 v1, 15, v1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v8i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e64 v1, s6, 1
+; GFX11-NEXT:    v_lshlrev_b16 v2, 1, s5
+; GFX11-NEXT:    v_and_b32_e64 v3, s4, 1
+; GFX11-NEXT:    v_and_b32_e64 v4, s2, 1
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, s1
+; GFX11-NEXT:    v_and_b32_e64 v6, s0, 1
+; GFX11-NEXT:    v_lshlrev_b16 v0, 3, s7
+; GFX11-NEXT:    v_lshlrev_b16 v1, 2, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s3
+; GFX11-NEXT:    v_lshlrev_b16 v4, 2, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b16 v0, 4, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <8 x i1> %arg0, %arg0
-  store <8 x i1> %add, ptr addrspace(1) undef
+  store <8 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ; SI-LABEL: amdgpu_cs_inreg_v16i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s15, s15, 3
+; SI-NEXT:    s_and_b32 s14, s14, 1
+; SI-NEXT:    s_lshl_b32 s13, s13, 1
+; SI-NEXT:    s_and_b32 s12, s12, 1
+; SI-NEXT:    s_lshl_b32 s11, s11, 3
+; SI-NEXT:    s_and_b32 s10, s10, 1
+; SI-NEXT:    s_lshl_b32 s9, s9, 1
+; SI-NEXT:    s_and_b32 s8, s8, 1
+; SI-NEXT:    s_lshl_b32 s7, s7, 3
+; SI-NEXT:    s_and_b32 s6, s6, 1
+; SI-NEXT:    s_lshl_b32 s5, s5, 1
+; SI-NEXT:    s_and_b32 s4, s4, 1
+; SI-NEXT:    s_lshl_b32 s16, s3, 3
+; SI-NEXT:    s_and_b32 s2, s2, 1
+; SI-NEXT:    s_lshl_b32 s1, s1, 1
+; SI-NEXT:    s_and_b32 s0, s0, 1
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_lshl_b32 s14, s14, 2
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    s_lshl_b32 s10, s10, 2
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_lshl_b32 s6, s6, 2
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s2, s2, 2
+; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_or_b32 s1, s15, s14
+; SI-NEXT:    s_and_b32 s5, s12, 3
+; SI-NEXT:    s_or_b32 s9, s11, s10
+; SI-NEXT:    s_and_b32 s8, s8, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s4, s4, 3
+; SI-NEXT:    s_or_b32 s2, s16, s2
+; SI-NEXT:    s_and_b32 s0, s0, 3
+; SI-NEXT:    s_or_b32 s1, s5, s1
+; SI-NEXT:    s_or_b32 s5, s8, s9
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_or_b32 s0, s0, s2
+; SI-NEXT:    s_lshl_b32 s1, s1, 12
+; SI-NEXT:    s_and_b32 s2, s5, 15
+; SI-NEXT:    s_lshl_b32 s4, s4, 4
+; SI-NEXT:    s_and_b32 s0, s0, 15
+; SI-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-NEXT:    s_or_b32 s0, s0, s4
+; SI-NEXT:    s_or_b32 s1, s1, s2
+; SI-NEXT:    s_and_b32 s0, s0, 0xff
+; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v16i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e64 v1, s14, 1
+; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s15
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s13
+; VI-NEXT:    v_and_b32_e64 v2, s12, 1
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_and_b32_e32 v1, 3, v1
+; VI-NEXT:    v_and_b32_e64 v2, s10, 1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s11
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s9
+; VI-NEXT:    v_and_b32_e64 v3, s8, 1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 3, v2
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_mov_b32_e32 v2, 15
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
+; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e64 v2, s6, 1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s7
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s5
+; VI-NEXT:    v_and_b32_e64 v3, s4, 1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 3, v2
+; VI-NEXT:    v_and_b32_e64 v3, s2, 1
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 3, s3
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s1
+; VI-NEXT:    v_and_b32_e64 v4, s0, 1
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; VI-NEXT:    v_and_b32_e32 v2, 15, v2
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v16i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e64 v0, s10, 1
+; GFX11-NEXT:    v_lshlrev_b16 v2, 1, s13
+; GFX11-NEXT:    v_and_b32_e64 v3, s12, 1
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, s9
+; GFX11-NEXT:    v_and_b32_e64 v6, s8, 1
+; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s11
+; GFX11-NEXT:    v_lshlrev_b16 v0, 2, v0
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e64 v8, s4, 1
+; GFX11-NEXT:    v_or_b32_e32 v3, v6, v5
+; GFX11-NEXT:    v_and_b32_e64 v5, s6, 1
+; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s5
+; GFX11-NEXT:    v_and_b32_e64 v9, s2, 1
+; GFX11-NEXT:    v_lshlrev_b16 v10, 1, s1
+; GFX11-NEXT:    v_and_b32_e64 v11, s0, 1
+; GFX11-NEXT:    v_and_b32_e64 v1, s14, 1
+; GFX11-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s7
+; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s3
+; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
+; GFX11-NEXT:    v_lshlrev_b16 v7, 3, s15
+; GFX11-NEXT:    v_lshlrev_b16 v1, 2, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 3, v6
+; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
+; GFX11-NEXT:    v_and_b32_e32 v8, 3, v10
+; GFX11-NEXT:    v_or_b32_e32 v1, v7, v1
+; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX11-NEXT:    v_or_b32_e32 v4, v8, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v2, 4, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 15, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <16 x i1> %arg0, %arg0
-  store <16 x i1> %add, ptr addrspace(1) undef
+  store <16 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ; SI-LABEL: amdgpu_cs_inreg_v32i1:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s31, s31, 3
+; SI-NEXT:    s_and_b32 s30, s30, 1
+; SI-NEXT:    s_lshl_b32 s29, s29, 1
+; SI-NEXT:    s_and_b32 s28, s28, 1
+; SI-NEXT:    s_lshl_b32 s27, s27, 3
+; SI-NEXT:    s_and_b32 s26, s26, 1
+; SI-NEXT:    s_lshl_b32 s25, s25, 1
+; SI-NEXT:    s_and_b32 s24, s24, 1
+; SI-NEXT:    s_lshl_b32 s23, s23, 3
+; SI-NEXT:    s_and_b32 s22, s22, 1
+; SI-NEXT:    s_lshl_b32 s21, s21, 1
+; SI-NEXT:    s_and_b32 s20, s20, 1
+; SI-NEXT:    s_lshl_b32 s19, s19, 3
+; SI-NEXT:    s_and_b32 s18, s18, 1
+; SI-NEXT:    s_lshl_b32 s17, s17, 1
+; SI-NEXT:    s_and_b32 s16, s16, 1
+; SI-NEXT:    s_lshl_b32 s15, s15, 3
+; SI-NEXT:    s_and_b32 s14, s14, 1
+; SI-NEXT:    s_lshl_b32 s13, s13, 1
+; SI-NEXT:    s_and_b32 s12, s12, 1
+; SI-NEXT:    s_lshl_b32 s11, s11, 3
+; SI-NEXT:    s_and_b32 s10, s10, 1
+; SI-NEXT:    s_lshl_b32 s9, s9, 1
+; SI-NEXT:    s_and_b32 s8, s8, 1
+; SI-NEXT:    s_lshl_b32 s7, s7, 3
+; SI-NEXT:    s_and_b32 s6, s6, 1
+; SI-NEXT:    s_lshl_b32 s5, s5, 1
+; SI-NEXT:    s_and_b32 s4, s4, 1
+; SI-NEXT:    s_lshl_b32 s33, s3, 3
+; SI-NEXT:    s_and_b32 s2, s2, 1
+; SI-NEXT:    s_lshl_b32 s1, s1, 1
+; SI-NEXT:    s_and_b32 s0, s0, 1
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_lshl_b32 s30, s30, 2
+; SI-NEXT:    s_or_b32 s28, s28, s29
+; SI-NEXT:    s_lshl_b32 s26, s26, 2
+; SI-NEXT:    s_or_b32 s24, s24, s25
+; SI-NEXT:    s_lshl_b32 s22, s22, 2
+; SI-NEXT:    s_or_b32 s20, s20, s21
+; SI-NEXT:    s_lshl_b32 s18, s18, 2
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_lshl_b32 s14, s14, 2
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    s_lshl_b32 s10, s10, 2
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_lshl_b32 s6, s6, 2
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_lshl_b32 s2, s2, 2
+; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_or_b32 s1, s31, s30
+; SI-NEXT:    s_and_b32 s5, s28, 3
+; SI-NEXT:    s_or_b32 s9, s27, s26
+; SI-NEXT:    s_and_b32 s13, s24, 3
+; SI-NEXT:    s_or_b32 s17, s23, s22
+; SI-NEXT:    s_and_b32 s20, s20, 3
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s16, s16, 3
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    s_and_b32 s12, s12, 3
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_and_b32 s8, s8, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s4, s4, 3
+; SI-NEXT:    s_or_b32 s2, s33, s2
+; SI-NEXT:    s_and_b32 s0, s0, 3
+; SI-NEXT:    s_or_b32 s1, s5, s1
+; SI-NEXT:    s_or_b32 s5, s13, s9
+; SI-NEXT:    s_or_b32 s7, s20, s17
+; SI-NEXT:    s_or_b32 s9, s16, s18
+; SI-NEXT:    s_or_b32 s11, s12, s14
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_or_b32 s0, s0, s2
+; SI-NEXT:    s_lshl_b32 s1, s1, 12
+; SI-NEXT:    s_and_b32 s2, s5, 15
+; SI-NEXT:    s_lshl_b32 s5, s7, 4
+; SI-NEXT:    s_and_b32 s6, s9, 15
+; SI-NEXT:    s_lshl_b32 s7, s11, 12
+; SI-NEXT:    s_and_b32 s8, s8, 15
+; SI-NEXT:    s_lshl_b32 s4, s4, 4
+; SI-NEXT:    s_and_b32 s0, s0, 15
+; SI-NEXT:    s_lshl_b32 s2, s2, 8
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_lshl_b32 s6, s8, 8
+; SI-NEXT:    s_or_b32 s0, s0, s4
+; SI-NEXT:    s_or_b32 s1, s1, s2
+; SI-NEXT:    s_and_b32 s2, s5, 0xff
+; SI-NEXT:    s_or_b32 s4, s7, s6
+; SI-NEXT:    s_and_b32 s0, s0, 0xff
+; SI-NEXT:    s_or_b32 s1, s2, s1
+; SI-NEXT:    s_or_b32 s0, s0, s4
+; SI-NEXT:    s_lshl_b32 s1, s1, 16
+; SI-NEXT:    s_and_b32 s0, s0, 0xffff
+; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v32i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_and_b32_e64 v1, s14, 1
+; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s15
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s13
+; VI-NEXT:    v_and_b32_e64 v2, s12, 1
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_and_b32_e32 v1, 3, v1
+; VI-NEXT:    v_and_b32_e64 v2, s10, 1
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s11
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s9
+; VI-NEXT:    v_and_b32_e64 v3, s8, 1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_and_b32_e32 v2, 3, v2
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_mov_b32_e32 v2, 15
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
+; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e64 v3, s6, 1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s7
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_or_b32_e32 v1, v1, v3
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s5
+; VI-NEXT:    v_and_b32_e64 v4, s4, 1
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_and_b32_e64 v4, s2, 1
+; VI-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s3
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
+; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s1
+; VI-NEXT:    v_and_b32_e64 v5, s0, 1
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
+; VI-NEXT:    v_and_b32_e32 v3, 15, v3
+; VI-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-NEXT:    v_and_b32_e64 v3, s30, 1
+; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s31
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_or_b32_e32 v1, v1, v3
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s29
+; VI-NEXT:    v_and_b32_e64 v4, s28, 1
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_and_b32_e64 v4, s26, 1
+; VI-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s27
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
+; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s25
+; VI-NEXT:    v_and_b32_e64 v5, s24, 1
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
+; VI-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e64 v3, s22, 1
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v2, 3, s23
+; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s21
+; VI-NEXT:    v_and_b32_e64 v4, s20, 1
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_and_b32_e32 v3, 3, v3
+; VI-NEXT:    v_and_b32_e64 v4, s18, 1
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s19
+; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
+; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s17
+; VI-NEXT:    v_and_b32_e64 v5, s16, 1
+; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v4, 3, v4
+; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 4, v2
+; VI-NEXT:    v_and_b32_e32 v3, 15, v3
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v32i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_and_b32_e64 v0, s14, 1
+; GFX11-NEXT:    v_lshlrev_b16 v1, 1, s13
+; GFX11-NEXT:    v_and_b32_e64 v2, s12, 1
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s15
+; GFX11-NEXT:    v_lshlrev_b16 v4, 1, s9
+; GFX11-NEXT:    v_lshlrev_b16 v0, 2, v0
+; GFX11-NEXT:    v_and_b32_e64 v5, s8, 1
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_and_b32_e64 v2, s10, 1
+; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s5
+; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s11
+; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-NEXT:    v_and_b32_e64 v5, s6, 1
+; GFX11-NEXT:    v_and_b32_e64 v7, s4, 1
+; GFX11-NEXT:    v_lshlrev_b16 v8, 1, s1
+; GFX11-NEXT:    v_and_b32_e64 v9, s0, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 3, v4
+; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s7
+; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX11-NEXT:    v_and_b32_e64 v7, s2, 1
+; GFX11-NEXT:    v_and_b32_e32 v1, 3, v1
+; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_and_b32_e32 v5, 3, v6
+; GFX11-NEXT:    v_lshlrev_b16 v6, 3, s3
+; GFX11-NEXT:    v_lshlrev_b16 v7, 2, v7
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 3, v8
+; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s29
+; GFX11-NEXT:    v_and_b32_e64 v7, s28, 1
+; GFX11-NEXT:    v_lshlrev_b16 v9, 1, s25
+; GFX11-NEXT:    v_and_b32_e64 v10, s24, 1
+; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX11-NEXT:    v_and_b32_e64 v4, s26, 1
+; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s27
+; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, v10, v9
+; GFX11-NEXT:    v_and_b32_e64 v9, s22, 1
+; GFX11-NEXT:    v_lshlrev_b16 v4, 2, v4
+; GFX11-NEXT:    v_lshlrev_b16 v10, 1, s21
+; GFX11-NEXT:    v_and_b32_e64 v12, s20, 1
+; GFX11-NEXT:    v_and_b32_e64 v13, s18, 1
+; GFX11-NEXT:    v_lshlrev_b16 v14, 1, s17
+; GFX11-NEXT:    v_and_b32_e64 v15, s16, 1
+; GFX11-NEXT:    v_and_b32_e64 v5, s30, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, v8, v4
+; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s23
+; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
+; GFX11-NEXT:    v_or_b32_e32 v10, v12, v10
+; GFX11-NEXT:    v_lshlrev_b16 v12, 3, s19
+; GFX11-NEXT:    v_lshlrev_b16 v13, 2, v13
+; GFX11-NEXT:    v_or_b32_e32 v14, v15, v14
+; GFX11-NEXT:    v_lshlrev_b16 v11, 3, s31
+; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX11-NEXT:    v_and_b32_e32 v7, 3, v7
+; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-NEXT:    v_and_b32_e32 v9, 3, v10
+; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-NEXT:    v_and_b32_e32 v12, 3, v14
+; GFX11-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 3, v6
+; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
+; GFX11-NEXT:    v_or_b32_e32 v8, v12, v10
+; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX11-NEXT:    v_lshlrev_b16 v2, 4, v2
+; GFX11-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX11-NEXT:    v_lshlrev_b16 v6, 4, v7
+; GFX11-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX11-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-NEXT:    v_lshlrev_b16 v3, 12, v5
+; GFX11-NEXT:    v_lshlrev_b16 v4, 8, v4
+; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %add = add <32 x i1> %arg0, %arg0
-  store <32 x i1> %add, ptr addrspace(1) undef
+  store <32 x i1> %arg0, ptr addrspace(1) undef
   ret void
 }
 

>From ec22dc17dfac59d7bd300128629ce19e963e27eb Mon Sep 17 00:00:00 2001
From: Nick Anderson <nickleus27 at gmail.com>
Date: Fri, 1 Mar 2024 00:09:18 -0800
Subject: [PATCH 4/4] fixup! [AMDGPU] promote i1 arg type for admgpu_cs

---
 .../CodeGen/AMDGPU/calling-conventions.ll     | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 0f947d82fd0ac7..15ebdd70ae8818 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -3243,4 +3243,53 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
   ret void
 }
 
+define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) { ; amdgpu_cs entry taking one i1 argument tagged signext (no inreg)
+; SI-LABEL: amdgpu_cs_i1_sext:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_i1_sext:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_i1_sext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  store i1 %arg0, ptr addrspace(1) undef ; argument is stored unmodified; every target's checks expect v0 to be ANDed with 1 before the byte store
+  ret void
+}
+
+define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) { ; amdgpu_cs entry taking one i1 argument tagged zeroext (no inreg)
+; SI-LABEL: amdgpu_cs_i1_zext:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: amdgpu_cs_i1_zext:
+; VI:       ; %bb.0:
+; VI-NEXT:    flat_store_byte v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: amdgpu_cs_i1_zext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  store i1 %arg0, ptr addrspace(1) undef ; argument is stored unmodified; the checks expect v0 to be stored as a byte with no masking instruction
+  ret void
+}
+
 attributes #0 = { nounwind noinline }



More information about the llvm-commits mailing list