[llvm] [AMDGPU] Add branching to vector rebroadcast tests. (PR #95122)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 11 07:07:27 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Leon Clark (PeddleSpam)

<details>
<summary>Changes</summary>



---

The patch is 256.08 KiB and has been truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/95122.diff


1 File Affected:

- (modified) llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll (+3899-1507) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index b079a94b5fcc3..7e5cfab431cca 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -2,1870 +2,4262 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
-define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
+define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0, i1 %cond) {
 ; GFX9-LABEL: shuffle_v2i8_rebroadcast:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:                                ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:                                          ; implicit-def: $vgpr2
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_2
+; GFX9-NEXT:  ; %bb.1:                                ; %then
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
+; GFX9-NEXT:  .LBB0_2:                                ; %finally
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i8_rebroadcast:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10:       ; %bb.0:                                ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:                                          ; implicit-def: $vgpr2
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB0_2
+; GFX10-NEXT:  ; %bb.1:                                ; %then
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v0
+; GFX10-NEXT:  .LBB0_2:                                ; %finally
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11:       ; %bb.0:                                ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:                                          ; implicit-def: $vgpr2
+; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT:    s_cbranch_execz .LBB0_2
+; GFX11-NEXT:  ; %bb.1:                                ; %then
+; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b16 v2, 8, v0
+; GFX11-NEXT:  .LBB0_2:                                ; %finally
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
+  br i1 %cond, label %then, label %else
+
+then:
   %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
-  ret <2 x i8> %val1
+  br label %finally
+
+else:
+  %val2 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 2, i32 2>
+  br label %finally
+
+finally:
+  %val3 = phi <2 x i8> [ %val1, %then ], [ %val2, %else ]
+  ret <2 x i8> %val3
 }
 
-define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
+define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0, i1 %cond) {
 ; GFX9-LABEL: shuffle_v4i8_rebroadcast:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:                                ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:                                          ; implicit-def: $vgpr0
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX9-NEXT:    s_cbranch_execz .LBB1_2
+; GFX9-NEXT:  ; %bb.1:                                ; %else
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:                                          ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB1_2:                                ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB1_4
+; GFX9-NEXT:  ; %bb.3:                                ; %then
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:  .LBB1_4:                                ; %finally
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i8_rebroadcast:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10:       ; %bb.0:                                ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT:                                          ; implicit-def: $vgpr0
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    s_and_saveexec_b32 s5, s4
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execz .LBB1_2
+; GFX10-NEXT:  ; %bb.1:                                ; %else
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:                                          ; implicit-def: $vgpr1
+; GFX10-NEXT:  .LBB1_2:                                ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB1_4
+; GFX10-NEXT:  ; %bb.3:                                ; %then
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX10-NEXT:  .LBB1_4:                                ; %finally
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11:       ; %bb.0:                                ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:                                          ; implicit-def: $vgpr0
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execz .LBB1_2
+; GFX11-NEXT:  ; %bb.1:                                ; %else
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:                                          ; implicit-def: $vgpr1
+; GFX11-NEXT:  .LBB1_2:                                ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB1_4
+; GFX11-NEXT:  ; %bb.3:                                ; %then
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:  .LBB1_4:                                ; %finally
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i8>, ptr addrspace(1) %arg0
+  br i1 %cond, label %then, label %else
+
+then:
   %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  ret <4 x i8> %val1
+  br label %finally
+
+else:
+  %val2 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  br label %finally
+
+finally:
+  %val3 = phi <4 x i8> [ %val1, %then ], [ %val2, %else ]
+  ret <4 x i8> %val3
 }
 
-define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
+define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0, i1 %cond) {
 ; GFX9-LABEL: shuffle_v8i8_rebroadcast:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9:       ; %bb.0:                                ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:                                          ; implicit-def: $vgpr3
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX9-NEXT:  ; %bb.1:                                ; %else
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:                                          ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:  ; %bb.2:                                ; %Flow
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:  ; %bb.3:                                ; %then
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:  ; %bb.4:                                ; %finally
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v3
+; GFX9-NEXT:    v_or_b32_sdwa v8, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v3
+; GFX9-NEXT:    v_or_b32_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX9-NEXT:    v_or_b32_sdwa v10, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i8_rebroadcast:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10:       ; %bb.0:                                ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX10-NEXT:                                          ; implicit-def: $vgpr3
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    s_xor_b32 s4, vcc_lo, -1
+; GFX10-NEXT:    s_and_saveexec_b32 s5, s4
+; GFX10-NEXT:    s_xor_b32 s4, exec_lo, s5
+; GFX10-NEXT:  ; %bb.1:                                ; %else
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX10-NEXT:                                          ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:  ; %bb.2:                                ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s4, s4
+; GFX10-NEXT:  ; %bb.3:                                ; %then
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:  ; %bb.4:                                ; %finally
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v3
+; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v3
+; GFX10-NEXT:    v_lshlrev_b16 v4, 8, v3
+; GFX10-NEXT:    v_or_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v8, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX10-NEXT:    v_or_b32_sdwa v10, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[9:10]
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX10-NEXT:    v_mov_b32_e32 v4, v8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11:       ; %bb.0:                                ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX11-NEXT:                                          ; implicit-def: $vgpr3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    s_xor_b32 s0, vcc_lo, -1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s1
+; GFX11-NEXT:  ; %bb.1:                                ; %else
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:                                          ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT:  ; %bb.2:                                ; %Flow
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:  ; %bb.3:                                ; %then
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-NEXT:  ; %bb.4:                                ; %finally
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v3
+; GFX11-NEXT:    v_or_b32_e32 v8, v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-NEXT:    v_or_b32_e32 v2, v0, v1
+; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v5, v1, v7
+; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v1, v9, v4
+; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v4, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i8>, ptr addrspace(1) %arg0
+  br i1 %cond, label %then, label %else
+
+then:
   %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  ret <8 x i8> %val1
+  br label %finally
+
+else:
+  %val2 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  br label %finally
+
+finally:
+  %val3 = phi <8 x i8> [ %val1, %then ], [ %val2, %else ]
+  ret <8 x i8> %val3
 }
 
-define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
+define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0, i1 %cond) {
 ; GFX9-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/95122


More information about the llvm-commits mailing list