[llvm] [AMDGPU][True16][Codegen] remove packed build_vector pattern from true16 (PR #148715)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 15 10:20:02 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

<details>
<summary>Changes</summary>

Some of the packed build_vector patterns use vgpr_32 for i16/f16/bf16 operands. In true16 mode, this causes ISel to insert an illegal copy, "vgpr32 = copy vgpr16". This illegal copy confuses the CSE pass and triggers incorrect code elimination.

Remove the packed build_vector patterns from true16 mode. After removal, ISel uses the vgpr16 patterns for these cases instead.

---

Patch is 3.43 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148715.diff


45 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10-8) 
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+8-6) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+5976-5930) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+959-994) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+664-644) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+1516-1430) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+114-102) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+43-44) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+2797-2681) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+550-575) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+541-534) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+538-303) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+152-150) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+152-150) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+152-150) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+81-80) 
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+15-13) 
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+244-242) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+42-15) 
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+62-61) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+228-304) 
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll (+21-35) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+228-304) 
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+130-127) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+117-114) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+356-338) 
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+8-4) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll (+84-40) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+93-22) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+93-22) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+61-14) 
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+33-14) 
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+3406-3679) 
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+3406-3679) 
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+8-6) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+2-2) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4419ce00b473c..e36d34cbe95a4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3451,30 +3451,32 @@ def : GCNPat <
   (S_LSHL_B32 SReg_32:$src1, (i16 16))
 >;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
   (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
 >;
 
-
 def : GCNPat <
-  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
-  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+  (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+  (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
 >;
 
 def : GCNPat <
-  (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
-  (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+  (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+  (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
 >;
+}
 
 def : GCNPat <
-  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
 >;
 
 def : GCNPat <
-  (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
-  (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
 >;
 
 foreach vecTy = [v2i16, v2f16, v2bf16] in {
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 50d20e9b0e4d7..6cb236dbee76e 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -780,7 +780,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
@@ -789,11 +790,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v1, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, 0, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index cb2f0f28a29d6..0d5f538215f18 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v30
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v24
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 24, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 24, v6
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v30
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v24
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 24, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 24, v6
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
 ; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
@@ -6549,319 +6549,314 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v162.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, 0
 ; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff, v66
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v2.h, v2.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v33.h
+; GFX...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/148715


More information about the llvm-commits mailing list