[llvm] [AMDGPU][True16][Codegen] remove packed build_vector pattern from true16 (PR #148715)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 10:20:02 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Some of the packed build_vector patterns use VGPR_32 for i16/f16/bf16 operands. In true16 mode, this causes ISel to insert an illegal copy, "vgpr32 = copy vgpr16". This illegal copy confuses the CSE pass and triggers incorrect code elimination.
Remove the packed build_vector patterns from true16 mode. After the removal, ISel uses the vgpr16 patterns for these cases instead.
---
Patch is 3.43 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148715.diff
45 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+5976-5930)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+959-994)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+664-644)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+1516-1430)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+114-102)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+43-44)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+2797-2681)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+550-575)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+541-534)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+538-303)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+152-150)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+152-150)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+152-150)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+81-80)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+15-13)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+244-242)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+42-15)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+62-61)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+228-304)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll (+21-35)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+228-304)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+130-127)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+117-114)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+356-338)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll (+84-40)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+93-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+93-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+61-14)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+33-14)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+3406-3679)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+3406-3679)
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4419ce00b473c..e36d34cbe95a4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3451,30 +3451,32 @@ def : GCNPat <
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
>;
-
def : GCNPat <
- (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
- (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
def : GCNPat <
- (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
- (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
+}
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
- (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
foreach vecTy = [v2i16, v2f16, v2bf16] in {
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 50d20e9b0e4d7..6cb236dbee76e 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -780,7 +780,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
@@ -789,11 +790,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index cb2f0f28a29d6..0d5f538215f18 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -6549,319 +6549,314 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/148715
More information about the llvm-commits
mailing list