[llvm] [AMDGPU][True16][CodeGen] use vgpr16 for zext patterns (#153894) (PR #154211)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 18 14:57:36 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Recreates the patch from https://github.com/llvm/llvm-project/pull/153894.
This patch causes the upstream HIP test to fail, but it should not cause a real failure in the downstream branch.
---
Patch is 1.87 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154211.diff
45 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+22)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+5636-6265)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+546-602)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+620-700)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+1352-1534)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+108-132)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+2537-2877)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+287-350)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+283-311)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+28-36)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+18-24)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+20-22)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+20-22)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+49-51)
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+20-22)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+48-58)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+50-60)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+50-60)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+48-58)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+118-133)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+105-117)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+48-58)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+50-60)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+50-60)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+48-58)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+20-21)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+12-16)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+26-34)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+32-36)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+32-36)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+26-34)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+24-7)
- (modified) llvm/test/CodeGen/AMDGPU/mad.u16.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-hi16.ll (+33-21)
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+56-70)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+34-44)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..6488fa3dacfb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3056,6 +3056,8 @@ def : GCNPat<
}
} // AddedComplexity = 1
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat<
(i32 (DivergentUnaryFrag<zext> i16:$src)),
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
@@ -3071,6 +3073,26 @@ def : GCNPat<
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<zext> i16:$src)),
+ (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<zext> i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
+ (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
+ (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
+>;
+}
def : GCNPat <
(i32 (trunc i64:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 01854c8560ce2..637aaf7529364 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538215f18..d03d6a8940b2f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -6549,307 +6549,266 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154211
More information about the llvm-commits mailing list