[llvm] [AMDGPU][True16][CodeGen] use vgpr16 for zext patterns (#153894) (PR #154211)

via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 18 14:57:36 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Brox Chen (broxigarchen)

<details>
<summary>Changes</summary>

This recreates the patch from https://github.com/llvm/llvm-project/pull/153894.

This patch causes the upstream HIP test to fail, but it should not cause a real failure in the downstream branch.

---

Patch is 1.87 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154211.diff


45 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+22) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+5636-6265) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+546-602) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+620-700) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+1352-1534) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+108-132) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+2537-2877) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+287-350) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+283-311) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll (+1) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+28-36) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+18-24) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+20-22) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+20-22) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+49-51) 
- (modified) llvm/test/CodeGen/AMDGPU/clamp-modifier.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+20-22) 
- (modified) llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+48-58) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+50-60) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+50-60) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+48-58) 
- (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+2-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+2-4) 
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+118-133) 
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+105-117) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+48-58) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+50-60) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+50-60) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+48-58) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+20-21) 
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+12-16) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+26-34) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+32-36) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+32-36) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+26-34) 
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+24-7) 
- (modified) llvm/test/CodeGen/AMDGPU/mad.u16.ll (+3-4) 
- (modified) llvm/test/CodeGen/AMDGPU/preserve-hi16.ll (+33-21) 
- (modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+2-4) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+56-70) 
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+34-44) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..6488fa3dacfb3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3056,6 +3056,8 @@ def : GCNPat<
 }
 }  // AddedComplexity = 1
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat<
   (i32 (DivergentUnaryFrag<zext> i16:$src)),
   (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
@@ -3071,6 +3073,26 @@ def : GCNPat<
 def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<zext> i16:$src)),
+  (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<zext> i16:$src)),
+  (REG_SEQUENCE VReg_64,
+    (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16), sub0,
+    (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
+  (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
+  (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
+>;
+}
 
 def : GCNPat <
   (i32 (trunc i64:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 01854c8560ce2..637aaf7529364 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538215f18..d03d6a8940b2f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
 ; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
@@ -6549,307 +6549,266 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v162.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v39.h
 ; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v65.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v39.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v150.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v147.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v54.l
 ; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v39.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/154211


More information about the llvm-commits mailing list