[llvm] [AMDGPU][True16][CodeGen] update isel pattern with vgpr16 for 16 bit types (PR #154875)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 8 11:03:13 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Update isel patterns with 16-bit types to use vgpr16 in true16 mode. This stops isel from generating an illegal `vgpr32 = copy vgpr16`
This includes fcopysign, scalar_to_vector and i1 trunc. Updated lit tests and added a few MIR tests.
---
Patch is 245.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154875.diff
18 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+73-6)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+188-188)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+394-239)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+426-260)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll (+4-3)
- (added) llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll (+82)
- (modified) llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+18-12)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+89-62)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+186-123)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+60-28)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+248-246)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+28-26)
- (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+3-7)
- (added) llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll (+63)
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+39-19)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6e697d34c3d3..fba5d9de28306 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2201,6 +2201,8 @@ def : GCNPat <
}
foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2231,6 +2233,42 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
+}
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fcopysign fp16vt:$src0, fp16vt:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+ (fcopysign f32:$src0, fp16vt:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+ (fcopysign f64:$src0, fp16vt:$src1),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f32:$src1),
+ (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+ (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+ (fcopysign fp16vt:$src0, f64:$src1),
+ (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
+}
} // End foreach fp16vt = [f16, bf16]
@@ -3154,6 +3192,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3171,6 +3214,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
}
def : GCNPat <
@@ -3199,11 +3247,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3807,7 +3850,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3827,6 +3871,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 49fe1eed9c514..44c719f3635c8 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -40442,11 +40442,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-LABEL: v_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0
@@ -42871,16 +42871,16 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v3.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s0
@@ -43195,28 +43195,28 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
; GFX11TRUE16-LABEL: v_vselect_v8bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v5.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v2.l
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s2
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v9.h, s1
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v14.h, v10.h, s5
@@ -43872,38 +43872,38 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h, 1, v6.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.l, 1, v9.l
+; GFX11TRUE16-NEXT: v_and_b16 v4.h, 1, v8.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.l, 1, v11.l
+; GFX11TRUE16-NEXT: v_and_b16 v5.h, 1, v10.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.l, 1, v13.l
+; GFX11TRUE16-NEXT: v_and_b16 v6.h, 1, v12.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.l, 1, v15.l
+; GFX11TRUE16-NEXT: v_and_b16 v7.h, 1, v14.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v2.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v2.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v3.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v3.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 1, v4.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 1, v4.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 1, v5.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 1, v6.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 1, v6.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 1, v5.h
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 1, v7.l
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 1, v7.h
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s10
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v30.h, v22.h, s11
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s12
@@ -45512,149 +45512,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26
+; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
+; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l
+; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l
+; GFX11TRUE16-NEXT: v_and_b16 v3.h...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154875
More information about the llvm-commits
mailing list