[llvm] [AMDGPU][True16][CodeGen] build_vector pattern in true16 (PR #118904)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 16:44:39 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
Add build_vector selection patterns for true16 mode: a true16 special case in GlobalISel's selectG_BUILD_VECTOR and matching GCNPat patterns in SIInstructions.td.
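For reference, a minimal sketch (a hypothetical test input, not taken from this patch) of the kind of IR this affects. Assuming the usual GFX11 run line with `-mattr=+real-true16`, a divergent two-element 16-bit build_vector should now be packed via a `REG_SEQUENCE` of `VGPR_16` halves (`lo16`/`hi16`, i.e. a `v_mov_b16` into `v0.h`) instead of the `v_perm_b32`-based packing used by fake16:

```llvm
; Hypothetical illustration, assuming a run line such as:
;   llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16
; The two insertelements form a <2 x i16> build_vector; with real true16
; instructions this should select to a REG_SEQUENCE of two VGPR_16 halves
; (lo16/hi16) rather than v_perm_b32 packing of two VGPR_32 sources.
define <2 x i16> @build_v2i16(i16 %a, i16 %b) {
  %v0 = insertelement <2 x i16> poison, i16 %a, i32 0
  %v1 = insertelement <2 x i16> %v0, i16 %b, i32 1
  ret <2 x i16> %v1
}
```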
---
Patch is 71.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118904.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+14-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+20)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+389-397)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+17-65)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3be865f03df1fd..707ad0caa83042 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -700,9 +700,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
return true;
// TODO: This should probably be a combine somewhere
- // (build_vector $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ if (Subtarget->useRealTrue16Insts() && IsVector) {
+ // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
+ // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
+ .addReg(Undef)
+ .addReg(Src0)
+ .addImm(AMDGPU::lo16);
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
+ }
+ // (build_vector $src0, undef) -> copy $src0
MI.setDesc(TII.get(AMDGPU::COPY));
MI.removeOperand(2);
const auto &RC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc25d75131cc35..c822a3ea0d8667 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3275,6 +3275,8 @@ def : GCNPat <
(COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
(COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3284,6 +3286,7 @@ def : GCNPat <
(vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
(S_LSHL_B32 SReg_32:$src1, (i32 16))
>;
+}
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3293,6 +3296,8 @@ def : GCNPat <
}
let SubtargetPredicate = HasVOP3PInsts in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3322,12 +3327,25 @@ def : GCNPat <
(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
// Take the lower 16 bits from each VGPR_32 and concat them
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
>;
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
+ (REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
+>;
+// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
+def : GCNPat <
+ (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+>;
+}
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3353,6 +3371,8 @@ def : GCNPat <
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
// Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat <
(vecTy (DivergentBinFrag<build_vector>
(Ty !if(!eq(Ty, i16),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9a647f04d43da6..230a62f447938c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -9341,7 +9341,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fadd_v3bf16:
@@ -13109,7 +13109,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fsub_v3bf16:
@@ -13778,7 +13778,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmul_v3bf16:
@@ -17856,7 +17856,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_minnum_v3bf16:
@@ -21475,7 +21475,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
@@ -28670,8 +28670,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
@@ -28774,9 +28774,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -28897,18 +28897,18 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
@@ -30480,8 +30480,8 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -31001,7 +31001,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -31921,7 +31921,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -32755,7 +32755,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -33278,7 +33278,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -34047,8 +34047,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -34745,9 +34745,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_cndmask_b32 v0, v2, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_select_v2bf16:
@@ -34851,13 +34851,14 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v2bf16:
@@ -35034,14 +35035,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s3
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s0
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_select_v2bf16:
@@ -35148,19 +35149,19 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s3
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -36868,33 +36869,32 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s1
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11TRUE16-NEXT: s_lshr_b32 s4, s3, 16
; GFX11TRUE16-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s1, s2, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s3
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, s0
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s4
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s5
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, s4
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, s3
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, s1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc_lo
-; GFX11TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
@@ -37070,29 +37070,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.l
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11TR...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/118904
More information about the llvm-commits mailing list