[llvm] [AArch64][GlobalISel] Prefer to use Vector Truncate (PR #105692)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 09:32:05 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: None (chuongg3)
<details>
<summary>Changes</summary>
Tries to combine scalarised truncates into vector truncate operations
**EXAMPLE**:
`%a(s32), %b(s32) = G_UNMERGE_VALUES %src(<2 x s32>)`
`%T_a(s16) = G_TRUNC %a(s32)`
`%T_b(s16) = G_TRUNC %b(s32)`
`%Imp(s16) = G_IMPLICIT_DEF`
`%dst(<4 x s16>) = G_BUILD_VECTOR %T_a(s16), %T_b(s16), %Imp(s16), %Imp(s16)`
**===>**
`%Imp(<2 x s32>) = G_IMPLICIT_DEF`
`%Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x s32>), %Imp(<2 x s32>)`
`%dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>)`
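As an illustration, IR of roughly the following shape can end up as the pattern above after legalization (this is a hypothetical snippet for exposition, not one of the tests changed by this patch):

```llvm
; Hypothetical input: truncate a <2 x i32> and widen the result with undef lanes.
define <4 x i16> @trunc_widen(<2 x i32> %src) {
  %t = trunc <2 x i32> %src to <2 x i16>
  %dst = shufflevector <2 x i16> %t, <2 x i16> undef,
                       <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i16> %dst
}
```

With the combine, the truncate stays in vector form, so selection should be able to use a single vector narrowing instruction (e.g. xtn) rather than per-lane extracts and scalar truncates.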
---
Patch is 305.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105692.diff
66 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+3)
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+8-1)
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+104)
- (modified) llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp (+29-3)
- (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+10-2)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+65-35)
- (modified) llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (+15)
- (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+2-1)
- (modified) llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp (+39)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir (+6-18)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir (+4-7)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll (+7-5)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir (+14-7)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir (+15-1)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/select-load.mir (+126-22)
- (modified) llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll (-1)
- (modified) llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll (-1)
- (modified) llvm/test/CodeGen/AArch64/aarch64-smull.ll (+7-9)
- (modified) llvm/test/CodeGen/AArch64/abs.ll (+2-5)
- (modified) llvm/test/CodeGen/AArch64/arm64-dup.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll (+21-24)
- (modified) llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll (+50-99)
- (modified) llvm/test/CodeGen/AArch64/arm64-ld1.ll (+38-92)
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-copy.ll (+106-123)
- (modified) llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll (+9-9)
- (modified) llvm/test/CodeGen/AArch64/arm64-tbl.ll (+75-84)
- (modified) llvm/test/CodeGen/AArch64/bitcast.ll (+30-32)
- (modified) llvm/test/CodeGen/AArch64/bswap.ll (+1-4)
- (modified) llvm/test/CodeGen/AArch64/concat-vector.ll (+69-96)
- (modified) llvm/test/CodeGen/AArch64/fabs.ll (+13-19)
- (modified) llvm/test/CodeGen/AArch64/faddsub.ll (+32-48)
- (modified) llvm/test/CodeGen/AArch64/fcmp.ll (+107-125)
- (modified) llvm/test/CodeGen/AArch64/fcopysign.ll (+14-16)
- (modified) llvm/test/CodeGen/AArch64/fcvt.ll (+91-133)
- (modified) llvm/test/CodeGen/AArch64/fdiv.ll (+16-24)
- (modified) llvm/test/CodeGen/AArch64/fexplog.ll (+160-155)
- (modified) llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll (+4-16)
- (modified) llvm/test/CodeGen/AArch64/fminimummaximum.ll (+32-48)
- (modified) llvm/test/CodeGen/AArch64/fminmax.ll (+32-48)
- (modified) llvm/test/CodeGen/AArch64/fmla.ll (+69-99)
- (modified) llvm/test/CodeGen/AArch64/fmul.ll (+16-24)
- (modified) llvm/test/CodeGen/AArch64/fneg.ll (+13-19)
- (modified) llvm/test/CodeGen/AArch64/fpow.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/fpowi.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/fptoi.ll (+20-54)
- (modified) llvm/test/CodeGen/AArch64/fptrunc.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/frem.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/fsincos.ll (+64-62)
- (modified) llvm/test/CodeGen/AArch64/fsqrt.ll (+13-19)
- (modified) llvm/test/CodeGen/AArch64/insertextract.ll (+17-40)
- (modified) llvm/test/CodeGen/AArch64/itofp.ll (+61-82)
- (modified) llvm/test/CodeGen/AArch64/llvm.exp10.ll (+5-5)
- (modified) llvm/test/CodeGen/AArch64/load.ll (+25-23)
- (modified) llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll (+14-20)
- (modified) llvm/test/CodeGen/AArch64/neon-compare-instructions.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/neon-extadd.ll (+22-22)
- (modified) llvm/test/CodeGen/AArch64/neon-perm.ll (+7-6)
- (modified) llvm/test/CodeGen/AArch64/ptradd.ll (+11-11)
- (modified) llvm/test/CodeGen/AArch64/sadd_sat_vec.ll (+20-18)
- (modified) llvm/test/CodeGen/AArch64/shift.ll (+47-107)
- (modified) llvm/test/CodeGen/AArch64/shufflevector.ll (+18-35)
- (modified) llvm/test/CodeGen/AArch64/ssub_sat_vec.ll (+20-18)
- (modified) llvm/test/CodeGen/AArch64/uadd_sat_vec.ll (+20-18)
- (modified) llvm/test/CodeGen/AArch64/usub_sat_vec.ll (+20-18)
- (modified) llvm/test/CodeGen/AArch64/xtn.ll (+6-17)
- (modified) llvm/test/CodeGen/AArch64/zext.ll (+6-8)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 05d7e882f5135c..8556692dcf4f3b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -605,6 +605,9 @@ class CombinerHelper {
bool matchRotateOutOfRange(MachineInstr &MI);
void applyRotateOutOfRange(MachineInstr &MI);
+ bool matchUseVectorTruncate(MachineInstr &MI, Register &MatchInfo);
+ void applyUseVectorTruncate(MachineInstr &MI, Register &MatchInfo);
+
/// \returns true if a G_ICMP instruction \p MI can be replaced with a true
/// or false constant based off of KnownBits information.
bool matchICmpToTrueFalseKnownBits(MachineInstr &MI, int64_t &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 2246e20ecc1dc8..16a3ddf849ca5b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1599,6 +1599,13 @@ def insert_vector_elt_oob : GICombineRule<
[{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+// Combine v8i8 (buildvector i8 (trunc(unmerge)), i8 (trunc), i8 (trunc), i8 (trunc), undef, undef, undef, undef)
+def combine_use_vector_truncate : GICombineRule<
+ (defs root:$root, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_BUILD_VECTOR):$root,
+ [{ return Helper.matchUseVectorTruncate(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyUseVectorTruncate(*${root}, ${matchinfo}); }])>;
+
def add_of_vscale : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (G_VSCALE $left, $imm1),
@@ -1875,7 +1882,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
- sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat]>;
+ sext_trunc, zext_trunc, prefer_sign_combines, combine_shuffle_concat, combine_use_vector_truncate]>;
// A combine group used to for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d930ab29846297..61cd464cd619b7 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -3367,6 +3367,110 @@ static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits,
isConstTrueVal(TLI, Cst, IsVector, IsFP);
}
+// This combine tries to reduce the number of scalarised G_TRUNC instructions by
+// using vector truncates instead
+//
+// EXAMPLE:
+// %a(s32), %b(s32) = G_UNMERGE_VALUES %src(<2 x s32>)
+// %T_a(s16) = G_TRUNC %a(s32)
+// %T_b(s16) = G_TRUNC %b(s32)
+// %Undef(s16) = G_IMPLICIT_DEF
+// %dst(<4 x s16>) = G_BUILD_VECTOR %T_a(s16), %T_b(s16), %Undef(s16), %Undef(s16)
+//
+// ===>
+// %Undef(<2 x s32>) = G_IMPLICIT_DEF
+// %Mid(<4 x s32>) = G_CONCAT_VECTORS %src(<2 x s32>), %Undef(<2 x s32>)
+// %dst(<4 x s16>) = G_TRUNC %Mid(<4 x s32>)
+bool CombinerHelper::matchUseVectorTruncate(MachineInstr &MI,
+ Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+ "Expected G_BUILD_VECTOR instruction\n");
+
+ unsigned NumOperands = MI.getNumOperands();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ // Check the G_BUILD_VECTOR sources
+ SmallVector<MachineInstr *> TruncMIs;
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ auto SrcMI = MRI.getVRegDef(MI.getOperand(i).getReg());
+ auto SrcMIOpc = SrcMI->getOpcode();
+
+ if (SrcMIOpc == TargetOpcode::G_TRUNC)
+ TruncMIs.push_back(SrcMI);
+ else if (SrcMIOpc != TargetOpcode::G_IMPLICIT_DEF)
+ return false;
+ }
+
+ if (TruncMIs.size() < 2)
+ return false;
+
+ // Check if the Trunc instructions all come from the same MI
+ auto UnmergeMI = MRI.getVRegDef(TruncMIs[0]->getOperand(1).getReg());
+ if (UnmergeMI->getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+ return false;
+
+ for (auto TruncMI : TruncMIs) {
+ auto SrcMI = MRI.getVRegDef(TruncMI->getOperand(1).getReg());
+ if (!UnmergeMI->isIdenticalTo(*SrcMI))
+ return false;
+ }
+
+ // Check the size of unmerge source
+ unsigned numOps = UnmergeMI->getNumOperands();
+ MatchInfo = UnmergeMI->getOperand(numOps - 1).getReg();
+ LLT UnmergeSrcTy = MRI.getType(MatchInfo);
+ unsigned DstTyNumElt = DstTy.getNumElements();
+ unsigned UnmergeSrcTyNumElt = UnmergeSrcTy.getNumElements();
+ if (DstTyNumElt % UnmergeSrcTyNumElt != 0)
+ return false;
+
+ // If post legalizer, ensure generated instructions are legal
+ if (!IsPreLegalize) {
+ LLT MidTy = DstTy.changeElementSize(UnmergeSrcTy.getScalarSizeInBits());
+
+ if (DstTyNumElt != UnmergeSrcTyNumElt &&
+ !isLegal({TargetOpcode::G_CONCAT_VECTORS, {MidTy, UnmergeSrcTy}}))
+ return false;
+
+ if (!isLegal({TargetOpcode::G_TRUNC, {DstTy, MidTy}}))
+ return false;
+ }
+
+ return true;
+}
+
+void CombinerHelper::applyUseVectorTruncate(MachineInstr &MI,
+ Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
+ "Expected G_BUILD_VECTOR instruction\n");
+
+ Register MidReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT UnmergeSrcTy = MRI.getType(MatchInfo);
+ unsigned DstTyNumElt = DstTy.getNumElements();
+ unsigned UnmergeSrcTyNumElt = UnmergeSrcTy.getNumElements();
+
+ // No need to pad vector if only G_TRUNC is needed
+ if (DstTyNumElt / UnmergeSrcTyNumElt == 1) {
+ MidReg = MatchInfo;
+ } else {
+ Register UndefReg = Builder.buildUndef(UnmergeSrcTy).getReg(0);
+ SmallVector<Register> ConcatRegs = {MatchInfo};
+ for (unsigned i = 1; i < DstTyNumElt / UnmergeSrcTyNumElt; ++i)
+ ConcatRegs.push_back(UndefReg);
+
+ MidReg = Builder
+ .buildConcatVectors(DstTy.changeElementSize(
+ UnmergeSrcTy.getScalarSizeInBits()),
+ ConcatRegs)
+ .getReg(0);
+ }
+
+ Builder.buildTrunc(DstReg, MidReg);
+ MI.eraseFromParent();
+}
+
bool CombinerHelper::matchNotCmp(MachineInstr &MI,
SmallVectorImpl<Register> &RegsToNegate) {
assert(MI.getOpcode() == TargetOpcode::G_XOR);
diff --git a/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
index 26752369a7711a..c44fe3bcd9cf22 100644
--- a/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
@@ -61,15 +61,41 @@ bool GIMatchTableExecutor::isBaseWithConstantOffset(
bool GIMatchTableExecutor::isObviouslySafeToFold(MachineInstr &MI,
MachineInstr &IntoMI) const {
+ auto IntoMIIter = IntoMI.getIterator();
+
// Immediate neighbours are already folded.
if (MI.getParent() == IntoMI.getParent() &&
- std::next(MI.getIterator()) == IntoMI.getIterator())
+ std::next(MI.getIterator()) == IntoMIIter)
return true;
// Convergent instructions cannot be moved in the CFG.
if (MI.isConvergent() && MI.getParent() != IntoMI.getParent())
return false;
- return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() &&
- !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty();
+ if (MI.isLoadFoldBarrier())
+ return false;
+
+ // If the load is simple, check instructions between MI and IntoMI
+ if (MI.mayLoad() && MI.getParent() == IntoMI.getParent()) {
+ if (MI.memoperands_empty())
+ return false;
+ auto &MMO = **(MI.memoperands_begin());
+ if (MMO.isAtomic() || MMO.isVolatile())
+ return false;
+
+ // Ensure instructions between MI and IntoMI are not affected when combined
+ unsigned Iter = 0;
+ const unsigned MaxIter = 20;
+ for (auto CurrMI = MI.getIterator(); CurrMI != IntoMIIter; ++CurrMI) {
+ if (CurrMI->isLoadFoldBarrier())
+ return false;
+
+ if (Iter++ == MaxIter)
+ return false;
+ }
+
+ return true;
+ }
+
+ return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 3f717c8a60050f..c8724d1c610324 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -290,6 +290,13 @@ def combine_mul_cmlt : GICombineRule<
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;
+def lower_build_insert_vec_elt : GICombineRule<
+ (defs root:$root, register_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_BUILD_VECTOR):$root,
+ [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]),
+ (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels.
// (E.g. ones that facilitate matching for the selector) For example, matching
// pseudos.
@@ -300,7 +307,8 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mull,
- vector_unmerge_lowering, insertelt_nonconst]> {
+ vector_unmerge_lowering, insertelt_nonconst,
+ lower_build_insert_vec_elt]> {
}
// Post-legalization combines which are primarily optimizations.
@@ -322,5 +330,5 @@ def AArch64PostLegalizerCombiner
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating,
- combine_mul_cmlt]> {
+ combine_mul_cmlt, combine_use_vector_truncate]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba9242768a..3ca92c5ffa6bac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3302,6 +3302,10 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
+ [(vector_insert undef, node:$src, (i64 0)),
+ (scalar_to_vector node:$src)]>;
+
// For regular load, we do not have any alignment requirement.
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
@@ -3310,13 +3314,13 @@ multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
ValueType ScalTy, ValueType VecTy,
Instruction LOADW, Instruction LOADX,
SubRegIndex sub> {
- def : Pat<(VecTy (scalar_to_vector (ScalTy
+ def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
(loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
sub)>;
- def : Pat<(VecTy (scalar_to_vector (ScalTy
+ def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
(loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
@@ -3344,12 +3348,12 @@ defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
ro_Wextend64:$extend))))),
(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
ro_Xextend64:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
@@ -3482,38 +3486,65 @@ def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32
+def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32
+def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32
+def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32
+def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32
+def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32
+def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (scalar_to_vector (i64
+def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+def : Pat<(v2i32 (vector_insert (v2i32 undef), (i32 GPR32:$Rn), (i64 0))),
+ (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v4i32 (vector_insert (v4i32 undef), (i32 GPR32:$Rn), (i64 0))),
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v1i64 (vector_insert (v1i64 undef), (i64 GPR64:$Rn), (i64 0))),
+ (INSERT_SUBREG (v1i64 (IMPLICIT_DEF)), GPR64:$Rn, dsub)>;
+def : Pat<(v2i64 (vector_insert (v2i64 undef), (i64 GPR64:$Rn), (i64 0))),
+ (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GPR64:$Rn, dsub)>;
+
+def : Pat<(v8i8 (vec_ins_or_scal_vec (i8 (vector_extract (v8i8 V64:$Rm), (i64 0))))),
+ (v8i8 V64:$Rm)>;
+def : Pat<(v4i16 (vec_ins_or_scal_vec (i16 (vector_extract (v4i16 V64:$Rm), (i64 0))))),
+ (v4i16 V64:$Rm)>;
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 (vector_extract (v2i32 V64:$Rm), (i64 0))))),
+ (v2i32 V64:$Rm)>;
+def : Pat<(v1i64 (vec_ins_or_scal_vec (i32 (vector_extract (v1i64 V64:$Rm), (i64 0))))),
+ (v1i64 V64:$Rm)>;
+
+def : Pat<(v16i8 (vec_ins_or_scal_vec (i8 (vector_extract (v16i8 V128:$Rm), (i64 0))))),
+ (v16i8 V128:$Rm)>;
+def : Pat<(v8i16 (vec_ins_or_scal_vec (i16 (vector_extract (v8i16 V128:$Rm), (i64 0))))),
+ (v8i16 V128:$Rm)>;
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 (vector_extract (v4i32 V128:$Rm), (i64 0))))),
+ (v4i32 V128:$Rm)>;
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 (vector_extract (v2i64 V128:$Rm), (i64 0))))),
+ (v2i64 V128:$Rm)>;
+
// Match all load 64 bits width whose type is compatible with FPR64
let Predicates = [IsLE] in {
// We must use LD1 to perform vector loads in big-endian.
@@ -6824,10 +6855,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
defm INS : SIMDIns;
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
@@ -6835,50 +6866,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))),
(SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>;
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMP...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/105692
More information about the llvm-commits mailing list