[llvm] 3a98934 - [AArch64] Treat `@llvm.ssub.sat` the same as `@llvm.aarch64.neon.sqsub` (#140454)
Author: Folkert de Vries
Date: 2025-05-31T11:26:27+01:00
New Revision: 3a989344741949be04bc2512a4633edc1c0e69d6
URL: https://github.com/llvm/llvm-project/commit/3a989344741949be04bc2512a4633edc1c0e69d6
DIFF: https://github.com/llvm/llvm-project/commit/3a989344741949be04bc2512a4633edc1c0e69d6.diff
LOG: [AArch64] Treat `@llvm.ssub.sat` the same as `@llvm.aarch64.neon.sqsub` (#140454)
Fixes #94463
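
With this change the generic saturating-arithmetic intrinsics
(`@llvm.sadd.sat`, `@llvm.ssub.sat`, `@llvm.uadd.sat`, `@llvm.usub.sat`)
and their NEON counterparts lower to the same nodes, so combines such as
folding a `sqdmull` plus a saturating accumulate into `sqdmlal`/`sqdmlsl`
now fire for either spelling. A minimal sketch of the kind of IR that
benefits (the function name is illustrative, not taken from the test suite):

    declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
    declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)

    define <4 x i32> @demo(<4 x i32> %acc, <4 x i16> %a, <4 x i16> %b) {
      ; A sqdmull followed by a generic saturating subtract should now
      ; select sqdmlsl, just as @llvm.aarch64.neon.sqsub always has.
      %mul = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
      %res = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %mul)
      ret <4 x i32> %res
    }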
Co-authored-by: Spencer Abson <spencer.abson at arm.com>
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
llvm/test/CodeGen/AArch64/arm64-vmul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef68734fa039c..ae34e6b7dcc3c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5989,6 +5989,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
             DAG.getNode(AArch64ISD::URSHR_I, dl, Op.getOperand(1).getValueType(),
                         Op.getOperand(1), Op.getOperand(2)));
return SDValue();
+ case Intrinsic::aarch64_neon_sqadd:
+ if (Op.getValueType().isVector())
+ return DAG.getNode(ISD::SADDSAT, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ return SDValue();
+ case Intrinsic::aarch64_neon_sqsub:
+ if (Op.getValueType().isVector())
+ return DAG.getNode(ISD::SSUBSAT, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ return SDValue();
+ case Intrinsic::aarch64_neon_uqadd:
+ if (Op.getValueType().isVector())
+ return DAG.getNode(ISD::UADDSAT, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ return SDValue();
+ case Intrinsic::aarch64_neon_uqsub:
+ if (Op.getValueType().isVector())
+ return DAG.getNode(ISD::USUBSAT, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ return SDValue();
case Intrinsic::aarch64_sve_whilelt:
return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
/*IsEqual=*/false);
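
The four cases above rewrite the vector forms of the NEON saturating
intrinsics to the generic ISD saturating nodes; this is what lets the
dedicated TableGen patterns for these intrinsics be dropped below. As a
minimal illustration (hypothetical IR, not part of the patch), both calls
here should now select the same uqsub instruction:

    declare <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>)
    declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>)

    define <8 x i16> @uq_demo(<8 x i16> %a, <8 x i16> %b) {
      ; The NEON spelling and the generic spelling take the same path
      ; through LowerINTRINSIC_WO_CHAIN after this change.
      %x = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b)
      %y = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %x, <8 x i16> %b)
      ret <8 x i16> %y
    }
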
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 5489541fcb318..6adf84879052f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6256,24 +6256,6 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
-multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
- def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
- (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
- def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
- (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
- def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
- (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
-
- def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
- (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
- def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
- (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
- def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
- (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
- def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
- (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
-}
-
// As above, but D sized elements unsupported.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -9861,14 +9843,15 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
}
multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
- SDPatternOperator Accum> {
+ SDPatternOperator VecAcc,
+ SDPatternOperator ScalAcc> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
asm, ".4s", ".4s", ".4h", ".h",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqdmull
+ (VecAcc (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
(dup_v8i16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx)))))]> {
@@ -9883,8 +9866,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
V128_lo, VectorIndexH,
asm#"2", ".4s", ".4s", ".8h", ".h",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqdmull
+ (VecAcc (v4i32 V128:$Rd),
+ (v4i32 (int_aarch64_neon_sqdmull
(extract_high_v8i16 (v8i16 V128:$Rn)),
(extract_high_dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))))]> {
bits<3> idx;
@@ -9898,8 +9881,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm, ".2d", ".2d", ".2s", ".s",
[(set (v2i64 V128:$dst),
- (Accum (v2i64 V128:$Rd),
- (v2i64 (int_aarch64_neon_sqdmull
+ (VecAcc (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
(v2i32 V64:$Rn),
(dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
bits<2> idx;
@@ -9912,8 +9895,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm#"2", ".2d", ".2d", ".4s", ".s",
[(set (v2i64 V128:$dst),
- (Accum (v2i64 V128:$Rd),
- (v2i64 (int_aarch64_neon_sqdmull
+ (VecAcc (v2i64 V128:$Rd),
+ (v2i64 (int_aarch64_neon_sqdmull
(extract_high_v4i32 (v4i32 V128:$Rn)),
(extract_high_dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))))]> {
bits<2> idx;
@@ -9930,8 +9913,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
let Inst{20} = idx{0};
}
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
+ def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
(v4i16 V64:$Rm))),
@@ -9942,8 +9925,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
(i64 0))>;
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
+ def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
(dup_v8i16 (v8i16 V128_lo:$Rm),
@@ -9959,8 +9942,8 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i64 FPR64Op:$dst),
- (Accum (i64 FPR64Op:$Rd),
- (i64 (int_aarch64_neon_sqdmulls_scalar
+ (ScalAcc (i64 FPR64Op:$Rd),
+ (i64 (int_aarch64_neon_sqdmulls_scalar
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))))]> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 72445172059bf..96d0146c1e752 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5968,12 +5968,12 @@ defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp
defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
-defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", saddsat>;
defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
-defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", ssubsat>;
defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", avgceils>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
@@ -5987,10 +5987,10 @@ defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp
defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
-defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", uaddsat>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
-defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", usubsat>;
defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", avgceilu>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
@@ -5999,12 +5999,6 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
int_aarch64_neon_sqrdmlsh>;
-// Extra saturate patterns, other than the intrinsics matches above
-defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>;
-defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
-
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
@@ -6720,10 +6714,8 @@ defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
-defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
- int_aarch64_neon_sqadd>;
-defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
- int_aarch64_neon_sqsub>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>;
defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
int_aarch64_neon_sqdmull>;
defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
@@ -8282,9 +8274,9 @@ defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
-defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
int_aarch64_neon_sqadd>;
-defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
int_aarch64_neon_sqrdmlah>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 7ff2e55e802c5..93f4bc423b63c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1622,8 +1622,10 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
- auto LowerBinOp = [&MI](unsigned Opcode) {
- MachineIRBuilder MIB(MI);
+ MachineIRBuilder &MIB = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ auto LowerBinOp = [&MI, &MIB](unsigned Opcode) {
MIB.buildInstr(Opcode, {MI.getOperand(0)},
{MI.getOperand(2), MI.getOperand(3)});
MI.eraseFromParent();
@@ -1642,7 +1644,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineFunction &MF = *MI.getMF();
auto Val = MF.getRegInfo().createGenericVirtualRegister(
LLT::scalar(VaListSize * 8));
- MachineIRBuilder MIB(MI);
MIB.buildLoad(Val, MI.getOperand(2),
*MF.getMachineMemOperand(MachinePointerInfo(),
MachineMemOperand::MOLoad,
@@ -1655,7 +1656,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
case Intrinsic::get_dynamic_area_offset: {
- MachineIRBuilder &MIB = Helper.MIRBuilder;
MIB.buildConstant(MI.getOperand(0).getReg(), 0);
MI.eraseFromParent();
return true;
@@ -1664,14 +1664,12 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
// Anyext the value being set to 64 bit (only the bottom 8 bits are read by
// the instruction).
- MachineIRBuilder MIB(MI);
auto &Value = MI.getOperand(3);
Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
Value.setReg(ExtValueReg);
return true;
}
case Intrinsic::aarch64_prefetch: {
- MachineIRBuilder MIB(MI);
auto &AddrVal = MI.getOperand(1);
int64_t IsWrite = MI.getOperand(2).getImm();
@@ -1694,8 +1692,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::aarch64_neon_smaxv:
case Intrinsic::aarch64_neon_uminv:
case Intrinsic::aarch64_neon_sminv: {
- MachineIRBuilder MIB(MI);
- MachineRegisterInfo &MRI = *MIB.getMRI();
bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
IntrinsicID == Intrinsic::aarch64_neon_sminv;
@@ -1720,8 +1716,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
}
case Intrinsic::aarch64_neon_uaddlp:
case Intrinsic::aarch64_neon_saddlp: {
- MachineIRBuilder MIB(MI);
-
unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
? AArch64::G_UADDLP
: AArch64::G_SADDLP;
@@ -1732,9 +1726,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
}
case Intrinsic::aarch64_neon_uaddlv:
case Intrinsic::aarch64_neon_saddlv: {
- MachineIRBuilder MIB(MI);
- MachineRegisterInfo &MRI = *MIB.getMRI();
-
unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
? AArch64::G_UADDLV
: AArch64::G_SADDLV;
@@ -1790,11 +1781,30 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerBinOp(AArch64::G_UMULL);
case Intrinsic::aarch64_neon_abs: {
// Lower the intrinsic to G_ABS.
- MachineIRBuilder MIB(MI);
MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
MI.eraseFromParent();
return true;
}
+ case Intrinsic::aarch64_neon_sqadd: {
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ return LowerBinOp(TargetOpcode::G_SADDSAT);
+ break;
+ }
+ case Intrinsic::aarch64_neon_sqsub: {
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ return LowerBinOp(TargetOpcode::G_SSUBSAT);
+ break;
+ }
+ case Intrinsic::aarch64_neon_uqadd: {
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ return LowerBinOp(TargetOpcode::G_UADDSAT);
+ break;
+ }
+ case Intrinsic::aarch64_neon_uqsub: {
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector())
+ return LowerBinOp(TargetOpcode::G_USUBSAT);
+ break;
+ }
case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 9fb8e4c8fe031..bd28d13973f9c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2539,16 +2539,16 @@ define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coer
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsr x8, x0, #16
; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000
-; CHECK-NEXT: fmov d5, x0
; CHECK-NEXT: rev32 v4.8h, v0.8h
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: sqneg v3.8h, v2.8h
; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: sqdmull v2.4s, v0.4h, v5.h[0]
-; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v5.h[0]
-; CHECK-NEXT: sqdmlal v2.4s, v4.4h, v1.4h
-; CHECK-NEXT: sqdmlal2 v0.4s, v4.8h, v1.8h
-; CHECK-NEXT: uzp2 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: fmov d3, x0
+; CHECK-NEXT: sqdmull v2.4s, v4.4h, v1.4h
+; CHECK-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h
+; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0]
+; CHECK-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0]
+; CHECK-NEXT: uzp2 v0.8h, v2.8h, v1.8h
; CHECK-NEXT: ret
entry:
%scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 499786470d4ac..937a17ca6c1e0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2863,3 +2863,187 @@ define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
%prod = mul <1 x i64> %lhs, %rhs
ret <1 x i64> %prod
}
+
+define <4 x i32> @sqdmlal4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlal4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal.4s v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
+ %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal.2d v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
+ %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlal2_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal2.4s v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+ %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal2.2d v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+ %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlal_lane_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sqdmlal.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
+ %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlal_lane_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
+ %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlal2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlal2_lane_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal2.4s v0, v1, v2[7]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+ %sum = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlal2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlal2_lane_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+ %sum = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl.4s v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %v2)
+ %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl.2d v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %v2)
+ %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl2_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl2_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl2.4s v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+ %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl2.2d v0, v1, v2
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+ %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl_lane_4s_lib(<4 x i32> %dst, <4 x i16> %v1, <4 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl_lane_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i16> %v2, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %tmp1 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %v1, <4 x i16> %tmp0)
+ %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp1)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl_lane_2d_lib(<2 x i64> %dst, <2 x i32> %v1, <2 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl_lane_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %tmp1 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %v1, <2 x i32> %tmp0)
+ %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp1)
+ ret <2 x i64> %sum
+}
+
+define <4 x i32> @sqdmlsl2_lane_4s_lib(<4 x i32> %dst, <8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: sqdmlsl2_lane_4s_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl2.4s v0, v1, v2[7]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <8 x i16> %v1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %tmp2 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp0, <4 x i16> %tmp1)
+ %sum = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %dst, <4 x i32> %tmp2)
+ ret <4 x i32> %sum
+}
+
+define <2 x i64> @sqdmlsl2_lane_2d_lib(<2 x i64> %dst, <4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: sqdmlsl2_lane_2d_lib:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmlsl2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %tmp0 = shufflevector <4 x i32> %v1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %tmp1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
+ %tmp2 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp0, <2 x i32> %tmp1)
+ %sum = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %dst, <2 x i64> %tmp2)
+ ret <2 x i64> %sum
+}