[llvm] [AArch64] Add lowering for NEON rounding multiply (accumulate) intrinsics (PR #172851)
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 18 05:38:21 PST 2025
https://github.com/kmclaughlin-arm created https://github.com/llvm/llvm-project/pull/172851
Extends #161840 to add lowering with bitcasts for sqdmulh, sqrdmulh,
sqrdmlah & sqrdmlsh intrinsics.
>From 13e57e13001c6f3a8bed55e2ea516db35cf69bb9 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 12 Dec 2025 14:47:40 +0000
Subject: [PATCH] [AArch64] Add lowering for NEON rounding multiply
(accumulate) intrinsics
Extends #161840 to add lowering with bitcasts for sqdmulh, sqrdmulh,
sqrdmlah & sqrdmlsh intrinsics.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 19 ++--
.../lib/Target/AArch64/AArch64InstrFormats.td | 68 +++++++----
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 33 +++---
llvm/test/CodeGen/AArch64/arm64-int-neon.ll | 54 +++++++++
llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 106 ++++++++++++------
llvm/test/CodeGen/AArch64/arm64-vmul.ll | 30 ++---
6 files changed, 208 insertions(+), 102 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 47de9b745c9f6..fafe7e0fdd8e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4564,6 +4564,9 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
SelectionDAG &DAG) {
+ if (Op.getValueType().isVector())
+ return SDValue();
+
SDLoc DL(Op);
auto getFloatVT = [](EVT VT) {
assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
@@ -6428,21 +6431,21 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1).getValueType(),
Op.getOperand(1), Op.getOperand(2)));
return SDValue();
+ case Intrinsic::aarch64_neon_sqdmulh:
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
+ case Intrinsic::aarch64_neon_sqrdmulh:
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
+ case Intrinsic::aarch64_neon_sqrdmlah:
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
+ case Intrinsic::aarch64_neon_sqrdmlsh:
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
case Intrinsic::aarch64_neon_sqrshl:
- if (Op.getValueType().isVector())
- return SDValue();
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
case Intrinsic::aarch64_neon_sqshl:
- if (Op.getValueType().isVector())
- return SDValue();
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
case Intrinsic::aarch64_neon_uqrshl:
- if (Op.getValueType().isVector())
- return SDValue();
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
case Intrinsic::aarch64_neon_uqshl:
- if (Op.getValueType().isVector())
- return SDValue();
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
case Intrinsic::aarch64_neon_sqadd:
if (Op.getValueType().isVector())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 43319f7eb8a8f..736601e176da9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7733,19 +7733,31 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
}
multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, SDPatternOperator G_OpNode> {
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm,
- [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+ [(set FPR32:$Rd, (G_OpNode FPR32:$Rn, FPR32:$Rm))]>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
+
+ def : Pat<(f32 (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
}
-multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm> {
- def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
- (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
- asm, []>;
- def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
- (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
- asm, []>;
+let Predicates = [HasRDM] in {
+ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
+ SDPatternOperator OpNode, SDPatternOperator G_OpNode> {
+ def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+ (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm),
+ asm, []>;
+ def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+ (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm),
+ asm, []>;
+
+ def : Pat<(i32 (G_OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+ def : Pat<(f32 (OpNode FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+ }
}
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
@@ -9573,12 +9585,12 @@ multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
}
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, SDPatternOperator G_OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$Rd),
- (OpNode (v4i16 V64:$Rn),
+ (G_OpNode (v4i16 V64:$Rn),
(dup_v8i16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx)))]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -9591,7 +9603,7 @@ multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$Rd),
- (OpNode (v8i16 V128:$Rn),
+ (G_OpNode (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
bits<3> idx;
let Inst{11} = idx{2};
@@ -9604,7 +9616,7 @@ multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$Rd),
- (OpNode (v2i32 V64:$Rn),
+ (G_OpNode (v2i32 V64:$Rn),
(dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx)))]> {
bits<2> idx;
let Inst{11} = idx{1};
@@ -9616,7 +9628,7 @@ multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$Rd),
- (OpNode (v4i32 V128:$Rn),
+ (G_OpNode (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
@@ -9636,13 +9648,19 @@ multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$Rd),
- (OpNode FPR32Op:$Rn,
+ (G_OpNode FPR32Op:$Rn,
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
+
+ def : Pat<(f32 (OpNode (f32 FPR32Op:$Rn),
+ (bitconvert (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))),
+ (!cast<Instruction>(NAME # v1i32_indexed) FPR32Op:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
}
multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
@@ -11643,12 +11661,12 @@ multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
}
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
- SDPatternOperator op> {
+ SDPatternOperator op, SDPatternOperator G_op> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
- (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (G_op (v4i16 V64:$Rd), (v4i16 V64:$Rn),
(dup_v8i16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx))))]> {
bits<3> idx;
@@ -11661,7 +11679,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
- (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (G_op (v8i16 V128:$Rd), (v8i16 V128:$Rn),
(v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
VectorIndexH:$idx)))))]> {
bits<3> idx;
@@ -11674,7 +11692,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
- (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (G_op (v2i32 V64:$Rd), (v2i32 V64:$Rn),
(dup_v4i32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
bits<2> idx;
let Inst{11} = idx{1};
@@ -11685,7 +11703,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
- (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (G_op (v4i32 V128:$Rd), (v4i32 V128:$Rn),
(v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx)))))]> {
bits<2> idx;
@@ -11707,13 +11725,21 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
- (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn),
+ (i32 (G_op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
+
+ def : Pat<(f32 (op (f32 FPR32Op:$Rd),
+ (bitconvert (i32 FPR32Op:$Rn)),
+ (bitconvert (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))),
+ (!cast<Instruction>(NAME # v1i32_indexed) FPR32Op:$Rd, FPR32Op:$Rn,
+ V128:$Rm, VectorIndexS:$idx)>;
+
}
} // let Predicates = [HasNeon, HasRDM]
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c22929f379dfc..71b9b5a8a3178 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1034,11 +1034,13 @@ def AArch64uqadd: SDNode<"AArch64ISD::UQADD", SDTFPBinOp>;
def AArch64uqrshl: SDNode<"AArch64ISD::UQRSHL", SDTFPBinOp>;
def AArch64uqshl: SDNode<"AArch64ISD::UQSHL", SDTFPBinOp>;
def AArch64uqsub: SDNode<"AArch64ISD::UQSUB", SDTFPBinOp>;
-def AArch64sqdmull: SDNode<"AArch64ISD::SQDMULL",
- SDTypeProfile<1, 2, [ SDTCisSameAs<1, 2>,
+def AArch64sqdmull: SDNode<"AArch64ISD::SQDMULL",
+ SDTypeProfile<1, 2, [ SDTCisSameAs<1, 2>,
SDTCisFP<0>, SDTCisFP<1>]>>;
-//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
+def AArch64sqrdmulh: SDNode<"AArch64ISD::SQRDMULH", SDTFPBinOp>;
+def AArch64sqrdmlah: SDNode<"AArch64ISD::SQRDMLAH", SDTFPTernaryOp>;
+def AArch64sqrdmlsh: SDNode<"AArch64ISD::SQRDMLSH", SDTFPTernaryOp>;
// Vector immediate ops
def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
@@ -1084,7 +1086,8 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull,
[SDNPCommutative]>;
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull,
[SDNPCommutative]>;
-def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>;
+def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH",
+ SDTypeProfile<1, 2, [ SDTCisSameAs<1, 2>]>>;
// Reciprocal estimates and steps.
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
@@ -6435,8 +6438,8 @@ defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx,
defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", AArch64sqadd, int_aarch64_neon_sqadd, saddsat>;
-defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
-defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", AArch64sqdmulh, int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", AArch64sqrdmulh, int_aarch64_neon_sqrdmulh>;
defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", AArch64sqrshl, int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", AArch64sqshl, int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", AArch64sqsub, int_aarch64_neon_sqsub, ssubsat>;
@@ -6450,14 +6453,8 @@ defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", AArch64uqsub, int_aarch
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
let Predicates = [HasRDM] in {
- defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
- defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
- def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))),
- (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))),
- (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+ defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah", AArch64sqrdmlah, int_aarch64_neon_sqrdmlah>;
+ defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh", AArch64sqrdmlsh, int_aarch64_neon_sqrdmlsh>;
}
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
@@ -8804,8 +8801,8 @@ def : Pat<(v2f64 (any_fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
(i64 0))>;
-defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
-defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", AArch64sqdmulh, int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", AArch64sqrdmulh, int_aarch64_neon_sqrdmulh>;
defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
int_aarch64_neon_sqdmulh_laneq>;
@@ -8827,9 +8824,9 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
AArch64sqsub, int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
- int_aarch64_neon_sqrdmlah>;
+ AArch64sqrdmlah, int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
- int_aarch64_neon_sqrdmlsh>;
+ AArch64sqrdmlsh, int_aarch64_neon_sqrdmlsh>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-int-neon.ll b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
index e8ae8a3e53c9b..24d68b3b694dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
@@ -228,3 +228,57 @@ define i64 @test_sqdmulls_scalar(float %A){
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %cvt, i32 %cvt)
ret i64 %prod
}
+
+define i32 @test_sqdmulh_scalar(float noundef %a) {
+; CHECK-LABEL: test_sqdmulh_scalar:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqdmulh s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i32 @test_sqrdmulh_scalar(float noundef %a) {
+; CHECK-LABEL: test_sqrdmulh_scalar:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqrdmulh s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i32 @test_sqrdmlah_scalar(float noundef %a) #0 {
+; CHECK-LABEL: test_sqrdmlah_scalar:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqrdmlah s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %cvt, i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i32 @test_sqrdmlsh_scalar(float noundef %a) #0 {
+; CHECK-LABEL: test_sqrdmlsh_scalar:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqrdmlsh s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %cvt, i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+attributes #0 = { "target-features"="+rdm" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index cb14adc00df00..b59a8b18e7d72 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -463,30 +463,50 @@ define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
}
define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
-; CHECK-LABEL: test_sqrdmlah_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w1
-; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: sqrdmulh s0, s0, s1
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: sqadd s0, s1, s0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sqrdmlah_i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w1
+; CHECK-SD-NEXT: sqrdmulh s0, s1, s0
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: sqadd s0, s1, s0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sqrdmlah_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w1
+; CHECK-GI-NEXT: fmov s1, w2
+; CHECK-GI-NEXT: sqrdmulh s0, s0, s1
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: sqadd s0, s1, s0
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
ret i32 %retval
}
define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
-; CHECK-LABEL: test_sqrdmlsh_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w1
-; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: sqrdmulh s0, s0, s1
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: sqsub s0, s1, s0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sqrdmlsh_i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w1
+; CHECK-SD-NEXT: sqrdmulh s0, s1, s0
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: sqsub s0, s1, s0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sqrdmlsh_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w1
+; CHECK-GI-NEXT: fmov s1, w2
+; CHECK-GI-NEXT: sqrdmulh s0, s0, s1
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: sqsub s0, s1, s0
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
ret i32 %retval
@@ -631,14 +651,23 @@ entry:
}
define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: test_vqrdmlahs_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqrdmlah s0, s1, s2
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vqrdmlahs_s32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w1
+; CHECK-SD-NEXT: fmov s2, w0
+; CHECK-SD-NEXT: sqrdmlah s2, s1, s0
+; CHECK-SD-NEXT: fmov w0, s2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vqrdmlahs_s32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: sqrdmlah s0, s1, s2
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4
ret i32 %vqrdmlahs_s32.i
@@ -781,14 +810,23 @@ entry:
}
define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: test_vqrdmlshs_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqrdmlsh s0, s1, s2
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vqrdmlshs_s32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w1
+; CHECK-SD-NEXT: fmov s2, w0
+; CHECK-SD-NEXT: sqrdmlsh s2, s1, s0
+; CHECK-SD-NEXT: fmov w0, s2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vqrdmlshs_s32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: sqrdmlsh s0, s1, s2
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4
ret i32 %vqrdmlshs_s32.i
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 712452c70aab1..4a3687d3a9fd4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -304,10 +304,8 @@ define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind {
define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqdmulh_1s:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: ldr w9, [x1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ldr s1, [x1]
; CHECK-NEXT: sqdmulh s0, s0, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -376,23 +374,13 @@ define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind {
}
define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind {
-; CHECK-SD-LABEL: sqrdmulh_1s:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldr w8, [x0]
-; CHECK-SD-NEXT: ldr w9, [x1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: sqrdmulh s0, s0, s1
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: sqrdmulh_1s:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: ldr s1, [x1]
-; CHECK-GI-NEXT: sqrdmulh s0, s0, s1
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: sqrdmulh_1s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ldr s1, [x1]
+; CHECK-NEXT: sqrdmulh s0, s0, s1
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%tmp1 = load i32, ptr %A
%tmp2 = load i32, ptr %B
%tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
More information about the llvm-commits
mailing list