[llvm] [AArch64] Add bitcasts for lowering saturating add/sub and shift intrinsics. (PR #161840)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 6 06:56:09 PST 2025
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/161840
>From a29ca01c6f0b0250fd7ea136732b653f3c61040a Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 3 Oct 2025 12:48:31 +0000
Subject: [PATCH 1/3] [AArch64][GlobalISel] Add explicit bitcasts when lowering
saturating add/sub and shift intrinsics.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 92 ++++++-
.../lib/Target/AArch64/AArch64InstrFormats.td | 11 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 43 +++-
llvm/test/CodeGen/AArch64/arm64-int-neon.ll | 225 ++++++++++++++++++
llvm/test/CodeGen/AArch64/arm64-vmul.ll | 52 ++--
llvm/test/CodeGen/AArch64/arm64-vshift.ll | 34 +--
6 files changed, 380 insertions(+), 77 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/arm64-int-neon.ll
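The scalar forms of these saturating instructions only exist on SIMD/FP registers, so the new lowering rewrites the i32/i64 intrinsic into an fp-typed AArch64ISD node: the integer operands are bitcast to f32/f64, the node is built on the float type, and the result is bitcast back to the original integer type. That keeps values already produced in FPRs (e.g. by fcvtzs) in SIMD registers instead of bouncing through GPRs via fmov. A minimal IR sketch in the style of the new arm64-int-neon.ll test (the function name is illustrative, not taken from the patch):

declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)

define i32 @sqadd_from_fcvt(float %a, float %b) {
  ; both conversions leave their results in s-registers
  %x = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
  %y = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %b)
  ; with the bitcast lowering this selects directly to "sqadd s0, s0, s1",
  ; with no fmov from a GPR in between
  %r = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %x, i32 %y)
  ret i32 %r
}
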
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9078675da0e95..721aea2a4c8d3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4488,6 +4488,25 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
return DAG.getMergeValues({Sum, OutFlag}, DL);
}
+static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT OrigVT = Op.getValueType();
+ assert((OrigVT == MVT::i32 || OrigVT == MVT::i64) &&
+ "lowerIntNeonIntrinsic expects 32/64-bit scalar operation.");
+
+ EVT NodeVT = (OrigVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+ SmallVector<SDValue, 2> NewOps;
+ NewOps.reserve(Op.getNumOperands() - 1);
+
+ for (unsigned I = 1, E = Op.getNumOperands(); I < E; ++I)
+ NewOps.push_back(DAG.getBitcast(NodeVT, Op.getOperand(I)));
+
+ SDValue OpNode = DAG.getNode(Opcode, DL, NodeVT, NewOps);
+ return DAG.getBitcast(OrigVT, OpNode);
+}
+
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
@@ -6359,26 +6378,45 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1).getValueType(),
Op.getOperand(1), Op.getOperand(2)));
return SDValue();
+ case Intrinsic::aarch64_neon_sqrshl:
+ if (Op.getValueType().isVector())
+ return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
+ case Intrinsic::aarch64_neon_sqshl:
+ if (Op.getValueType().isVector())
+ return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
+ case Intrinsic::aarch64_neon_uqrshl:
+ if (Op.getValueType().isVector())
+ return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
+ case Intrinsic::aarch64_neon_uqshl:
+ if (Op.getValueType().isVector())
+ return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
case Intrinsic::aarch64_neon_sqadd:
if (Op.getValueType().isVector())
return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
- return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
+
case Intrinsic::aarch64_neon_sqsub:
if (Op.getValueType().isVector())
return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
- return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
+
case Intrinsic::aarch64_neon_uqadd:
if (Op.getValueType().isVector())
return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
- return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
case Intrinsic::aarch64_neon_uqsub:
if (Op.getValueType().isVector())
return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
- return SDValue();
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
+
case Intrinsic::aarch64_sve_whilelt:
return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
/*IsEqual=*/false);
@@ -6713,6 +6751,52 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::experimental_vector_match: {
return LowerVectorMatch(Op, DAG);
}
+ // case Intrinsic::aarch64_neon_fcvtas:
+ // case Intrinsic::aarch64_neon_fcvtau:
+ // case Intrinsic::aarch64_neon_fcvtms:
+ // case Intrinsic::aarch64_neon_fcvtmu:
+ // case Intrinsic::aarch64_neon_fcvtns:
+ // case Intrinsic::aarch64_neon_fcvtnu:
+ // case Intrinsic::aarch64_neon_fcvtps:
+ // case Intrinsic::aarch64_neon_fcvtpu:
+ // case Intrinsic::aarch64_neon_fcvtzs:
+ // case Intrinsic::aarch64_neon_fcvtzu:
+ // case Intrinsic::aarch64_neon_sqabs:
+ // case Intrinsic::aarch64_neon_sqneg:
+ // case Intrinsic::aarch64_neon_scalar_sqxtn:
+ // case Intrinsic::aarch64_neon_scalar_sqxtun:
+ // case Intrinsic::aarch64_neon_scalar_uqxtn:
+ // case Intrinsic::aarch64_neon_sqadd:
+ // case Intrinsic::aarch64_neon_sqdmulh:
+ // case Intrinsic::aarch64_neon_sqrdmulh:
+ // case Intrinsic::aarch64_neon_sqrshl:
+ // case Intrinsic::aarch64_neon_sqshl:
+ // case Intrinsic::aarch64_neon_sqshlu:
+ // case Intrinsic::aarch64_neon_sqsub:
+ // case Intrinsic::aarch64_neon_srshl:
+ // case Intrinsic::aarch64_neon_sshl:
+ // case Intrinsic::aarch64_neon_suqadd:
+ // case Intrinsic::aarch64_neon_uqadd:
+ // case Intrinsic::aarch64_neon_uqrshl:
+ // case Intrinsic::aarch64_neon_uqshl:
+ // case Intrinsic::aarch64_neon_uqsub:
+ // case Intrinsic::aarch64_neon_urshl:
+ // case Intrinsic::aarch64_neon_ushl:
+ // case Intrinsic::aarch64_neon_usqadd:
+ // case Intrinsic::aarch64_neon_rshrn:
+ // case Intrinsic::aarch64_neon_sqrshrn:
+ // case Intrinsic::aarch64_neon_sqrshrun:
+ // case Intrinsic::aarch64_neon_sqshrn:
+ // case Intrinsic::aarch64_neon_sqshrun:
+ // case Intrinsic::aarch64_neon_uqrshrn:
+ // case Intrinsic::aarch64_neon_uqshrn:
+ // case Intrinsic::aarch64_neon_sqdmulh_lane:
+ // case Intrinsic::aarch64_neon_sqdmulh_laneq:
+ // case Intrinsic::aarch64_neon_sqrdmulh_lane:
+ // case Intrinsic::aarch64_neon_sqrdmulh_laneq:
+ // case Intrinsic::aarch64_neon_sqrdmlah:
+ // case Intrinsic::aarch64_neon_sqrdmlsh:
+ // case Intrinsic::aarch64_neon_abs:{
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f07d3514d1a99..28314d3aa7fac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7703,16 +7703,21 @@ multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
}
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
- SDPatternOperator OpNode, SDPatternOperator SatOp> {
+ SDPatternOperator OpNode, SDPatternOperator G_OpNode, SDPatternOperator SatOp> {
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
[(set (v1i64 FPR64:$Rd), (SatOp (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
- def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+ def : Pat<(i64 (G_OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
- def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ def : Pat<(i32 (G_OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+ (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+
+ def : Pat<(f64 (OpNode FPR64:$Rn, FPR64:$Rm)),
+ (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+ def : Pat<(f32 (OpNode FPR32:$Rn, FPR32:$Rm)),
(!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f788c7510f80c..3cc75ff43f7a3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1000,6 +1000,25 @@ def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
+def AArch64sqadd_node: SDNode<"AArch64ISD::SQADD", SDTFPBinOp>;
+def AArch64sqrshl: SDNode<"AArch64ISD::SQRSHL", SDTFPBinOp>;
+def AArch64sqshl: SDNode<"AArch64ISD::SQSHL", SDTFPBinOp>;
+def AArch64sqsub_node: SDNode<"AArch64ISD::SQSUB", SDTFPBinOp>;
+def AArch64uqadd: SDNode<"AArch64ISD::UQADD", SDTFPBinOp>;
+def AArch64uqrshl: SDNode<"AArch64ISD::UQRSHL", SDTFPBinOp>;
+def AArch64uqshl: SDNode<"AArch64ISD::UQSHL", SDTFPBinOp>;
+def AArch64uqsub: SDNode<"AArch64ISD::UQSUB", SDTFPBinOp>;
+
+// These PatFrags are a temporary hack to get around pattern-matching issues with intrinsics that have not yet been updated.
+def AArch64sqadd: PatFrags<(ops node:$lhs, node:$rhs),
+ [(bitconvert (AArch64sqadd_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
+ (bitconvert (AArch64sqadd_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
+ (int_aarch64_neon_sqadd node:$lhs, node:$rhs)]>;
+def AArch64sqsub: PatFrags<(ops node:$lhs, node:$rhs),
+ [(bitconvert (AArch64sqsub_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
+ (bitconvert (AArch64sqsub_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
+ (int_aarch64_neon_sqsub node:$lhs, node:$rhs)]>;
+
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
// Vector immediate ops
@@ -6453,19 +6472,19 @@ defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
-defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd, saddsat>;
+defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", AArch64sqadd_node, int_aarch64_neon_sqadd, saddsat>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
-defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
-defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
-defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub, ssubsat>;
+defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", AArch64sqrshl, int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
+defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", AArch64sqshl, int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
+defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", AArch64sqsub_node, int_aarch64_neon_sqsub, ssubsat>;
defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
-defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd, uaddsat>;
-defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl", int_aarch64_neon_uqrshl, int_aarch64_neon_uqrshl>;
-defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl, int_aarch64_neon_uqshl>;
-defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub, usubsat>;
+defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", AArch64uqadd, int_aarch64_neon_uqadd, uaddsat>;
+defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl", AArch64uqrshl, int_aarch64_neon_uqrshl, int_aarch64_neon_uqrshl>;
+defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", AArch64uqshl, int_aarch64_neon_uqshl, int_aarch64_neon_uqshl>;
+defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", AArch64uqsub, int_aarch64_neon_uqsub, usubsat>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
let Predicates = [HasRDM] in {
@@ -6520,11 +6539,11 @@ defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
-def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+def : Pat<(i64 (AArch64sqadd (i64 FPR64:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+def : Pat<(i64 (AArch64sqsub (i64 FPR64:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
(i32 FPR32:$Rm))))),
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
@@ -8545,9 +8564,9 @@ defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
- int_aarch64_neon_sqadd>;
+ AArch64sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
- int_aarch64_neon_sqsub>;
+ AArch64sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
diff --git a/llvm/test/CodeGen/AArch64/arm64-int-neon.ll b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
new file mode 100644
index 0000000000000..819c00cdd6815
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK
+
+
+; CHECK-GI: warning: Instruction selection used fallback path for test_sqrshl_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqrshl_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqshl_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqshl_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqrshl_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqrshl_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqshl_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqshl_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqadd_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqadd_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqsub_s32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqsub_s64
+
+define i32 @test_sqrshl_s32(float noundef %a){
+; CHECK-LABEL: test_sqrshl_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqrshl s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_sqrshl_s64(float noundef %a){
+; CHECK-LABEL: test_sqrshl_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: sqrshl d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_sqshl_s32(float noundef %a) {
+; CHECK-LABEL: test_sqshl_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqshl s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.sqshl.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_sqshl_s64(float noundef %a) {
+; CHECK-LABEL: test_sqshl_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: sqshl d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_uqrshl_s32(float noundef %a) {
+; CHECK-LABEL: test_uqrshl_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: uqrshl s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_uqrshl_s64(float noundef %a) {
+; CHECK-LABEL: test_uqrshl_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: uqrshl d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_uqshl_s32(float noundef %a) {
+; CHECK-LABEL: test_uqshl_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: uqshl s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.uqshl.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_uqshl_s64(float noundef %a) {
+; CHECK-LABEL: test_uqshl_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: uqshl d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_sqadd_s32(float noundef %a) {
+; CHECK-LABEL: test_sqadd_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqadd s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_sqadd_s64(float noundef %a) {
+; CHECK-LABEL: test_sqadd_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: sqadd d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_sqsub_s32(float noundef %a) {
+; CHECK-LABEL: test_sqsub_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqsub s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_sqsub_s64(float noundef %a) {
+; CHECK-LABEL: test_sqsub_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: sqsub d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_uqadd_s32(float noundef %a) {
+; CHECK-LABEL: test_uqadd_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: uqadd s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_uqadd_s64(float noundef %a) {
+; CHECK-LABEL: test_uqadd_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: uqadd d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
+
+define i32 @test_uqsub_s32(float noundef %a) {
+; CHECK-LABEL: test_uqsub_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: uqsub s0, s0, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
+ %res = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %cvt, i32 %cvt)
+ ret i32 %res
+}
+
+define i64 @test_uqsub_s64(float noundef %a) {
+; CHECK-LABEL: test_uqsub_s64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: uqsub d0, d0, d0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %cvt = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %a)
+ %res = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %cvt, i64 %cvt)
+ ret i64 %res
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index e6df9f2fb2c56..fed7439bf95fb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1766,24 +1766,14 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
-; CHECK-SD-LABEL: sqadd_lane1_sqdmull4s:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: sqadd s0, s0, s1
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: sqadd_lane1_sqdmull4s:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: sqadd s0, s1, s0
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: sqadd_lane1_sqdmull4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
%prod = extractelement <4 x i32> %prod.vec, i32 1
%res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
@@ -1791,24 +1781,14 @@ define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
}
define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
-; CHECK-SD-LABEL: sqsub_lane1_sqdmull4s:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: sqsub s0, s0, s1
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: sqsub_lane1_sqdmull4s:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT: fmov s1, w0
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: sqsub s0, s1, s0
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: sqsub_lane1_sqdmull4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
%prod = extractelement <4 x i32> %prod.vec, i32 1
%res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 8ec5434085d6a..d27e2e69f8605 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -168,10 +168,8 @@ define <1 x i64> @sqshl1d_constant(ptr %A) nounwind {
define i64 @sqshl_scalar(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqshl_scalar:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -363,10 +361,8 @@ define <1 x i64> @uqshl1d_constant(ptr %A) nounwind {
define i64 @uqshl_scalar(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uqshl_scalar:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uqshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -888,10 +884,8 @@ define <1 x i64> @sqrshl1d_constant(ptr %A) nounwind {
define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sqrshl_scalar:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqrshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -904,10 +898,9 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind {
define i64 @sqrshl_scalar_constant(ptr %A) nounwind {
; CHECK-LABEL: sqrshl_scalar_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x9, [x0]
-; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov x8, #1 // =0x1
+; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: sqrshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -997,10 +990,8 @@ define <1 x i64> @uqrshl1d_constant(ptr %A) nounwind {
define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uqrshl_scalar:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: fmov d1, x9
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: uqrshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -1013,10 +1004,9 @@ define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind {
define i64 @uqrshl_scalar_constant(ptr %A) nounwind {
; CHECK-LABEL: uqrshl_scalar_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x9, [x0]
-; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov x8, #1 // =0x1
+; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: fmov d1, x8
-; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: uqrshl d0, d0, d1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
>From d5e26cead5a1567cf7bcff204aa898a3b65d8e2d Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 6 Nov 2025 14:00:26 +0000
Subject: [PATCH 2/3] Clean up the patch and add sqdmulls_scalar lowering
---
.../include/llvm/Target/TargetSelectionDAG.td | 3 +
.../Target/AArch64/AArch64ISelLowering.cpp | 65 ++-------
.../lib/Target/AArch64/AArch64InstrFormats.td | 56 +++++++-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 45 +++---
llvm/test/CodeGen/AArch64/arm64-int-neon.ll | 13 ++
llvm/test/CodeGen/AArch64/arm64-vmul.ll | 132 +++++++++++-------
6 files changed, 180 insertions(+), 134 deletions(-)
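This revision routes aarch64_neon_sqdmulls_scalar through the same bitcast helper, adding an AArch64ISD::SQDMULL node whose SDTFPMulOp profile takes two identical FP operands and produces an FP result (in practice f64 from two f32s). With SQADD/SQSUB and SQDMULL all expressed as fp-typed nodes, the fused sqdmlal/sqdmlsl selection patterns can match on those nodes rather than the raw intrinsics. A sketch of the fused case that still has to fold into a single sqdmlal (essentially the existing sqdmlal_d test in arm64-vmul.ll):

declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)

define i64 @sqdmlal_fused(i32 %a, i32 %b, i64 %acc) {
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b)
  ; must keep selecting to a fused sqdmlal rather than separate sqdmull + sqadd
  %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %acc, i64 %prod)
  ret i64 %res
}
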
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 07a858fd682fc..72edff4e92217 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -346,6 +346,9 @@ def SDTAtomicStore : SDTypeProfile<0, 2, [
def SDTAtomicLoad : SDTypeProfile<1, 1, [
SDTCisPtrTy<1>
]>;
+def SDTFPMulOp : SDTypeProfile<1, 2, [
+ SDTCisSameAs<1, 2>, SDTCisFP<0>, SDTCisFP<1>
+]>;
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
SDTypeProfile<0, 2, constraints>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b03efba57ac03..affff377cd92c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4489,19 +4489,17 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
SelectionDAG &DAG) {
SDLoc DL(Op);
- EVT OrigVT = Op.getValueType();
- assert((OrigVT == MVT::i32 || OrigVT == MVT::i64) &&
- "lowerIntNeonIntrinsic expects 32/64-bit scalar operation.");
-
- EVT NodeVT = (OrigVT == MVT::i32) ? MVT::f32 : MVT::f64;
-
+ auto getFloatVT = [](EVT VT) { return VT == MVT::i32 ? MVT::f32 : MVT::f64; };
+ auto bitcastToFloat = [&](SDValue Val) {
+ return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
+ };
SmallVector<SDValue, 2> NewOps;
NewOps.reserve(Op.getNumOperands() - 1);
for (unsigned I = 1, E = Op.getNumOperands(); I < E; ++I)
- NewOps.push_back(DAG.getBitcast(NodeVT, Op.getOperand(I)));
-
- SDValue OpNode = DAG.getNode(Opcode, DL, NodeVT, NewOps);
+ NewOps.push_back(bitcastToFloat(Op.getOperand(I)));
+ EVT OrigVT = Op.getValueType();
+ SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
return DAG.getBitcast(OrigVT, OpNode);
}
@@ -6385,7 +6383,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2));
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
-
+ case Intrinsic::aarch64_neon_sqdmulls_scalar:
+ return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
case Intrinsic::aarch64_sve_whilelt:
return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
/*IsEqual=*/false);
@@ -6719,52 +6718,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::experimental_vector_match: {
return LowerVectorMatch(Op, DAG);
}
- // case Intrinsic::aarch64_neon_fcvtas:
- // case Intrinsic::aarch64_neon_fcvtau:
- // case Intrinsic::aarch64_neon_fcvtms:
- // case Intrinsic::aarch64_neon_fcvtmu:
- // case Intrinsic::aarch64_neon_fcvtns:
- // case Intrinsic::aarch64_neon_fcvtnu:
- // case Intrinsic::aarch64_neon_fcvtps:
- // case Intrinsic::aarch64_neon_fcvtpu:
- // case Intrinsic::aarch64_neon_fcvtzs:
- // case Intrinsic::aarch64_neon_fcvtzu:
- // case Intrinsic::aarch64_neon_sqabs:
- // case Intrinsic::aarch64_neon_sqneg:
- // case Intrinsic::aarch64_neon_scalar_sqxtn:
- // case Intrinsic::aarch64_neon_scalar_sqxtun:
- // case Intrinsic::aarch64_neon_scalar_uqxtn:
- // case Intrinsic::aarch64_neon_sqadd:
- // case Intrinsic::aarch64_neon_sqdmulh:
- // case Intrinsic::aarch64_neon_sqrdmulh:
- // case Intrinsic::aarch64_neon_sqrshl:
- // case Intrinsic::aarch64_neon_sqshl:
- // case Intrinsic::aarch64_neon_sqshlu:
- // case Intrinsic::aarch64_neon_sqsub:
- // case Intrinsic::aarch64_neon_srshl:
- // case Intrinsic::aarch64_neon_sshl:
- // case Intrinsic::aarch64_neon_suqadd:
- // case Intrinsic::aarch64_neon_uqadd:
- // case Intrinsic::aarch64_neon_uqrshl:
- // case Intrinsic::aarch64_neon_uqshl:
- // case Intrinsic::aarch64_neon_uqsub:
- // case Intrinsic::aarch64_neon_urshl:
- // case Intrinsic::aarch64_neon_ushl:
- // case Intrinsic::aarch64_neon_usqadd:
- // case Intrinsic::aarch64_neon_rshrn:
- // case Intrinsic::aarch64_neon_sqrshrn:
- // case Intrinsic::aarch64_neon_sqrshrun:
- // case Intrinsic::aarch64_neon_sqshrn:
- // case Intrinsic::aarch64_neon_sqshrun:
- // case Intrinsic::aarch64_neon_uqrshrn:
- // case Intrinsic::aarch64_neon_uqshrn:
- // case Intrinsic::aarch64_neon_sqdmulh_lane:
- // case Intrinsic::aarch64_neon_sqdmulh_laneq:
- // case Intrinsic::aarch64_neon_sqrdmulh_lane:
- // case Intrinsic::aarch64_neon_sqrdmulh_laneq:
- // case Intrinsic::aarch64_neon_sqrdmlah:
- // case Intrinsic::aarch64_neon_sqrdmlsh:
- // case Intrinsic::aarch64_neon_abs:{
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 3c699546bbfc3..01d8775bcd189 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7804,10 +7804,14 @@ class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
+
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator G_OpNode = null_frag,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$Rd),
@@ -7815,10 +7819,12 @@ multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$Rd),
(ins FPR32:$Rn, FPR32:$Rm), asm, "",
- [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+ [(set (i64 FPR64:$Rd), (G_OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+
+ def: Pat<(f64 (OpNode FPR32:$Rn, FPR32:$Rm)),
+ (!cast<Instruction>(NAME#"i32") FPR32:$Rn, FPR32:$Rm)>;
}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
@@ -9820,7 +9826,8 @@ multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
SDPatternOperator VecAcc,
- SDPatternOperator ScalAcc> {
+ SDPatternOperator ScalAcc,
+ SDPatternOperator G_ScalAcc> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V128, V64,
V128_lo, VectorIndexH,
@@ -9889,7 +9896,7 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
let Inst{20} = idx{0};
}
- def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+ def : Pat<(i32 (G_ScalAcc (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
@@ -9901,7 +9908,19 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
(i64 0))>;
- def : Pat<(i32 (ScalAcc (i32 FPR32Op:$Rd),
+ def : Pat<(f32 (ScalAcc FPR32Op:$Rd,
+ (bitconvert (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm))),
+ (i64 0)))))),
+ (!cast<Instruction>(NAME # v1i32_indexed)
+ FPR32Op:$Rd,
+ (f16 (EXTRACT_SUBREG V64:$Rn, hsub)),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
+ (i64 0))>;
+
+ def : Pat<(i32 (G_ScalAcc (i32 FPR32Op:$Rd),
(i32 (vector_extract
(v4i32 (int_aarch64_neon_sqdmull
(v4i16 V64:$Rn),
@@ -9914,11 +9933,24 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
V128_lo:$Rm,
VectorIndexH:$idx)>;
+ def : Pat<(f32 (ScalAcc FPR32Op:$Rd,
+ (bitconvert (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (dup_v8i16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx))),
+ (i64 0)))))),
+ (!cast<Instruction>(NAME # v1i32_indexed)
+ FPR32Op:$Rd,
+ (f16 (EXTRACT_SUBREG V64:$Rn, hsub)),
+ V128_lo:$Rm,
+ VectorIndexH:$idx)>;
+
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i64 FPR64Op:$dst),
- (ScalAcc (i64 FPR64Op:$Rd),
+ (G_ScalAcc (i64 FPR64Op:$Rd),
(i64 (int_aarch64_neon_sqdmulls_scalar
(i32 FPR32Op:$Rn),
(i32 (vector_extract (v4i32 V128:$Rm),
@@ -9928,6 +9960,16 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
+
+ def : Pat<(f64 (ScalAcc FPR64Op:$Rd,
+ (AArch64sqdmull FPR32Op:$Rn,
+ (bitconvert (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))),
+ (!cast<Instruction>(NAME # v1i64_indexed)
+ FPR64Op:$Rd,
+ FPR32Op:$Rn,
+ V128:$Rm,
+ VectorIndexS:$idx)>;
}
multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 38e97bc1ade6c..16f181e6a9bdf 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1012,24 +1012,15 @@ def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
-def AArch64sqadd_node: SDNode<"AArch64ISD::SQADD", SDTFPBinOp>;
+def AArch64sqadd: SDNode<"AArch64ISD::SQADD", SDTFPBinOp>;
def AArch64sqrshl: SDNode<"AArch64ISD::SQRSHL", SDTFPBinOp>;
def AArch64sqshl: SDNode<"AArch64ISD::SQSHL", SDTFPBinOp>;
-def AArch64sqsub_node: SDNode<"AArch64ISD::SQSUB", SDTFPBinOp>;
+def AArch64sqsub: SDNode<"AArch64ISD::SQSUB", SDTFPBinOp>;
def AArch64uqadd: SDNode<"AArch64ISD::UQADD", SDTFPBinOp>;
def AArch64uqrshl: SDNode<"AArch64ISD::UQRSHL", SDTFPBinOp>;
def AArch64uqshl: SDNode<"AArch64ISD::UQSHL", SDTFPBinOp>;
def AArch64uqsub: SDNode<"AArch64ISD::UQSUB", SDTFPBinOp>;
-
-// These PatFrags are a temporary hack to get around pattern-matching issues with intrinsics that have not yet been updated.
-def AArch64sqadd: PatFrags<(ops node:$lhs, node:$rhs),
- [(bitconvert (AArch64sqadd_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
- (bitconvert (AArch64sqadd_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
- (int_aarch64_neon_sqadd node:$lhs, node:$rhs)]>;
-def AArch64sqsub: PatFrags<(ops node:$lhs, node:$rhs),
- [(bitconvert (AArch64sqsub_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
- (bitconvert (AArch64sqsub_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
- (int_aarch64_neon_sqsub node:$lhs, node:$rhs)]>;
+def AArch64sqdmull: SDNode<"AArch64ISD::SQDMULL", SDTFPMulOp>;
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
@@ -6433,12 +6424,12 @@ defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
-defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", AArch64sqadd_node, int_aarch64_neon_sqadd, saddsat>;
+defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", AArch64sqadd, int_aarch64_neon_sqadd, saddsat>;
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", AArch64sqrshl, int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", AArch64sqshl, int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
-defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", AArch64sqsub_node, int_aarch64_neon_sqsub, ssubsat>;
+defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", AArch64sqsub, int_aarch64_neon_sqsub, ssubsat>;
defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
@@ -6496,17 +6487,25 @@ def : InstAlias<"faclt $dst, $src1, $src2",
// Advanced SIMD three scalar instructions (mixed operands).
//===----------------------------------------------------------------------===//
defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
- int_aarch64_neon_sqdmulls_scalar>;
+ int_aarch64_neon_sqdmulls_scalar,
+ AArch64sqdmull>;
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
-def : Pat<(i64 (AArch64sqadd (i64 FPR64:$Rd),
- (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+ (int_aarch64_neon_sqdmulls_scalar FPR32:$Rn, FPR32:$Rm))),
+ (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+def : Pat<(f64 (AArch64sqadd FPR64:$Rd,
+ (AArch64sqdmull FPR32:$Rn, FPR32:$Rm))),
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-def : Pat<(i64 (AArch64sqsub (i64 FPR64:$Rd),
- (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+ (int_aarch64_neon_sqdmulls_scalar FPR32:$Rn, FPR32:$Rm))),
+ (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+
+def : Pat<(f64 (AArch64sqsub FPR64:$Rd,
+ (AArch64sqdmull FPR32:$Rn, FPR32:$Rm))),
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
//===----------------------------------------------------------------------===//
@@ -8719,9 +8718,9 @@ defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
- AArch64sqadd>;
+ AArch64sqadd, int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
- AArch64sqsub>;
+ AArch64sqsub, int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
diff --git a/llvm/test/CodeGen/AArch64/arm64-int-neon.ll b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
index 819c00cdd6815..2c7a850f280bc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-int-neon.ll
@@ -15,6 +15,7 @@
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqadd_s64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqsub_s32
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_uqsub_s64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqdmulls_scalar
define i32 @test_sqrshl_s32(float noundef %a){
; CHECK-LABEL: test_sqrshl_s32:
@@ -223,3 +224,15 @@ entry:
%res = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %cvt, i64 %cvt)
ret i64 %res
}
+
+define i64 @test_sqdmulls_scalar(float %A){
+; CHECK-LABEL: test_sqdmulls_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: sqdmull d0, s0, s0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %cvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A)
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %cvt, i32 %cvt)
+ ret i64 %prod
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 300429cfee86e..712452c70aab1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1721,14 +1721,23 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou
}
define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
-; CHECK-LABEL: sqdmlal_lane_1s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlal s2, h1, v0.h[1]
-; CHECK-NEXT: fmov w0, s2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqdmlal_lane_1s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: sqdmlal s1, h2, v0.h[1]
+; CHECK-SD-NEXT: fmov w0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqdmlal_lane_1s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: sqdmlal s2, h1, v0.h[1]
+; CHECK-GI-NEXT: fmov w0, s2
+; CHECK-GI-NEXT: ret
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
%rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
@@ -1739,14 +1748,23 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
-; CHECK-LABEL: sqdmlsl_lane_1s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlsl s2, h1, v0.h[1]
-; CHECK-NEXT: fmov w0, s2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqdmlsl_lane_1s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: sqdmlsl s1, h2, v0.h[1]
+; CHECK-SD-NEXT: fmov w0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqdmlsl_lane_1s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w0
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: sqdmlsl s2, h1, v0.h[1]
+; CHECK-GI-NEXT: fmov w0, s2
+; CHECK-GI-NEXT: ret
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
%rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
@@ -1789,11 +1807,11 @@ define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_1d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov d2, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlal d1, s2, v0.s[1]
-; CHECK-NEXT: fmov x0, d1
+; CHECK-NEXT: sqdmlal d2, s1, v0.s[1]
+; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%rhs = extractelement <2 x i32> %C, i32 1
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
@@ -1806,11 +1824,11 @@ declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_1d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: fmov s2, w1
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov d2, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlsl d1, s2, v0.s[1]
-; CHECK-NEXT: fmov x0, d1
+; CHECK-NEXT: sqdmlsl d2, s1, v0.s[1]
+; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%rhs = extractelement <2 x i32> %C, i32 1
%prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
@@ -3169,14 +3187,23 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
}
define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
-; CHECK-LABEL: sqdmlal_s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqdmlal s2, h0, v1.h[0]
-; CHECK-NEXT: fmov w0, s2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqdmlal_s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: sqdmlal s0, h1, v2.h[0]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqdmlal_s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: sqdmlal s2, h0, v1.h[0]
+; CHECK-GI-NEXT: fmov w0, s2
+; CHECK-GI-NEXT: ret
%tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
%tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
%tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -3188,11 +3215,11 @@ define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x2
+; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w1
-; CHECK-NEXT: sqdmlal d0, s1, s2
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fmov d2, x2
+; CHECK-NEXT: sqdmlal d2, s1, s0
+; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
%tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
@@ -3200,14 +3227,23 @@ define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
}
define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
-; CHECK-LABEL: sqdmlsl_s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqdmlsl s2, h0, v1.h[0]
-; CHECK-NEXT: fmov w0, s2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqdmlsl_s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w2
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: sqdmlsl s0, h1, v2.h[0]
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: sqdmlsl_s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fmov s1, w1
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: sqdmlsl s2, h0, v1.h[0]
+; CHECK-GI-NEXT: fmov w0, s2
+; CHECK-GI-NEXT: ret
%tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
%tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
%tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -3219,11 +3255,11 @@ define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d0, x2
+; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w1
-; CHECK-NEXT: sqdmlsl d0, s1, s2
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fmov d2, x2
+; CHECK-NEXT: sqdmlsl d2, s1, s0
+; CHECK-NEXT: fmov x0, d2
; CHECK-NEXT: ret
%tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
%tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
>From 91d4102c1b46b13d5ddc4312333d8dc0207bb571 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 6 Nov 2025 14:55:45 +0000
Subject: [PATCH 3/3] Remove unnecessary GlobalISel patterns for unsupported
nodes
---
.../lib/Target/AArch64/AArch64InstrFormats.td | 33 +++++--------------
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1 -
2 files changed, 8 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 01d8775bcd189..fdc9256ef87f7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7804,14 +7804,10 @@ class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
let Inst{10} = 0;
let Inst{9-5} = Rn;
let Inst{4-0} = Rd;
-
- let mayLoad = 0;
- let mayStore = 0;
- let hasSideEffects = 0;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
- SDPatternOperator G_OpNode = null_frag,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
(outs FPR32:$Rd),
@@ -7819,12 +7815,10 @@ multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc,
(outs FPR64:$Rd),
(ins FPR32:$Rn, FPR32:$Rm), asm, "",
- [(set (i64 FPR64:$Rd), (G_OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
-
- def: Pat<(f64 (OpNode FPR32:$Rn, FPR32:$Rm)),
- (!cast<Instruction>(NAME#"i32") FPR32:$Rn, FPR32:$Rm)>;
+ [(set (f64 FPR64:$Rd), (OpNode FPR32:$Rn, FPR32:$Rm))]>;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode = null_frag> {
def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc,
@@ -9949,27 +9943,16 @@ multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
- [(set (i64 FPR64Op:$dst),
- (G_ScalAcc (i64 FPR64Op:$Rd),
- (i64 (int_aarch64_neon_sqdmulls_scalar
- (i32 FPR32Op:$Rn),
- (i32 (vector_extract (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ [(set (f64 FPR64Op:$dst),
+ (ScalAcc FPR64Op:$Rd,
+ (AArch64sqdmull FPR32Op:$Rn,
+ (bitconvert (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx))))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
-
- def : Pat<(f64 (ScalAcc FPR64Op:$Rd,
- (AArch64sqdmull FPR32Op:$Rn,
- (bitconvert (i32 (vector_extract (v4i32 V128:$Rm),
- VectorIndexS:$idx)))))),
- (!cast<Instruction>(NAME # v1i64_indexed)
- FPR64Op:$Rd,
- FPR32Op:$Rn,
- V128:$Rm,
- VectorIndexS:$idx)>;
}
multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 16f181e6a9bdf..8b2b6b8fc711d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6487,7 +6487,6 @@ def : InstAlias<"faclt $dst, $src1, $src2",
// Advanced SIMD three scalar instructions (mixed operands).
//===----------------------------------------------------------------------===//
defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
- int_aarch64_neon_sqdmulls_scalar,
AArch64sqdmull>;
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;