[llvm] [AArch64][GlobalISel] Select UMULL instruction (PR #65469)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 7 02:19:22 PDT 2023
https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/65469:
>From aa0918f58dad401584b71c94d8e80aad948a338a Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 6 Sep 2023 11:04:20 +0100
Subject: [PATCH] [AArch64][GlobalISel] Select UMULL instruction
Global ISel now selects UMULL and UMULL2 instructions.
G_MUL instructions whose operands both come from G_ZEXT operations (or both
from G_SEXT operations) that double the element width are turned into
G_UMULL (or G_SMULL, respectively).
---
llvm/lib/Target/AArch64/AArch64InstrGISel.td | 15 +
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 75 +-
.../AArch64/GISel/AArch64LegalizerInfo.h | 3 +
.../GlobalISel/legalizer-info-validation.mir | 26 +-
llvm/test/CodeGen/AArch64/aarch64-smull.ll | 1543 +++++++++++++----
llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll | 291 +++-
6 files changed, 1512 insertions(+), 441 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index f9f860607b5877..02d3b68486b825 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -215,6 +215,18 @@ def G_PREFETCH : AArch64GenericInstruction {
let hasSideEffects = 1;
}
+// Generic multiply built by the legalizer from a G_MUL whose operands are both
+// zero-extended; mapped to the AArch64umull node through GINodeEquiv.
+def G_UMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+// Generic multiply built by the legalizer from a G_MUL whose operands are both
+// sign-extended; mapped to the AArch64smull node through GINodeEquiv.
+def G_SMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
// Generic bitwise insert if true.
def G_BIT : AArch64GenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -254,6 +266,9 @@ def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;
def : GINodeEquiv<G_BIT, AArch64bit>;
+def : GINodeEquiv<G_UMULL, AArch64umull>;
+def : GINodeEquiv<G_SMULL, AArch64smull>;
+
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e2df8fb1321df8..3c43771088d7bc 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -119,13 +119,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampScalar(0, s32, s64);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
- .scalarizeIf(
- [=](const LegalityQuery &Query) {
- return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
- },
- 0)
- .legalFor({v2s64})
+ .customIf([=](const LegalityQuery &Query) {
+ return Query.Opcode == G_MUL &&
+ (Query.Types[0] == v4s32 || Query.Types[0] == v8s16 ||
+ Query.Types[0] == v2s64);
+ })
+ .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, s64)
.clampMaxNumElements(0, s8, 16)
@@ -1023,11 +1022,73 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFCopySign(MI, Helper);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, Helper);
+ case TargetOpcode::G_MUL:
+ return legalizeMULL(MI, MRI, MIRBuilder, Helper);
}
llvm_unreachable("expected switch to return");
}
+/// Custom legalization of a vector G_MUL.
+///
+/// If both multiplicands are defined (ignoring copies) by G_ZEXT, or both by
+/// G_SEXT, and each extend exactly doubles the scalar width, the multiply is
+/// replaced by a G_UMULL / G_SMULL of the un-extended sources. Otherwise a
+/// v2s64 multiply is split into single-element pieces, unless its operands
+/// come from G_UNMERGE_VALUES of matching extends, in which case it is left
+/// untouched. Any other multiply is left as it is.
+///
+/// \param MI         the G_MUL being legalized.
+/// \param MRI        register info used to look up types and definitions.
+/// \param MIRBuilder builder used to emit the replacement instruction.
+/// \param Helper     legalizer helper used to split the v2s64 case.
+/// \returns false only if splitting the v2s64 multiply failed, true otherwise.
+bool AArch64LegalizerInfo::legalizeMULL(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &MIRBuilder,
+                                        LegalizerHelper &Helper) const {
+  // Get the instructions that define the two source operands.
+  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+
+  unsigned I1Opcode = I1->getOpcode();
+  unsigned I2Opcode = I2->getOpcode();
+
+  // True if the extend's result scalar is exactly twice as wide as its source.
+  auto DoublesScalarSize = [&MRI](const MachineInstr &Ext) {
+    return MRI.getType(Ext.getOperand(0).getReg()).getScalarSizeInBits() ==
+           MRI.getType(Ext.getOperand(1).getReg()).getScalarSizeInBits() * 2;
+  };
+
+  // If both source operands were extended the same way, {S/U}MULL can be used.
+  const bool BothZExt =
+      I1Opcode == TargetOpcode::G_ZEXT && I2Opcode == TargetOpcode::G_ZEXT;
+  const bool BothSExt =
+      I1Opcode == TargetOpcode::G_SEXT && I2Opcode == TargetOpcode::G_SEXT;
+  if ((BothZExt || BothSExt) && DoublesScalarSize(*I1) &&
+      DoublesScalarSize(*I2)) {
+    // {S/U}MULL{2} of the sources of the extends.
+    MIRBuilder.buildInstr(
+        BothZExt ? AArch64::G_UMULL : AArch64::G_SMULL,
+        {MI.getOperand(0).getReg()},
+        {I1->getOperand(1).getReg(), I2->getOperand(1).getReg()});
+    // Only the multiply is erased. The extends must stay: they may have other
+    // users, may be reached through a COPY that still uses them, or may be one
+    // and the same instruction (mul x, x), so erasing them here could leave
+    // dangling uses or erase an instruction twice.
+    // NOTE(review): extends that become dead are expected to be removed by
+    // later dead-instruction cleanup - confirm.
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // When the destination type is v2s64, scalarize the instruction.
+  // Used to be handled in getActionDefinitionsBuilder.
+  if (DstTy.getNumElements() == 2 && DstTy.getScalarSizeInBits() == 64) {
+    // If the operands are G_{S/Z}EXT followed by G_UNMERGE_VALUES, DO NOT
+    // SCALARIZE. The extend instruction should be unmerged then merged,
+    // folding the current unmerge.
+    if (I1Opcode == TargetOpcode::G_UNMERGE_VALUES &&
+        I2Opcode == TargetOpcode::G_UNMERGE_VALUES) {
+      // The source of a G_UNMERGE_VALUES is its last operand.
+      const unsigned I1SrcIdx = I1->getNumOperands() - 1;
+      const unsigned I2SrcIdx = I2->getNumOperands() - 1;
+
+      I1 = getDefIgnoringCopies(I1->getOperand(I1SrcIdx).getReg(), MRI);
+      I2 = getDefIgnoringCopies(I2->getOperand(I2SrcIdx).getReg(), MRI);
+      I1Opcode = I1->getOpcode();
+      I2Opcode = I2->getOpcode();
+      if ((I1Opcode == TargetOpcode::G_ZEXT &&
+           I2Opcode == TargetOpcode::G_ZEXT) ||
+          (I1Opcode == TargetOpcode::G_SEXT &&
+           I2Opcode == TargetOpcode::G_SEXT))
+        return true;
+    }
+    // Report a failed split instead of claiming the instruction was legalized.
+    return Helper.fewerElementsVector(
+               MI, 0,
+               DstTy.changeElementCount(
+                   DstTy.getElementCount().divideCoefficientBy(2))) !=
+           LegalizerHelper::UnableToLegalize;
+  }
+  return true;
+}
+
bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 853d5a2305ac68..eb89966f0c14e3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -51,6 +51,9 @@ class AArch64LegalizerInfo : public LegalizerInfo {
LegalizerHelper &Helper) const;
bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
+ bool legalizeMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ LegalizerHelper &Helper) const;
bool legalizeFunnelShift(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index b38868a530264e..e4f1738a8843c2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -14,20 +14,20 @@
# DEBUG: G_ADD (opcode [[ADD_OPC:[0-9]+]]): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]]
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_MUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
-# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index
+# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#
@@ -57,18 +57,18 @@
#
# DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_OR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_XOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index c1470239995c99..32218e459aecb9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
@@ -48,14 +49,36 @@ define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: smull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr q2, [x1]
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: smull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr q2, [x1]
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: smull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i8>, ptr %A
%load.B = load <8 x i16>, ptr %B
%zext.A = zext <8 x i8> %load.A to <8 x i32>
@@ -65,14 +88,36 @@ define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
}
define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: smull2 v1.4s, v2.8h, v0.8h
-; CHECK-NEXT: smull v0.4s, v2.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x1]
+; CHECK-NEON-NEXT: ldr q2, [x0]
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: smull2 v1.4s, v2.8h, v0.8h
+; CHECK-NEON-NEXT: smull v0.4s, v2.4h, v0.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x1]
+; CHECK-SVE-NEXT: ldr q2, [x0]
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: smull2 v1.4s, v2.8h, v0.8h
+; CHECK-SVE-NEXT: smull v0.4s, v2.4h, v0.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x1]
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i16>, ptr %A
%load.B = load <8 x i8>, ptr %B
%sext.A = sext <8 x i16> %load.A to <8 x i32>
@@ -82,18 +127,46 @@ define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounw
}
define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: orr v0.8h, #128, lsl #8
-; CHECK-NEXT: sshll v3.4s, v1.4h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: mul v0.4s, v2.4s, v3.4s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr q1, [x1]
+; CHECK-NEON-NEXT: orr v0.8h, #128, lsl #8
+; CHECK-NEON-NEXT: sshll v3.4s, v1.4h, #0
+; CHECK-NEON-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-NEON-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-NEON-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEON-NEXT: mul v1.4s, v0.4s, v1.4s
+; CHECK-NEON-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr q1, [x1]
+; CHECK-SVE-NEXT: orr v0.8h, #128, lsl #8
+; CHECK-SVE-NEXT: sshll v3.4s, v1.4h, #0
+; CHECK-SVE-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-SVE-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-SVE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SVE-NEXT: mul v1.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI5_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i16>, ptr %A
%or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
%load.B = load <8 x i16>, ptr %B
@@ -146,6 +219,21 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x1]
+; CHECK-GI-NEXT: ldrh w8, [x0]
+; CHECK-GI-NEXT: ldrh w10, [x0, #2]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x9, x10, x9
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%load.A = load <2 x i16>, ptr %A
%load.B = load <2 x i32>, ptr %B
%zext.A = zext <2 x i16> %load.A to <2 x i64>
@@ -155,13 +243,42 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_and_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: bic v0.2s, #128, lsl #24
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: bic v0.2s, #128, lsl #24
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: bic v0.2s, #128, lsl #24
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI8_0
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%load.A = load <2 x i32>, ptr %A
%and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
%load.B = load <2 x i32>, ptr %B
@@ -217,13 +334,31 @@ define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI12_0
+; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -234,14 +369,33 @@ define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d1, [x0]
+; CHECK-NEON-NEXT: ldr d2, [x1]
+; CHECK-NEON-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d1, [x0]
+; CHECK-SVE-NEXT: ldr d2, [x1]
+; CHECK-SVE-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI13_0
+; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -252,14 +406,33 @@ define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
-; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d1, [x0]
+; CHECK-NEON-NEXT: ldr d2, [x1]
+; CHECK-NEON-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d1, [x0]
+; CHECK-SVE-NEXT: ldr d2, [x1]
+; CHECK-SVE-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -378,14 +551,34 @@ define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
}
define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI21_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = load <8 x i8>, ptr %C
@@ -398,15 +591,36 @@ define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
}
define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI22_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i16>, ptr %C
@@ -419,15 +633,36 @@ define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
}
define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI23_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i32>, ptr %C
@@ -548,14 +783,34 @@ define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
}
define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI30_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = load <8 x i8>, ptr %C
@@ -568,15 +823,36 @@ define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
}
define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI31_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i16>, ptr %C
@@ -589,15 +865,36 @@ define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
}
define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI32_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i32>, ptr %C
@@ -611,11 +908,25 @@ define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #244
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #244
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #244
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI33_0
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = sext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
ret <8 x i16> %tmp4
@@ -623,47 +934,111 @@ define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #64537 // =0xfc19
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19
+; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: dup v1.8h, w8
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19
+; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: dup v1.8h, w8
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI34_0
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = sext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
ret <8 x i16> %tmp4
}
define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mvni v1.4h, #11
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mvni v1.4h, #11
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mvni v1.4h, #11
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI35_0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
%tmp3 = sext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
ret <4 x i32> %tmp4
}
define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1234 // =0xfffffb2e
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov x8, #-1234 // =0xfffffffffffffb2e
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mul x9, x9, x8
+; CHECK-GI-NEXT: fmov x10, d1
+; CHECK-GI-NEXT: mul x8, x10, x8
+; CHECK-GI-NEXT: fmov d0, x9
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
%tmp3 = sext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
ret <2 x i64> %tmp4
}
define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #12
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #12
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI37_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI37_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
ret <8 x i16> %tmp4
@@ -671,49 +1046,118 @@ define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #999 // =0x3e7
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: dup v1.8h, w8
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: dup v1.8h, w8
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI38_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
ret <8 x i16> %tmp4
}
define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.4h, w8
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.4h, w8
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI39_0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
%tmp3 = zext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
ret <4 x i32> %tmp4
}
define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mul x9, x9, x8
+; CHECK-GI-NEXT: fmov x10, d1
+; CHECK-GI-NEXT: mul x8, x10, x8
+; CHECK-GI-NEXT: fmov d0, x9
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
ret <2 x i64> %tmp4
}
define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #12
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #12
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI41_1
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI41_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI41_0
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
%and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -721,14 +1165,34 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
}
define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.4h, w8
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.4h, w8
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI42_1
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI42_0
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
%and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -736,14 +1200,39 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
}
define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mul x9, x9, x8
+; CHECK-GI-NEXT: fmov x10, d1
+; CHECK-GI-NEXT: mul x8, x10, x8
+; CHECK-GI-NEXT: fmov d0, x9
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: adrp x8, .LCPI43_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
%and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
@@ -788,15 +1277,36 @@ ret <8 x i16> %3
}
define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
-; CHECK-LABEL: distribute:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: dup v1.8b, w2
-; CHECK-NEXT: mov d2, v0.d[1]
-; CHECK-NEXT: umull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: str q2, [x0]
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: distribute:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: ldr q0, [x1]
+; CHECK-NEON-NEXT: dup v1.8b, w2
+; CHECK-NEON-NEXT: mov d2, v0.d[1]
+; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b
+; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: str q2, [x0]
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: distribute:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ldr q0, [x1]
+; CHECK-SVE-NEXT: dup v1.8b, w2
+; CHECK-SVE-NEXT: mov d2, v0.d[1]
+; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b
+; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: str q2, [x0]
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: distribute:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q0, [x1]
+; CHECK-GI-NEXT: dup v2.8b, w2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: uaddl v0.8h, v1.8b, v0.8b
+; CHECK-GI-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
entry:
%0 = trunc i32 %mul to i8
%1 = insertelement <8 x i8> undef, i8 %0, i32 0
@@ -817,12 +1327,26 @@ entry:
}
define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: umull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -830,12 +1354,26 @@ define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: smull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -843,12 +1381,26 @@ define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: umull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -856,12 +1408,26 @@ define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: smull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -869,12 +1435,26 @@ define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: umull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -882,12 +1462,26 @@ define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
}
define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: smull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -895,14 +1489,33 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
}
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: amull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
-; CHECK-NEXT: bic v2.8h, #255, lsl #8
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: mov v0.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: mov v0.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: adrp x8, .LCPI53_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -911,14 +1524,33 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: amull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT: smull v3.4s, v0.4h, v1.4h
-; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: adrp x8, .LCPI54_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -927,14 +1559,33 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: amull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT: smull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
-; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI55_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI55_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -944,12 +1595,28 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI56_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI56_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -958,12 +1625,28 @@ entry:
}
define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i16_c:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16_c:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI57_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI57_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -972,13 +1655,30 @@ entry:
}
define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and256_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.8h, #1, lsl #8
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and256_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.8h, #1, lsl #8
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and256_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.8h, #1, lsl #8
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and256_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI58_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -987,11 +1687,25 @@ entry:
}
define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_andconst_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_andconst_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_andconst_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_andconst_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI59_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -999,14 +1713,35 @@ entry:
}
define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_smaller_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.8b, #15
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_smaller_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.8b, #15
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_smaller_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.8b, #15
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_smaller_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI60_1
+; CHECK-GI-NEXT: adrp x9, .LCPI60_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI60_1]
+; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i4> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -1015,13 +1750,30 @@ entry:
}
define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: umull_and_v4i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i32:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: xtn v1.4h, v1.4s
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI61_0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i16> %src1 to <4 x i32>
%in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
@@ -1030,15 +1782,37 @@ entry:
}
define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
-; CHECK-LABEL: umull_and_v8i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v3.2d, #0x0000ff000000ff
-; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uzp1 v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i32:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI62_0
+; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i16> %src1 to <8 x i32>
%in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1047,13 +1821,31 @@ entry:
}
define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
-; CHECK-LABEL: umull_and_v8i32_dup:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: dup v2.8h, w8
-; CHECK-NEXT: umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i32_dup:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: and w8, w0, #0xff
+; CHECK-NEON-NEXT: dup v2.8h, w8
+; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32_dup:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: and w8, w0, #0xff
+; CHECK-SVE-NEXT: dup v2.8h, w8
+; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32_dup:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT: dup v3.4s, w8
+; CHECK-GI-NEXT: mul v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i16> %src1 to <8 x i32>
%in2 = and i32 %src2, 255
@@ -1064,13 +1856,39 @@ entry:
}
define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
-; CHECK-LABEL: umull_and_v2i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.2d, #0x000000000000ff
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v2i64:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.2d, #0x000000000000ff
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.2d, #0x000000000000ff
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: xtn v1.2s, v1.2d
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI64_0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI64_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <2 x i32> %src1 to <2 x i64>
%in2 = and <2 x i64> %src2, <i64 255, i64 255>
@@ -1079,15 +1897,55 @@ entry:
}
define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
-; CHECK-LABEL: umull_and_v4i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v3.2d, #0x000000000000ff
-; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uzp1 v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i64:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT: uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI65_0
+; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI65_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: fmov x8, d4
+; CHECK-GI-NEXT: mov d3, v4.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov d4, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mov d0, v2.d[1]
+; CHECK-GI-NEXT: fmov x11, d4
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mul x11, x11, x12
+; CHECK-GI-NEXT: mov v0.d[1], x10
+; CHECK-GI-NEXT: mov v1.d[1], x11
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i32> %src1 to <4 x i64>
%in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
@@ -1096,13 +1954,46 @@ entry:
}
define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
-; CHECK-LABEL: umull_and_v4i64_dup:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i64_dup:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: and w8, w0, #0xff
+; CHECK-NEON-NEXT: dup v2.4s, w8
+; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64_dup:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: and w8, w0, #0xff
+; CHECK-SVE-NEXT: dup v2.4s, w8
+; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64_dup:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x8, x0, #0xff
+; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: dup v2.2d, x8
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: mov d1, v2.d[1]
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: fmov x12, d2
+; CHECK-GI-NEXT: mul x9, x10, x9
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mul x11, x12, x11
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mov v0.d[1], x10
+; CHECK-GI-NEXT: mov v1.d[1], x11
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i32> %src1 to <4 x i64>
%in2 = and i64 %src2, 255
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index a5154641400309..4c0d1efb99498f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -1,15 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-GI
; Tests for wider-than-legal extensions into mul/mla.
define <16 x i16> @mul_i16(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: mov v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i16>
%eb = zext <16 x i8> %b to <16 x i16>
@@ -18,17 +26,29 @@ entry:
}
define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NEXT: ushll v4.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v5.8h, v0.16b, #0
-; CHECK-NEXT: ushll2 v6.8h, v1.16b, #0
-; CHECK-NEXT: umull v0.4s, v2.4h, v4.4h
-; CHECK-NEXT: umull2 v1.4s, v2.8h, v4.8h
-; CHECK-NEXT: umull2 v3.4s, v5.8h, v6.8h
-; CHECK-NEXT: umull v2.4s, v5.4h, v6.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v4.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v5.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll2 v6.8h, v1.16b, #0
+; CHECK-SD-NEXT: umull v0.4s, v2.4h, v4.4h
+; CHECK-SD-NEXT: umull2 v1.4s, v2.8h, v4.8h
+; CHECK-SD-NEXT: umull2 v3.4s, v5.8h, v6.8h
+; CHECK-SD-NEXT: umull v2.4s, v5.4h, v6.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT: umull v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT: umull v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT: umull2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i32>
%eb = zext <16 x i8> %b to <16 x i32>
@@ -37,29 +57,53 @@ entry:
}
define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-NEXT: ushll v5.4s, v0.4h, #0
-; CHECK-NEXT: ushll v6.4s, v3.4h, #0
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: ushll v16.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v7.4s, v3.8h, #0
-; CHECK-NEXT: ushll2 v17.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v18.4s, v1.8h, #0
-; CHECK-NEXT: umull2 v1.2d, v4.4s, v6.4s
-; CHECK-NEXT: umull v0.2d, v4.2s, v6.2s
-; CHECK-NEXT: umull2 v3.2d, v2.4s, v7.4s
-; CHECK-NEXT: umull v2.2d, v2.2s, v7.2s
-; CHECK-NEXT: umull v4.2d, v5.2s, v16.2s
-; CHECK-NEXT: umull2 v7.2d, v17.4s, v18.4s
-; CHECK-NEXT: umull2 v5.2d, v5.4s, v16.4s
-; CHECK-NEXT: umull v6.2d, v17.2s, v18.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v6.4s, v3.4h, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll v16.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v7.4s, v3.8h, #0
+; CHECK-SD-NEXT: ushll2 v17.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v1.8h, #0
+; CHECK-SD-NEXT: umull2 v1.2d, v4.4s, v6.4s
+; CHECK-SD-NEXT: umull v0.2d, v4.2s, v6.2s
+; CHECK-SD-NEXT: umull2 v3.2d, v2.4s, v7.4s
+; CHECK-SD-NEXT: umull v2.2d, v2.2s, v7.2s
+; CHECK-SD-NEXT: umull v4.2d, v5.2s, v16.2s
+; CHECK-SD-NEXT: umull2 v7.2d, v17.4s, v18.4s
+; CHECK-SD-NEXT: umull2 v5.2d, v5.4s, v16.4s
+; CHECK-SD-NEXT: umull v6.2d, v17.2s, v18.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT: umull v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT: umull v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT: umull2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT: umull v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT: umull2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT: umull v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT: umull2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i64>
%eb = zext <16 x i8> %b to <16 x i64>
@@ -69,13 +113,21 @@ entry:
define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
-; CHECK-LABEL: mla_i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umlal2 v3.8h, v0.16b, v1.16b
-; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i16>
%eb = zext <16 x i8> %b to <16 x i16>
@@ -85,21 +137,37 @@ entry:
}
define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
-; CHECK-LABEL: mla_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h
-; CHECK-NEXT: umlal2 v3.4s, v6.8h, v7.8h
-; CHECK-NEXT: umlal2 v5.4s, v0.8h, v1.8h
-; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: mov v2.16b, v4.16b
-; CHECK-NEXT: mov v3.16b, v5.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: umlal v2.4s, v6.4h, v7.4h
+; CHECK-SD-NEXT: umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-SD-NEXT: umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: umlal v4.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: umlal v2.4s, v6.4h, v7.4h
+; CHECK-GI-NEXT: umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-GI-NEXT: umlal v4.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i32>
%eb = zext <16 x i8> %b to <16 x i32>
@@ -109,38 +177,71 @@ entry:
}
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
-; CHECK-LABEL: mla_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v17.16b, v7.16b
-; CHECK-NEXT: mov v16.16b, v6.16b
-; CHECK-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ushll v18.4s, v6.4h, #0
-; CHECK-NEXT: ushll2 v21.4s, v6.8h, #0
-; CHECK-NEXT: ushll v19.4s, v0.4h, #0
-; CHECK-NEXT: ushll v20.4s, v7.4h, #0
-; CHECK-NEXT: ushll v22.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v23.4s, v7.8h, #0
-; CHECK-NEXT: ldp q6, q7, [sp]
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: umlal2 v3.2d, v18.4s, v20.4s
-; CHECK-NEXT: umlal v2.2d, v18.2s, v20.2s
-; CHECK-NEXT: umlal v16.2d, v19.2s, v22.2s
-; CHECK-NEXT: umlal2 v5.2d, v21.4s, v23.4s
-; CHECK-NEXT: umlal v4.2d, v21.2s, v23.2s
-; CHECK-NEXT: umlal2 v17.2d, v19.4s, v22.4s
-; CHECK-NEXT: umlal2 v7.2d, v0.4s, v1.4s
-; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: mov v2.16b, v4.16b
-; CHECK-NEXT: mov v3.16b, v5.16b
-; CHECK-NEXT: mov v4.16b, v16.16b
-; CHECK-NEXT: mov v5.16b, v17.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov v17.16b, v7.16b
+; CHECK-SD-NEXT: mov v16.16b, v6.16b
+; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
+; CHECK-SD-NEXT: ushll2 v21.4s, v6.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v20.4s, v7.4h, #0
+; CHECK-SD-NEXT: ushll v22.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v23.4s, v7.8h, #0
+; CHECK-SD-NEXT: ldp q6, q7, [sp]
+; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: umlal2 v3.2d, v18.4s, v20.4s
+; CHECK-SD-NEXT: umlal v2.2d, v18.2s, v20.2s
+; CHECK-SD-NEXT: umlal v16.2d, v19.2s, v22.2s
+; CHECK-SD-NEXT: umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-SD-NEXT: umlal v4.2d, v21.2s, v23.2s
+; CHECK-SD-NEXT: umlal2 v17.2d, v19.4s, v22.4s
+; CHECK-SD-NEXT: umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT: umlal v6.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: mov v4.16b, v16.16b
+; CHECK-SD-NEXT: mov v5.16b, v17.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v16.16b, v2.16b
+; CHECK-GI-NEXT: mov v17.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: mov v4.16b, v6.16b
+; CHECK-GI-NEXT: mov v5.16b, v7.16b
+; CHECK-GI-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v18.4s, v6.4h, #0
+; CHECK-GI-NEXT: ushll v20.4s, v7.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0
+; CHECK-GI-NEXT: ushll v23.4s, v1.4h, #0
+; CHECK-GI-NEXT: ldp q6, q7, [sp]
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: umlal v16.2d, v18.2s, v20.2s
+; CHECK-GI-NEXT: umlal2 v17.2d, v18.4s, v20.4s
+; CHECK-GI-NEXT: umlal v2.2d, v19.2s, v22.2s
+; CHECK-GI-NEXT: umlal2 v3.2d, v19.4s, v22.4s
+; CHECK-GI-NEXT: umlal v4.2d, v21.2s, v23.2s
+; CHECK-GI-NEXT: umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-GI-NEXT: umlal v6.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v16.16b
+; CHECK-GI-NEXT: mov v1.16b, v17.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i64>
%eb = zext <16 x i8> %b to <16 x i64>
More information about the llvm-commits mailing list