[llvm] 45f51f9 - [AArch64][GlobalISel] Select UMULL instruction (#65469)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 25 01:34:55 PDT 2023
Author: chuongg3
Date: 2023-09-25T09:34:51+01:00
New Revision: 45f51f9f7c5bc36f0f91695bb5e600013c2114ec
URL: https://github.com/llvm/llvm-project/commit/45f51f9f7c5bc36f0f91695bb5e600013c2114ec
DIFF: https://github.com/llvm/llvm-project/commit/45f51f9f7c5bc36f0f91695bb5e600013c2114ec.diff
LOG: [AArch64][GlobalISel] Select UMULL instruction (#65469)
Global ISel now selects `UMULL` and `UMULL2` instructions.
G_MUL instructions whose input operands both come from `SEXT` or both come
from `ZEXT` operations are turned into SMULL or UMULL respectively.
G_MUL instructions with a v2s64 result type are always scalarised, except for:
`mul ( unmerge( ext ), unmerge( ext ))`
This is so that the extend can be unmerged and the unmerge in the middle folded away:
`mul ( unmerge( ext ), unmerge( ext ))` =>
`mul ( unmerge( merge( ext( unmerge ))), unmerge( merge( ext( unmerge
))))` =>
`mul ( ext( unmerge ), ext( unmerge ))`
Added:
Modified:
llvm/lib/Target/AArch64/AArch64Combine.td
llvm/lib/Target/AArch64/AArch64InstrGISel.td
llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
llvm/test/CodeGen/AArch64/aarch64-smull.ll
llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 6a02c6d5388869b..f7b55cad4269944 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -156,6 +156,13 @@ def mul_const : GICombineRule<
(apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }])
>;
+def lower_mull : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_MUL):$root,
+ [{ return matchExtMulToMULL(*${root}, MRI); }]),
+ (apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer); }])
+>;
+
def build_vector_to_dup : GICombineRule<
(defs root:$root),
(match (wip_match_opcode G_BUILD_VECTOR):$root,
@@ -232,7 +239,7 @@ def AArch64PostLegalizerLowering
icmp_lowering, build_vector_lowering,
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
- unmerge_ext_to_unmerge]> {
+ unmerge_ext_to_unmerge, lower_mull]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 0f3ef2327769eae..c6ff7bea4bd2c92 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -215,6 +215,18 @@ def G_PREFETCH : AArch64GenericInstruction {
let hasSideEffects = 1;
}
+def G_UMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_SMULL : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
// expands into BSL/BIT/BIF after register allocation.
def G_BSP : AArch64GenericInstruction {
@@ -255,6 +267,9 @@ def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;
def : GINodeEquiv<G_BSP, AArch64bsp>;
+def : GINodeEquiv<G_UMULL, AArch64umull>;
+def : GINodeEquiv<G_SMULL, AArch64smull>;
+
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d07de82de1335af..8d3d94290b0e580 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -119,13 +119,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.clampScalar(0, s32, s64);
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
- .scalarizeIf(
- [=](const LegalityQuery &Query) {
- return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
- },
- 0)
- .legalFor({v2s64})
+ .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
.widenScalarToNextPow2(0)
.clampScalar(0, s32, s64)
.clampMaxNumElements(0, s8, 16)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 9640a1c17b87c33..687063873a16b24 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1110,6 +1110,75 @@ void applyUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
Observer.changedInstr(MI);
}
+// Match a vector G_MUL whose operands are both G_ZEXT or both G_SEXT and whose
+// extends exactly double the scalar width (=> G_UMULL / G_SMULL), OR any other
+// G_MUL with a v2s64 result (=> split by applyExtMulToMULL).
+// Both cases are matched in one function so that the order is always the same:
+// lowering MUL to MULL is tried first, scalarisation is only the fallback.
+bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ // Result type plus each operand's def (via getDefIgnoringCopies); NOTE(review): defs are used unchecked
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+ MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+
+ if (DstTy.isVector()) {
+ // Same-kind extends on both operands, each with dst scalar bits == 2 * src scalar bits: {U/S}MULL applies
+ unsigned I1Opc = I1->getOpcode();
+ unsigned I2Opc = I2->getOpcode();
+ if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
+ (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
+ (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
+ MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
+ (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
+ MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {
+ return true;
+ }
+ // Otherwise a v2s64 result still matches, so that the apply step can scalarise it
+ else if (DstTy == LLT::fixed_vector(2, 64)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, GISelChangeObserver &Observer) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL &&
+ "Expected a G_MUL instruction");
+
+ // Re-derive the result type and each operand's def, repeating the lookups done in matchExtMulToMULL
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+ MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+
+ // Same-kind width-doubling extends: build G_UMULL (G_ZEXT) or G_SMULL (G_SEXT) on the extend sources, erase MI
+ unsigned I1Opc = I1->getOpcode();
+ unsigned I2Opc = I2->getOpcode();
+ if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
+ (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
+ (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
+ MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
+ (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
+ MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {
+
+ B.setInstrAndDebugLoc(MI);
+ B.buildInstr(I1->getOpcode() == TargetOpcode::G_ZEXT ? AArch64::G_UMULL
+ : AArch64::G_SMULL,
+ {MI.getOperand(0).getReg()},
+ {I1->getOperand(1).getReg(), I2->getOperand(1).getReg()});
+ MI.eraseFromParent();
+ }
+ // Otherwise, for a v2s64 result, narrow MI via fewerElementsVector to a type with half the element count
+ else if (DstTy == LLT::fixed_vector(2, 64)) {
+ LegalizerHelper Helper(*MI.getMF(), Observer, B);
+ B.setInstrAndDebugLoc(MI);
+ Helper.fewerElementsVector(
+ MI, 0,
+ DstTy.changeElementCount(
+ DstTy.getElementCount().divideCoefficientBy(2)));
+ }
+}
+
class AArch64PostLegalizerLoweringImpl : public Combiner {
protected:
// TODO: Make CombinerHelper methods const.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
index 2513eb4989a5e82..0e895c49eff0991 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
@@ -175,12 +175,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
- ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UV]], [[UV2]]
- ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[UV1]], [[UV3]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64)
- ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $q0 = COPY [[MUL]](<2 x s64>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
index edbb776a7f4c4ad..b707e3ce2a3d50b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
@@ -203,11 +203,9 @@ body: |
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[UV]], [[UV2]]
; CHECK-NEXT: [[SDIV1:%[0-9]+]]:_(s64) = G_SDIV [[UV1]], [[UV3]]
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>)
- ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SDIV]], [[UV4]]
- ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SDIV1]], [[UV5]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL1]](s64)
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<2 x s64>) = G_SUB [[COPY]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SDIV]](s64), [[SDIV1]](s64)
+ ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[BUILD_VECTOR]], [[COPY1]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<2 x s64>) = G_SUB [[COPY]], [[MUL]]
; CHECK-NEXT: $q0 = COPY [[SUB]](<2 x s64>)
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index b38868a530264e9..5483e80286cde50 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -27,7 +27,7 @@
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#
-# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index
+# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
#
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index c1470239995c99c..6842afb561a1d6f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
@@ -48,14 +49,36 @@ define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: smull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr q2, [x1]
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: smull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr q2, [x1]
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: smull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i8>, ptr %A
%load.B = load <8 x i16>, ptr %B
%zext.A = zext <8 x i8> %load.A to <8 x i32>
@@ -65,14 +88,36 @@ define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
}
define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: smull2 v1.4s, v2.8h, v0.8h
-; CHECK-NEXT: smull v0.4s, v2.4h, v0.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x1]
+; CHECK-NEON-NEXT: ldr q2, [x0]
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: smull2 v1.4s, v2.8h, v0.8h
+; CHECK-NEON-NEXT: smull v0.4s, v2.4h, v0.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x1]
+; CHECK-SVE-NEXT: ldr q2, [x0]
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: smull2 v1.4s, v2.8h, v0.8h
+; CHECK-SVE-NEXT: smull v0.4s, v2.4h, v0.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x1]
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i16>, ptr %A
%load.B = load <8 x i8>, ptr %B
%sext.A = sext <8 x i16> %load.A to <8 x i32>
@@ -82,18 +127,46 @@ define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounw
}
define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: orr v0.8h, #128, lsl #8
-; CHECK-NEXT: sshll v3.4s, v1.4h, #0
-; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: mul v0.4s, v2.4s, v3.4s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr q1, [x1]
+; CHECK-NEON-NEXT: orr v0.8h, #128, lsl #8
+; CHECK-NEON-NEXT: sshll v3.4s, v1.4h, #0
+; CHECK-NEON-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-NEON-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-NEON-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-NEON-NEXT: mul v1.4s, v0.4s, v1.4s
+; CHECK-NEON-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr q1, [x1]
+; CHECK-SVE-NEXT: orr v0.8h, #128, lsl #8
+; CHECK-SVE-NEXT: sshll v3.4s, v1.4h, #0
+; CHECK-SVE-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-SVE-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-SVE-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SVE-NEXT: mul v1.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT: mul v0.4s, v2.4s, v3.4s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI5_0
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ret
%load.A = load <8 x i16>, ptr %A
%or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
%load.B = load <8 x i16>, ptr %B
@@ -146,6 +219,33 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: adrp x8, .LCPI7_0
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI7_0]
+; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: ldr d0, [x1]
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: mov d3, v0.d[1]
+; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mov d2, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%load.A = load <2 x i16>, ptr %A
%load.B = load <2 x i32>, ptr %B
%zext.A = zext <2 x i16> %load.A to <2 x i64>
@@ -155,13 +255,42 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_and_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: bic v0.2s, #128, lsl #24
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: bic v0.2s, #128, lsl #24
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: bic v0.2s, #128, lsl #24
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI8_0
+; CHECK-GI-NEXT: ldr d1, [x0]
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%load.A = load <2 x i32>, ptr %A
%and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
%load.B = load <2 x i32>, ptr %B
@@ -217,13 +346,31 @@ define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
}
define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI12_0
+; CHECK-GI-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -234,14 +381,33 @@ define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
}
define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d1, [x0]
+; CHECK-NEON-NEXT: ldr d2, [x1]
+; CHECK-NEON-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d1, [x0]
+; CHECK-SVE-NEXT: ldr d2, [x1]
+; CHECK-SVE-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI13_0
+; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -252,14 +418,33 @@ define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
}
define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
-; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr d1, [x0]
+; CHECK-NEON-NEXT: ldr d2, [x1]
+; CHECK-NEON-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr d1, [x0]
+; CHECK-SVE-NEXT: ldr d2, [x1]
+; CHECK-SVE-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -378,14 +563,34 @@ define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
}
define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI21_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = load <8 x i8>, ptr %C
@@ -398,15 +603,36 @@ define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
}
define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI22_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i16>, ptr %C
@@ -419,15 +645,36 @@ define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
}
define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlal_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI23_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i32>, ptr %C
@@ -548,14 +795,34 @@ define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
}
define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI30_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = load <8 x i8>, ptr %C
@@ -568,15 +835,36 @@ define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
}
define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI31_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i16>, ptr %C
@@ -589,15 +877,36 @@ define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
}
define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: ldr q0, [x0]
+; CHECK-NEON-NEXT: ldr d1, [x1]
+; CHECK-NEON-NEXT: ldr d2, [x2]
+; CHECK-NEON-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: ldr q0, [x0]
+; CHECK-SVE-NEXT: ldr d1, [x1]
+; CHECK-SVE-NEXT: ldr d2, [x2]
+; CHECK-SVE-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: adrp x8, .LCPI32_0
+; CHECK-GI-NEXT: ldr d2, [x2]
+; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i32>, ptr %C
@@ -611,11 +920,25 @@ define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #244
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #244
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #244
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI33_0
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = sext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
ret <8 x i16> %tmp4
@@ -623,47 +946,115 @@ define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #64537 // =0xfc19
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #64537 // =0xfc19
+; CHECK-NEON-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: dup v1.8h, w8
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #64537 // =0xfc19
+; CHECK-SVE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: dup v1.8h, w8
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI34_0
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = sext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
ret <8 x i16> %tmp4
}
define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mvni v1.4h, #11
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mvni v1.4h, #11
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mvni v1.4h, #11
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI35_0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
%tmp3 = sext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
ret <4 x i32> %tmp4
}
define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1234 // =0xfffffb2e
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #-1234 // =0xfffffb2e
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #-1234 // =0xfffffb2e
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI36_0
+; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%tmp3 = sext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
ret <2 x i64> %tmp4
}
define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #12
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #12
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI37_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI37_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
ret <8 x i16> %tmp4
@@ -671,49 +1062,122 @@ define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #999 // =0x3e7
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #999 // =0x3e7
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: dup v1.8h, w8
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #999 // =0x3e7
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: dup v1.8h, w8
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI38_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
ret <8 x i16> %tmp4
}
define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.4h, w8
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.4h, w8
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI39_0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
%tmp3 = zext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
ret <4 x i32> %tmp4
}
define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI40_0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI40_0]
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
ret <2 x i64> %tmp4
}
define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v8i8_v8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #12
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v1.8b, #12
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v1.8b, #12
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI41_1
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI41_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI41_0
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
%tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
%and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -721,14 +1185,34 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
}
define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v4i16_v4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.4h, w8
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.4h, w8
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI42_1
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI42_0
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI42_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <4 x i16> %arg to <4 x i32>
%tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
%and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -736,14 +1220,43 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
}
define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v2i32_v2i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1234 // =0x4d2
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT: dup v1.2s, w8
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT: dup v1.2s, w8
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI43_1
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_1]
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: adrp x8, .LCPI43_0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI43_0]
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%tmp3 = zext <2 x i32> %arg to <2 x i64>
%tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
%and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
@@ -788,15 +1301,36 @@ ret <8 x i16> %3
}
define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
-; CHECK-LABEL: distribute:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr q0, [x1]
-; CHECK-NEXT: dup v1.8b, w2
-; CHECK-NEXT: mov d2, v0.d[1]
-; CHECK-NEXT: umull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: str q2, [x0]
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: distribute:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: ldr q0, [x1]
+; CHECK-NEON-NEXT: dup v1.8b, w2
+; CHECK-NEON-NEXT: mov d2, v0.d[1]
+; CHECK-NEON-NEXT: umull v2.8h, v2.8b, v1.8b
+; CHECK-NEON-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: str q2, [x0]
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: distribute:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: ldr q0, [x1]
+; CHECK-SVE-NEXT: dup v1.8b, w2
+; CHECK-SVE-NEXT: mov d2, v0.d[1]
+; CHECK-SVE-NEXT: umull v2.8h, v2.8b, v1.8b
+; CHECK-SVE-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: str q2, [x0]
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: distribute:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q0, [x1]
+; CHECK-GI-NEXT: dup v1.8b, w2
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: str q0, [x0]
+; CHECK-GI-NEXT: ret
entry:
%0 = trunc i32 %mul to i8
%1 = insertelement <8 x i8> undef, i8 %0, i32 0
@@ -817,12 +1351,26 @@ entry:
}
define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: umull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -830,12 +1378,26 @@ define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: smull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -843,12 +1405,26 @@ define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: umull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -856,12 +1432,26 @@ define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: smull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: smull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -869,12 +1459,26 @@ define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: umull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -882,12 +1486,26 @@ define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
}
define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: smull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: smull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: mov v1.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: smull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: mov v1.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: smull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: smull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: smull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -895,14 +1513,33 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
}
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: amull2_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
-; CHECK-NEXT: bic v2.8h, #255, lsl #8
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i8:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT: bic v2.8h, #255, lsl #8
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: mov v0.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i8:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: smull v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT: bic v2.8h, #255, lsl #8
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: mov v0.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: adrp x8, .LCPI53_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
%mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -911,14 +1548,33 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
}
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: amull2_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT: smull v3.4s, v0.4h, v1.4h
-; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i16:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT: smull v3.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i16:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT: smull v3.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: adrp x8, .LCPI54_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
%mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -927,14 +1583,33 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
}
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: amull2_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT: smull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
-; CHECK-NEXT: and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: amull2_i32:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT: smull v3.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: smull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: amull2_i32:
+; CHECK-SVE: // %bb.0:
+; CHECK-SVE-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT: smull v3.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: smull2 v0.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT: and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT: and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: amull2_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI55_0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI55_0]
+; CHECK-GI-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
%mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -944,12 +1619,28 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI56_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI56_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -958,12 +1649,28 @@ entry:
}
define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16_c:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i16_c:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16_c:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16_c:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI57_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI57_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -972,13 +1679,30 @@ entry:
}
define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and256_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.8h, #1, lsl #8
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and256_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.8h, #1, lsl #8
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and256_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.8h, #1, lsl #8
+; CHECK-SVE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and256_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI58_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -987,11 +1711,25 @@ entry:
}
define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_andconst_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_andconst_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_andconst_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_andconst_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI59_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i8> %src1 to <8 x i16>
%out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -999,14 +1737,35 @@ entry:
}
define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_smaller_v8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.8b, #15
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_smaller_v8i16:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.8b, #15
+; CHECK-NEON-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT: xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_smaller_v8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.8b, #15
+; CHECK-SVE-NEXT: bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT: xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-SVE-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_smaller_v8i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI60_1
+; CHECK-GI-NEXT: adrp x9, .LCPI60_0
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI60_1]
+; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i4> %src1 to <8 x i16>
%in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -1015,13 +1774,30 @@ entry:
}
define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: umull_and_v4i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i32:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: xtn v1.4h, v1.4s
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI61_0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i16> %src1 to <4 x i32>
%in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
@@ -1030,15 +1806,37 @@ entry:
}
define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
-; CHECK-LABEL: umull_and_v8i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v3.2d, #0x0000ff000000ff
-; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uzp1 v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i32:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v3.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v3.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT: uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI62_0
+; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT: and v0.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: mul v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT: mul v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i16> %src1 to <8 x i32>
%in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1047,13 +1845,31 @@ entry:
}
define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
-; CHECK-LABEL: umull_and_v8i32_dup:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: dup v2.8h, w8
-; CHECK-NEXT: umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v8i32_dup:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: and w8, w0, #0xff
+; CHECK-NEON-NEXT: dup v2.8h, w8
+; CHECK-NEON-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32_dup:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: and w8, w0, #0xff
+; CHECK-SVE-NEXT: dup v2.8h, w8
+; CHECK-SVE-NEXT: umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT: umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32_dup:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT: dup v3.4s, w8
+; CHECK-GI-NEXT: mul v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <8 x i16> %src1 to <8 x i32>
%in2 = and i32 %src2, 255
@@ -1064,13 +1880,39 @@ entry:
}
define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
-; CHECK-LABEL: umull_and_v2i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v2.2d, #0x000000000000ff
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v2i64:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v2.2d, #0x000000000000ff
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v2i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v2.2d, #0x000000000000ff
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT: xtn v1.2s, v1.2d
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v2i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI64_0
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI64_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mov v0.d[1], x9
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <2 x i32> %src1 to <2 x i64>
%in2 = and <2 x i64> %src2, <i64 255, i64 255>
@@ -1079,15 +1921,55 @@ entry:
}
define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
-; CHECK-LABEL: umull_and_v4i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v3.2d, #0x000000000000ff
-; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uzp1 v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i64:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-NEON-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT: uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SVE-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT: uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: adrp x8, .LCPI65_0
+; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI65_0]
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT: fmov x8, d4
+; CHECK-GI-NEXT: mov d3, v4.d[1]
+; CHECK-GI-NEXT: fmov x9, d1
+; CHECK-GI-NEXT: mov d4, v1.d[1]
+; CHECK-GI-NEXT: fmov x10, d2
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x9, d0
+; CHECK-GI-NEXT: mov d0, v2.d[1]
+; CHECK-GI-NEXT: fmov x11, d4
+; CHECK-GI-NEXT: mul x9, x9, x10
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mul x11, x11, x12
+; CHECK-GI-NEXT: mov v0.d[1], x10
+; CHECK-GI-NEXT: mov v1.d[1], x11
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i32> %src1 to <4 x i64>
%in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
@@ -1096,13 +1978,46 @@ entry:
}
define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
-; CHECK-LABEL: umull_and_v4i64_dup:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: umull_and_v4i64_dup:
+; CHECK-NEON: // %bb.0: // %entry
+; CHECK-NEON-NEXT: and w8, w0, #0xff
+; CHECK-NEON-NEXT: dup v2.4s, w8
+; CHECK-NEON-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64_dup:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: and w8, w0, #0xff
+; CHECK-SVE-NEXT: dup v2.4s, w8
+; CHECK-SVE-NEXT: umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT: umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64_dup:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x8, x0, #0xff
+; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT: dup v2.2d, x8
+; CHECK-GI-NEXT: mov d3, v1.d[1]
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: fmov x10, d0
+; CHECK-GI-NEXT: mov d1, v2.d[1]
+; CHECK-GI-NEXT: fmov x9, d2
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: mul x8, x8, x9
+; CHECK-GI-NEXT: fmov x11, d1
+; CHECK-GI-NEXT: fmov x12, d2
+; CHECK-GI-NEXT: mul x9, x10, x9
+; CHECK-GI-NEXT: fmov x10, d3
+; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: mul x11, x12, x11
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: mov v0.d[1], x10
+; CHECK-GI-NEXT: mov v1.d[1], x11
+; CHECK-GI-NEXT: ret
entry:
%in1 = zext <4 x i32> %src1 to <4 x i64>
%in2 = and i64 %src2, 255
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index a5154641400309f..4c0d1efb99498fb 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -1,15 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-GI
; Tests for wider-than-legal extensions into mul/mla.
define <16 x i16> @mul_i16(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: mov v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i16>
%eb = zext <16 x i8> %b to <16 x i16>
@@ -18,17 +26,29 @@ entry:
}
define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NEXT: ushll v4.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v5.8h, v0.16b, #0
-; CHECK-NEXT: ushll2 v6.8h, v1.16b, #0
-; CHECK-NEXT: umull v0.4s, v2.4h, v4.4h
-; CHECK-NEXT: umull2 v1.4s, v2.8h, v4.8h
-; CHECK-NEXT: umull2 v3.4s, v5.8h, v6.8h
-; CHECK-NEXT: umull v2.4s, v5.4h, v6.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v4.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v5.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll2 v6.8h, v1.16b, #0
+; CHECK-SD-NEXT: umull v0.4s, v2.4h, v4.4h
+; CHECK-SD-NEXT: umull2 v1.4s, v2.8h, v4.8h
+; CHECK-SD-NEXT: umull2 v3.4s, v5.8h, v6.8h
+; CHECK-SD-NEXT: umull v2.4s, v5.4h, v6.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT: umull v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT: umull2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT: umull v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT: umull2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i32>
%eb = zext <16 x i8> %b to <16 x i32>
@@ -37,29 +57,53 @@ entry:
}
define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-NEXT: ushll v5.4s, v0.4h, #0
-; CHECK-NEXT: ushll v6.4s, v3.4h, #0
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: ushll v16.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v7.4s, v3.8h, #0
-; CHECK-NEXT: ushll2 v17.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v18.4s, v1.8h, #0
-; CHECK-NEXT: umull2 v1.2d, v4.4s, v6.4s
-; CHECK-NEXT: umull v0.2d, v4.2s, v6.2s
-; CHECK-NEXT: umull2 v3.2d, v2.4s, v7.4s
-; CHECK-NEXT: umull v2.2d, v2.2s, v7.2s
-; CHECK-NEXT: umull v4.2d, v5.2s, v16.2s
-; CHECK-NEXT: umull2 v7.2d, v17.4s, v18.4s
-; CHECK-NEXT: umull2 v5.2d, v5.4s, v16.4s
-; CHECK-NEXT: umull v6.2d, v17.2s, v18.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v6.4s, v3.4h, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll v16.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v7.4s, v3.8h, #0
+; CHECK-SD-NEXT: ushll2 v17.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v1.8h, #0
+; CHECK-SD-NEXT: umull2 v1.2d, v4.4s, v6.4s
+; CHECK-SD-NEXT: umull v0.2d, v4.2s, v6.2s
+; CHECK-SD-NEXT: umull2 v3.2d, v2.4s, v7.4s
+; CHECK-SD-NEXT: umull v2.2d, v2.2s, v7.2s
+; CHECK-SD-NEXT: umull v4.2d, v5.2s, v16.2s
+; CHECK-SD-NEXT: umull2 v7.2d, v17.4s, v18.4s
+; CHECK-SD-NEXT: umull2 v5.2d, v5.4s, v16.4s
+; CHECK-SD-NEXT: umull v6.2d, v17.2s, v18.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT: umull v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT: umull2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT: umull v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT: umull2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT: umull v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT: umull2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT: umull v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT: umull2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i64>
%eb = zext <16 x i8> %b to <16 x i64>
@@ -69,13 +113,21 @@ entry:
define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
-; CHECK-LABEL: mla_i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umlal2 v3.8h, v0.16b, v1.16b
-; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: umlal v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i16>
%eb = zext <16 x i8> %b to <16 x i16>
@@ -85,21 +137,37 @@ entry:
}
define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
-; CHECK-LABEL: mla_i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h
-; CHECK-NEXT: umlal2 v3.4s, v6.8h, v7.8h
-; CHECK-NEXT: umlal2 v5.4s, v0.8h, v1.8h
-; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: mov v2.16b, v4.16b
-; CHECK-NEXT: mov v3.16b, v5.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: umlal v2.4s, v6.4h, v7.4h
+; CHECK-SD-NEXT: umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-SD-NEXT: umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: umlal v4.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: umlal v2.4s, v6.4h, v7.4h
+; CHECK-GI-NEXT: umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-GI-NEXT: umlal v4.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mov v1.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i32>
%eb = zext <16 x i8> %b to <16 x i32>
@@ -109,38 +177,71 @@ entry:
}
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
-; CHECK-LABEL: mla_i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v17.16b, v7.16b
-; CHECK-NEXT: mov v16.16b, v6.16b
-; CHECK-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT: ushll v18.4s, v6.4h, #0
-; CHECK-NEXT: ushll2 v21.4s, v6.8h, #0
-; CHECK-NEXT: ushll v19.4s, v0.4h, #0
-; CHECK-NEXT: ushll v20.4s, v7.4h, #0
-; CHECK-NEXT: ushll v22.4s, v1.4h, #0
-; CHECK-NEXT: ushll2 v23.4s, v7.8h, #0
-; CHECK-NEXT: ldp q6, q7, [sp]
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT: umlal2 v3.2d, v18.4s, v20.4s
-; CHECK-NEXT: umlal v2.2d, v18.2s, v20.2s
-; CHECK-NEXT: umlal v16.2d, v19.2s, v22.2s
-; CHECK-NEXT: umlal2 v5.2d, v21.4s, v23.4s
-; CHECK-NEXT: umlal v4.2d, v21.2s, v23.2s
-; CHECK-NEXT: umlal2 v17.2d, v19.4s, v22.4s
-; CHECK-NEXT: umlal2 v7.2d, v0.4s, v1.4s
-; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: mov v1.16b, v3.16b
-; CHECK-NEXT: mov v2.16b, v4.16b
-; CHECK-NEXT: mov v3.16b, v5.16b
-; CHECK-NEXT: mov v4.16b, v16.16b
-; CHECK-NEXT: mov v5.16b, v17.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mla_i64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: mov v17.16b, v7.16b
+; CHECK-SD-NEXT: mov v16.16b, v6.16b
+; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
+; CHECK-SD-NEXT: ushll2 v21.4s, v6.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v20.4s, v7.4h, #0
+; CHECK-SD-NEXT: ushll v22.4s, v1.4h, #0
+; CHECK-SD-NEXT: ushll2 v23.4s, v7.8h, #0
+; CHECK-SD-NEXT: ldp q6, q7, [sp]
+; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT: umlal2 v3.2d, v18.4s, v20.4s
+; CHECK-SD-NEXT: umlal v2.2d, v18.2s, v20.2s
+; CHECK-SD-NEXT: umlal v16.2d, v19.2s, v22.2s
+; CHECK-SD-NEXT: umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-SD-NEXT: umlal v4.2d, v21.2s, v23.2s
+; CHECK-SD-NEXT: umlal2 v17.2d, v19.4s, v22.4s
+; CHECK-SD-NEXT: umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT: umlal v6.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: mov v1.16b, v3.16b
+; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: mov v3.16b, v5.16b
+; CHECK-SD-NEXT: mov v4.16b, v16.16b
+; CHECK-SD-NEXT: mov v5.16b, v17.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mla_i64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v16.16b, v2.16b
+; CHECK-GI-NEXT: mov v17.16b, v3.16b
+; CHECK-GI-NEXT: mov v2.16b, v4.16b
+; CHECK-GI-NEXT: mov v3.16b, v5.16b
+; CHECK-GI-NEXT: mov v4.16b, v6.16b
+; CHECK-GI-NEXT: mov v5.16b, v7.16b
+; CHECK-GI-NEXT: ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v18.4s, v6.4h, #0
+; CHECK-GI-NEXT: ushll v20.4s, v7.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0
+; CHECK-GI-NEXT: ushll v23.4s, v1.4h, #0
+; CHECK-GI-NEXT: ldp q6, q7, [sp]
+; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT: umlal v16.2d, v18.2s, v20.2s
+; CHECK-GI-NEXT: umlal2 v17.2d, v18.4s, v20.4s
+; CHECK-GI-NEXT: umlal v2.2d, v19.2s, v22.2s
+; CHECK-GI-NEXT: umlal2 v3.2d, v19.4s, v22.4s
+; CHECK-GI-NEXT: umlal v4.2d, v21.2s, v23.2s
+; CHECK-GI-NEXT: umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-GI-NEXT: umlal v6.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: mov v0.16b, v16.16b
+; CHECK-GI-NEXT: mov v1.16b, v17.16b
+; CHECK-GI-NEXT: ret
entry:
%ea = zext <16 x i8> %a to <16 x i64>
%eb = zext <16 x i8> %b to <16 x i64>
More information about the llvm-commits
mailing list