[llvm] 5ca26d7 - [AArch64][SME2] Improve register allocation of multi-vector SME intrinsics (#116399)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 12 02:00:28 PST 2024
Author: Kerry McLaughlin
Date: 2024-12-12T10:00:24Z
New Revision: 5ca26d769deedc931ce19b4a68a68c799f8d7564
URL: https://github.com/llvm/llvm-project/commit/5ca26d769deedc931ce19b4a68a68c799f8d7564
DIFF: https://github.com/llvm/llvm-project/commit/5ca26d769deedc931ce19b4a68a68c799f8d7564.diff
LOG: [AArch64][SME2] Improve register allocation of multi-vector SME intrinsics (#116399)
The FORM_TRANSPOSED_REG_TUPLE pseudos have been created to improve
register allocation for intrinsics which use strided and contiguous
multi-vector registers, avoiding unnecessary copies.
If the operands of the pseudo are copies where the source register is in
the StridedOrContiguous class, the pseudo is used by getRegAllocationHints
to suggest a contiguous multi-vector register which matches the
subregister sequence used by the operands.
If the operands do not match this pattern, the pseudos are expanded
to a REG_SEQUENCE.
Patch contains changes by Matthew Devereau.
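As a rough illustration (an MIR-style sketch drawn from the comments added in
this patch; the virtual register numbers are illustrative only), the hint case
arises when the operands of the pseudo are copies of the same subregister
index of values in a StridedOrContiguous class:

  %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
  %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
  %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
  %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
  %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr

Here getRegAllocationHints can suggest a contiguous ZPR2Mul2 register whose
first subregister matches the physical register assigned to %3.zsub0. For the
X4 form, if the first strided load is assigned $z16_z20_z24_z28 and the
operands each access zsub2, the suggested tuple is $z24_z25_z26_z27.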
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
llvm/lib/Target/AArch64/AArch64RegisterInfo.h
llvm/lib/Target/AArch64/SMEInstrFormats.td
llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 055cb3cefcedf9..0ce4b8971625c1 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -67,6 +67,10 @@ class AArch64ExpandPseudo : public MachineFunctionPass {
TargetRegisterClass ContiguousClass,
TargetRegisterClass StridedClass,
unsigned ContiguousOpc, unsigned StridedOpc);
+ bool expandFormTuplePseudo(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned Size);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
@@ -1142,6 +1146,32 @@ bool AArch64ExpandPseudo::expandMultiVecPseudo(
return true;
}
+bool AArch64ExpandPseudo::expandFormTuplePseudo(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned Size) {
+ assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
+ MachineInstr &MI = *MBBI;
+ Register ReturnTuple = MI.getOperand(0).getReg();
+
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ for (unsigned I = 0; I < Size; ++I) {
+ Register FormTupleOpReg = MI.getOperand(I + 1).getReg();
+ Register ReturnTupleSubReg =
+ TRI->getSubReg(ReturnTuple, AArch64::zsub0 + I);
+ // Add copies to ensure the subregisters remain in the correct order
+ // for any contiguous operation they are used by.
+ if (FormTupleOpReg != ReturnTupleSubReg)
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ))
+ .addReg(ReturnTupleSubReg, RegState::Define)
+ .addReg(FormTupleOpReg)
+ .addReg(FormTupleOpReg);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1724,6 +1754,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
}
return false;
}
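When a FORM_TRANSPOSED_REG_TUPLE pseudo survives to this point, the expansion
above only emits moves for subregisters that did not end up in place. As a
hypothetical example (register assignments illustrative only): if the X2
pseudo's destination tuple is allocated $z0_z1 but its second operand was
assigned $z9 rather than $z1, a single

  $z1 = ORR_ZZZ $z9, $z9

is emitted (an SVE mov), while the first operand, already sitting in $z0,
needs no copy at all.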
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cee609ed1e2f6f..41e0214dab6c73 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8581,6 +8581,56 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
+// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
+// input operands are copy nodes where the source register is in a
+// StridedOrContiguous class. For example:
+//
+// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+//
+bool shouldUseFormStridedPseudo(MachineInstr &MI) {
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+
+ const TargetRegisterClass *RegClass = nullptr;
+ switch (MI.getOpcode()) {
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
+ RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
+ break;
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
+ RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode.");
+ }
+
+ MCRegister SubReg = MCRegister::NoRegister;
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
+
+ MachineOperand *Def = MRI.getOneDef(MO.getReg());
+ if (!Def || !Def->getParent()->isCopy())
+ return false;
+
+ const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
+ unsigned OpSubReg = CopySrc.getSubReg();
+ if (SubReg == MCRegister::NoRegister)
+ SubReg = OpSubReg;
+
+ MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
+ if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg ||
+ MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
+ return false;
+ }
+
+ return true;
+}
+
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8606,6 +8656,27 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
+ if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
+ // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
+ // from a StridedOrContiguous class, fall back on a REG_SEQUENCE node.
+ if (shouldUseFormStridedPseudo(MI))
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::REG_SEQUENCE),
+ MI.getOperand(0).getReg());
+
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MIB.add(MI.getOperand(I));
+ MIB.addImm(AArch64::zsub0 + (I - 1));
+ }
+
+ MI.eraseFromParent();
+ return;
+ }
+
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 133cc1344b98ff..85a7663993a046 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1081,6 +1081,58 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
+// where a consecutive multi-vector tuple is constructed from the same indices
+// of multiple strided loads. This may still result in unnecessary copies
+// between the loads and the tuple. Here we try to return a hint to assign the
+// contiguous ZPRMulReg starting at the same register as the first operand of
+// the pseudo, which should be a subregister of the first strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+ // should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
+//
+bool AArch64RegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+ if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
+ MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+
+ unsigned FirstOpSubReg = MI.getOperand(1).getSubReg();
+ switch (FirstOpSubReg) {
+ case AArch64::zsub0:
+ case AArch64::zsub1:
+ case AArch64::zsub2:
+ case AArch64::zsub3:
+ break;
+ default:
+ continue;
+ }
+
+ // Look up the physical register mapped to the first operand of the pseudo.
+ Register FirstOpVirtReg = MI.getOperand(1).getReg();
+ if (!VRM->hasPhys(FirstOpVirtReg))
+ continue;
+
+ MCRegister TupleStartReg =
+ getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg);
+ for (unsigned I = 0; I < Order.size(); ++I)
+ if (MCRegister R = getSubReg(Order[I], AArch64::zsub0))
+ if (R == TupleStartReg)
+ Hints.push_back(Order[I]);
+ }
+
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+}
+
unsigned AArch64RegisterInfo::getLocalAddressRegister(
const MachineFunction &MF) const {
const auto &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 5c8a5e029584fc..11da624af4881b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -134,6 +134,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index a67093b1a58c3d..b62ffcbebc652a 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -35,6 +35,30 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
let WantsRoot = true in
def am_sme_indexed_b4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>">;
+// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
+// improve register allocation for intrinsics which use strided and contiguous
+// multi-vector registers, avoiding unnecessary copies.
+// If the operands of the pseudo are copies where the source register is in
+// the StridedOrContiguous class, the pseudo is used to provide a hint to the
+ // register allocator suggesting a contiguous multi-vector register which
+// matches the subregister sequence used by the operands.
+// If the operands do not match this pattern, the pseudos are expanded
+// to a REG_SEQUENCE using the post-isel hook.
+
+def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
+ Pseudo<(outs ZPR2Mul2:$tup),
+ (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
+ let hasSideEffects = 0;
+ let hasPostISelHook = 1;
+}
+
+def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
+ Pseudo<(outs ZPR4Mul4:$tup),
+ (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
+ let hasSideEffects = 0;
+ let hasPostISelHook = 1;
+}
+
def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
[SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
@@ -173,14 +197,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
+ (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1, vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
zpr_ty:$Zm, imm_ty:$i)>;
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 1e835c92ba9e4c..ef569e480ea3d6 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
target triple="aarch64-linux-gnu"
@@ -26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -373,9 +361,7 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -519,11 +481,7 @@ define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -571,8 +523,8 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -585,11 +537,7 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: ret
@@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -620,8 +568,8 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -635,12 +583,86 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
+define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: udot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: udot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -654,8 +676,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -673,8 +695,8 @@ define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -688,8 +710,8 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -703,6 +725,79 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
+define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == Multi, indexed (signed) ==
@@ -710,8 +805,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -725,8 +820,8 @@ define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
@@ -744,8 +839,8 @@ define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -759,8 +854,8 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -774,12 +869,86 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
+define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -793,8 +962,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -814,8 +983,8 @@ define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -829,8 +998,8 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -844,11 +1013,84 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
+define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sudot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sudot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
-
; == Multi, multi (unsigned)
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index a0d8c18f55c3a0..49106e12378bea 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -1,15 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
-
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
; == FVDOT ==
define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -25,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half>
define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -43,9 +38,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloa
define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -58,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -75,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -89,15 +74,87 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
+define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: svdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
+define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: svdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == UVDOT ==
define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -110,11 +167,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -127,11 +180,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -141,17 +190,87 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
+define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: uvdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
+define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: uvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == SUVDOT ==
define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -161,17 +280,62 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
+define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: suvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == USVDOT ==
define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -181,6 +345,58 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
+define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2" }
+attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
; == FVDOT ==
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)