[llvm] [AArch64][SME2] Extend getRegAllocationHints for ZPRStridedOrContiguousReg (PR #119865)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 13 03:53:46 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Kerry McLaughlin (kmclaughlin-arm)
<details>
<summary>Changes</summary>
When a ZPR2StridedOrContiguous or ZPR4StridedOrContiguous load is used by a
FORM_TRANSPOSED_REG_TUPLE pseudo, register allocation should be hinted towards
a strided register to avoid unnecessary copies, even though the strided
registers overlap with the SVE callee-saved registers.
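
To sketch why the hint is enough (a simplified model of hint consumption, not the actual LLVM register allocator; `pickPhysReg` and its parameters below are hypothetical): hinted physical registers are tried before the class's default allocation order, so pushing the ZPR2Strided/ZPR4Strided allocation order into `Hints` lets a strided register win even though it overlaps Z8-Z23 and would otherwise sit at the back of the order.

```cpp
// Hypothetical, minimal model of how allocation hints interact with the
// default allocation order. This is not LLVM code; it only illustrates that
// a hinted register is preferred over the class's own ordering.
#include <cstdint>
#include <set>
#include <vector>

using MCPhysReg = uint16_t;

MCPhysReg pickPhysReg(const std::vector<MCPhysReg> &Hints,
                      const std::vector<MCPhysReg> &ClassOrder,
                      const std::set<MCPhysReg> &Free) {
  // Hinted registers (here: the ZPR2Strided/ZPR4Strided allocation order
  // pushed by getRegAllocationHints) are considered first.
  for (MCPhysReg R : Hints)
    if (Free.count(R))
      return R;
  // Otherwise fall back to the class's default order, where the strided
  // tuples sit at the back because they all overlap callee-saved registers.
  for (MCPhysReg R : ClassOrder)
    if (Free.count(R))
      return R;
  return 0; // no register available
}
```

With the strided registers hinted, the operands of the FORM_TRANSPOSED_REG_TUPLE pseudo land in registers that already form a valid tuple, which is why the `_svecc` tests below need no extra mov instructions, only the callee-saved spills and reloads.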
---
Patch is 65.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119865.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+48)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll (+424)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll (+358)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 85a7663993a046..6900c93f13a25a 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1098,6 +1098,54 @@ bool AArch64RegisterInfo::getRegAllocationHints(
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+
+  // Since the SVE calling convention preserves registers Z8-Z23, there are no
+  // ZPR2Strided or ZPR4Strided registers that do not overlap with the
+  // callee-saved registers, so these are pushed to the back of the allocation
+  // order for the ZPRStridedOrContiguous classes.
+  // However, if VirtReg is used by a FORM_TRANSPOSED_REG_TUPLE pseudo, it is
+  // likely better to try assigning a strided register anyway, as this avoids
+  // the extra copy instructions that would otherwise be needed to form the
+  // tuple required by the pseudo.
+
+ if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+ RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
+
+ if (!MF.getInfo<AArch64FunctionInfo>()->isSVECC())
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+
+ for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+      // Look through the uses of the register; if a FORM_TRANSPOSED_REG_TUPLE
+      // pseudo is found among them, set HintStrided.
+ bool HintStrided = false;
+ for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
+ unsigned UseOp = Use.getOpcode();
+ if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
+ HintStrided = true;
+ break;
+ }
+ }
+
+ if (!HintStrided)
+ continue;
+
+ // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
+ // allocate these first.
+      const TargetRegisterClass &StridedRC =
+ RegID == AArch64::ZPR2StridedOrContiguousRegClassID
+ ? AArch64::ZPR2StridedRegClass
+ : AArch64::ZPR4StridedRegClass;
+
+ for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF))
+ Hints.push_back(Reg);
+ }
+
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index ef569e480ea3d6..109b4bc750d3ff 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -607,6 +607,40 @@ entry:
ret void
}
+define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -657,6 +691,78 @@ entry:
ret void
}
+define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -749,6 +855,40 @@ entry:
ret void
}
+define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -799,6 +939,78 @@ entry:
ret void
}
+define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-9
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #9
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == Multi, indexed (signed) ==
define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
@@ -893,6 +1105,40 @@ entry:
ret void
}
+define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/119865
More information about the llvm-commits mailing list