[llvm] [AArch64][SME2] Extend getRegAllocationHints for ZPRStridedOrContiguousReg (PR #119865)
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 18 05:54:59 PST 2024
https://github.com/kmclaughlin-arm updated https://github.com/llvm/llvm-project/pull/119865
From 8f466f8d4850e015d79cffc65abbd423624795d9 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 13 Dec 2024 09:47:48 +0000
Subject: [PATCH 1/6] Add SME2 dot tests using the SVE calling convention
---
.../AArch64/sme2-intrinsics-int-dots.ll | 456 ++++++++++++++++++
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 390 +++++++++++++++
2 files changed, 846 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index ef569e480ea3d6..0d8ae5a71f1415 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -607,6 +607,40 @@ entry:
ret void
}
+define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -657,6 +691,86 @@ entry:
ret void
}
+define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: udot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -749,6 +863,40 @@ entry:
ret void
}
+define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -799,6 +947,86 @@ entry:
ret void
}
+define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == Multi, indexed (signed) ==
define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
@@ -893,6 +1121,40 @@ entry:
ret void
}
+define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -943,6 +1205,86 @@ entry:
ret void
}
+define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -1037,6 +1379,40 @@ entry:
ret void
}
+define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sudot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -1087,6 +1463,86 @@ entry:
ret void
}
+define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: sudot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index 49106e12378bea..f686c9b228d6f7 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -99,6 +99,41 @@ entry:
ret void
}
+define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: svdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9]
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -149,6 +184,86 @@ entry:
ret void
}
+define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: svdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == UVDOT ==
define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
@@ -215,6 +330,41 @@ entry:
ret void
}
+define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: uvdot_form_2x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9]
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0]
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
@@ -265,6 +415,86 @@ entry:
ret void
}
+define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: uvdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == SUVDOT ==
define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
@@ -330,6 +560,86 @@ entry:
ret void
}
+define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: suvdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
; == USVDOT ==
define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
@@ -395,6 +705,86 @@ entry:
ret void
}
+define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+; CHECK-LABEL: usvdot_form_4x_tuple_svecc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z8.d, z0.d
+; CHECK-NEXT: mov z9.d, z4.d
+; CHECK-NEXT: mov z10.d, z24.d
+; CHECK-NEXT: mov z11.d, z28.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z1.d
+; CHECK-NEXT: mov z9.d, z5.d
+; CHECK-NEXT: mov z10.d, z25.d
+; CHECK-NEXT: mov z11.d, z29.d
+; CHECK-NEXT: mov z1.d, z7.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z8.d, z2.d
+; CHECK-NEXT: mov z9.d, z6.d
+; CHECK-NEXT: mov z10.d, z26.d
+; CHECK-NEXT: mov z11.d, z30.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
>From 53e62428141c0091384617d80d10f7b03c54f822 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 13 Dec 2024 10:08:50 +0000
Subject: [PATCH 2/6] [AArch64][SME2] Extend getRegAllocationHints for
ZPRStridedOrContiguousReg
When a load defining a ZPR2StridedOrContiguous or ZPR4StridedOrContiguous
register is used by a FORM_TRANSPOSED_REG_TUPLE pseudo, the register allocator
should try to assign a strided register to avoid unnecessary copies, even
though the strided registers overlap with the SVE callee-saved registers.
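
In outline, the hook does roughly the following. This is a condensed sketch,
not the patch verbatim: it assumes the enclosing
AArch64RegisterInfo::getRegAllocationHints context (MRI, VirtReg, Order,
Hints, MF, VRM) and omits the isSVECC() check; the exact change is in the
diff below.

  // For a ZPR2/ZPR4StridedOrContiguous virtual register that feeds a
  // FORM_TRANSPOSED_REG_TUPLE pseudo, hint the strided registers first,
  // then fall back to the default hints.
  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
  if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
      RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
    bool HintStrided = false;
    for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
      unsigned UseOp = Use.getOpcode();
      if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
          UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
        HintStrided = true;
        break;
      }
    }
    if (HintStrided) {
      const TargetRegisterClass *StridedRC =
          RegID == AArch64::ZPR2StridedOrContiguousRegClassID
              ? &AArch64::ZPR2StridedRegClass
              : &AArch64::ZPR4StridedRegClass;
      // These overlap the SVE callee-saved registers, but avoiding the
      // copies into the tuple is usually the better trade-off.
      for (MCPhysReg Reg : StridedRC->getRawAllocationOrder(MF))
        Hints.push_back(Reg);
    }
  }
  return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                   VRM);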
---
.../Target/AArch64/AArch64RegisterInfo.cpp | 48 +++
.../AArch64/sme2-intrinsics-int-dots.ll | 304 ++++++++----------
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 268 +++++++--------
3 files changed, 302 insertions(+), 318 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 85a7663993a046..6900c93f13a25a 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1098,6 +1098,54 @@ bool AArch64RegisterInfo::getRegAllocationHints(
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+
+ // Since the SVE calling convention preserves registers Z8-Z23, there are no
+ // ZPR2Strided or ZPR4Strided registers which do not overlap with the
+ // callee-saved registers. These will be pushed to the back of the allocation
+ // order for the ZPRStridedOrContiguous classes.
+ // However, if any of the instructions which define VirtReg are
+ // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
+ // pseudo, it will likely be better to try assigning a strided register
+ // anyway to avoid extra copy instructions.
+
+ if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+ RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
+
+ if (!MF.getInfo<AArch64FunctionInfo>()->isSVECC())
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+
+ for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+ // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
+ // pseudo is found in the uses, set HintStrided.
+ bool HintStrided = false;
+ for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
+ unsigned UseOp = Use.getOpcode();
+ if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
+ HintStrided = true;
+ break;
+ }
+ }
+
+ if (!HintStrided)
+ continue;
+
+ // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
+ // allocate these first.
+ TargetRegisterClass StridedRC =
+ RegID == AArch64::ZPR2StridedOrContiguousRegClassID
+ ? AArch64::ZPR2StridedRegClass
+ : AArch64::ZPR4StridedRegClass;
+
+ for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF))
+ Hints.push_back(Reg);
+ }
+
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 0d8ae5a71f1415..109b4bc750d3ff 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -611,20 +611,20 @@ define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: udot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -695,46 +695,38 @@ define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: udot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -867,20 +859,20 @@ define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: usdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -951,46 +943,38 @@ define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: usdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -1125,20 +1109,20 @@ define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: sdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -1209,46 +1193,38 @@ define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: sdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -1383,20 +1359,20 @@ define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: sudot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -1467,46 +1443,38 @@ define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: sudot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index f686c9b228d6f7..016f5f55c825b2 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -103,21 +103,21 @@ define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: svdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
-; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
-; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9]
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -188,46 +188,38 @@ define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: svdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -334,21 +326,21 @@ define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: uvdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
-; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0]
-; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x9]
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x9]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z8.h, z9.h }, z0.h[0]
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -419,46 +411,38 @@ define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8>
; CHECK-LABEL: uvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -564,46 +548,38 @@ define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8
; CHECK-LABEL: suvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -709,46 +685,38 @@ define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8
; CHECK-LABEL: usvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
-; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
-; CHECK-NEXT: str z9, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z24.b - z27.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: ld1b { z28.b - z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: mov z9.d, z4.d
-; CHECK-NEXT: mov z10.d, z24.d
-; CHECK-NEXT: mov z11.d, z28.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z1.d
-; CHECK-NEXT: mov z9.d, z5.d
-; CHECK-NEXT: mov z10.d, z25.d
-; CHECK-NEXT: mov z11.d, z29.d
-; CHECK-NEXT: mov z1.d, z7.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z8.d, z2.d
-; CHECK-NEXT: mov z9.d, z6.d
-; CHECK-NEXT: mov z10.d, z26.d
-; CHECK-NEXT: mov z11.d, z30.d
-; CHECK-NEXT: mov z2.d, z27.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: mov z3.d, z31.d
+; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
>From b49fa70f7ee8a13b02c9ea84988ef5ad1104d86d Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 17 Dec 2024 13:32:43 +0000
Subject: [PATCH 3/6] - Replaced HintStrided with any_of - Removed loop over
VirtReg def instructions - Added registers from Order to Hints if they are
contained in StridedRC
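
The review round collapses the hint logic to a single query, roughly as
sketched below. This assumes the enclosing getRegAllocationHints context
introduced in PATCH 2 (MRI, VirtReg, RegID, Order, Hints, MF, VRM); the full
diff follows.

  // One any_of query over the non-debug uses replaces the HintStrided flag
  // and the def_instructions loop, and the hints are filtered from Order
  // instead of the raw allocation order.
  if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) {
        return Use.getOpcode() ==
                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
               Use.getOpcode() ==
                   AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
      })) {
    const TargetRegisterClass *StridedRC =
        RegID == AArch64::ZPR2StridedOrContiguousRegClassID
            ? &AArch64::ZPR2StridedRegClass
            : &AArch64::ZPR4StridedRegClass;
    for (MCPhysReg Reg : Order)
      if (StridedRC->contains(Reg))
        Hints.push_back(Reg);
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
                                                     MF, VRM);
  }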
---
.../Target/AArch64/AArch64RegisterInfo.cpp | 45 +++++++------------
1 file changed, 17 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 6900c93f13a25a..a806d53db806e2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1108,43 +1108,32 @@ bool AArch64RegisterInfo::getRegAllocationHints(
// ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
// pseudo, it will likely be better to try assigning a strided register
// anyway to avoid extra copy instructions.
-
if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
- if (!MF.getInfo<AArch64FunctionInfo>()->isSVECC())
- return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
- MF, VRM);
-
- for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
- // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
- // pseudo is found in the uses, set HintStrided.
- bool HintStrided = false;
- for (MachineInstr &Use : MRI.use_nodbg_instructions(VirtReg)) {
- unsigned UseOp = Use.getOpcode();
- if (UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- UseOp == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
- HintStrided = true;
- break;
- }
- }
-
- if (!HintStrided)
- continue;
+ // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
+ // pseudo is found in the uses, set HintStrided.
+ if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) {
+ return Use.getOpcode() ==
+ AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ Use.getOpcode() ==
+ AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+ })) {
// Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
// allocate these first.
- TargetRegisterClass StridedRC =
+ const TargetRegisterClass *StridedRC =
RegID == AArch64::ZPR2StridedOrContiguousRegClassID
- ? AArch64::ZPR2StridedRegClass
- : AArch64::ZPR4StridedRegClass;
+ ? &AArch64::ZPR2StridedRegClass
+ : &AArch64::ZPR4StridedRegClass;
- for (MCPhysReg Reg : StridedRC.getRawAllocationOrder(MF))
- Hints.push_back(Reg);
- }
+ for (MCPhysReg Reg : Order)
+ if (StridedRC->contains(Reg))
+ Hints.push_back(Reg);
- return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
- VRM);
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
+ }
}
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
>From 1ac19922e6192f7f5023217851694e34439b909e Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 17 Dec 2024 15:03:59 +0000
Subject: [PATCH 4/6] - Replace uses of undef with poison in sme2 dot intrinsic
tests
---
.../AArch64/sme2-intrinsics-int-dots.ll | 96 +++++++++----------
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 80 ++++++++--------
2 files changed, 88 insertions(+), 88 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 109b4bc750d3ff..86ed63d743713c 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -602,8 +602,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -636,8 +636,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -684,10 +684,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -756,10 +756,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -850,8 +850,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -884,8 +884,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -932,10 +932,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1004,10 +1004,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1100,8 +1100,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1134,8 +1134,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1182,10 +1182,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1254,10 +1254,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1350,8 +1350,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1384,8 +1384,8 @@ entry:
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1432,10 +1432,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -1504,10 +1504,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index 016f5f55c825b2..e7d1050b607992 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -94,8 +94,8 @@ entry:
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}
@@ -129,8 +129,8 @@ entry:
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}
@@ -177,10 +177,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -249,10 +249,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -317,8 +317,8 @@ entry:
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}
@@ -352,8 +352,8 @@ entry:
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}
@@ -400,10 +400,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -472,10 +472,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -537,10 +537,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -609,10 +609,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -674,10 +674,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
@@ -746,10 +746,10 @@ entry:
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
- tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}
>From 70b53175ed406b6d8582deef0b74802a15fad752 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 18 Dec 2024 11:30:12 +0000
Subject: [PATCH 5/6] - Combine if conditions in getRegAllocationHints
---
.../Target/AArch64/AArch64RegisterInfo.cpp | 49 +++++++++----------
1 file changed, 22 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index a806d53db806e2..8fcf79cecb58d2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1098,7 +1098,6 @@ bool AArch64RegisterInfo::getRegAllocationHints(
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned RegID = MRI.getRegClass(VirtReg)->getID();
// Since the SVE calling convention preserves registers Z8-Z23, there are no
// ZPR2Strided or ZPR4Strided registers which do not overlap with the
@@ -1108,32 +1107,28 @@ bool AArch64RegisterInfo::getRegAllocationHints(
// ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
// pseudo, it will likely be better to try assigning a strided register
// anyway to avoid extra copy instructions.
- if (RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
- RegID == AArch64::ZPR4StridedOrContiguousRegClassID) {
-
- // Look through uses of the register and if the FORM_TRANSPOSED_REG_TUPLE
- // pseudo is found in the uses, set HintStrided.
- if (any_of(MRI.use_nodbg_instructions(VirtReg), [](MachineInstr &Use) {
- return Use.getOpcode() ==
- AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
- Use.getOpcode() ==
- AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
- })) {
-
- // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
- // allocate these first.
- const TargetRegisterClass *StridedRC =
- RegID == AArch64::ZPR2StridedOrContiguousRegClassID
- ? &AArch64::ZPR2StridedRegClass
- : &AArch64::ZPR4StridedRegClass;
-
- for (MCPhysReg Reg : Order)
- if (StridedRC->contains(Reg))
- Hints.push_back(Reg);
-
- return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
- MF, VRM);
- }
+ unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+ // Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE.
+ if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
+ RegID == AArch64::ZPR4StridedOrContiguousRegClassID) &&
+ any_of(MRI.use_nodbg_instructions(VirtReg), [](const MachineInstr &Use) {
+ return Use.getOpcode() ==
+ AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
+ })) {
+ // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
+ // allocate these first.
+ const TargetRegisterClass *StridedRC =
+ RegID == AArch64::ZPR2StridedOrContiguousRegClassID
+ ? &AArch64::ZPR2StridedRegClass
+ : &AArch64::ZPR4StridedRegClass;
+
+ for (MCPhysReg Reg : Order)
+ if (StridedRC->contains(Reg))
+ Hints.push_back(Reg);
+
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
}
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
>From 52f2bd3b3b09bbfce2855233b803fd33e8b200a4 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 18 Dec 2024 13:38:12 +0000
Subject: [PATCH 6/6] - Reword comments in getRegAllocationHints
---
.../lib/Target/AArch64/AArch64RegisterInfo.cpp | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 8fcf79cecb58d2..5973b63b5a8024 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1099,14 +1099,14 @@ bool AArch64RegisterInfo::getRegAllocationHints(
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- // Since the SVE calling convention preserves registers Z8-Z23, there are no
- // ZPR2Strided or ZPR4Strided registers which do not overlap with the
- // callee-saved registers. These will be pushed to the back of the allocation
- // order for the ZPRStridedOrContiguous classes.
- // However, if any of the instructions which define VirtReg are
- // ZPRStridedOrContiguous registers used by a FORM_TRANSPOSED_REG_TUPLE
- // pseudo, it will likely be better to try assigning a strided register
- // anyway to avoid extra copy instructions.
+ // The SVE calling convention preserves registers Z8-Z23. As a result, there
+ // are no ZPR2Strided or ZPR4Strided registers that do not overlap with the
+ // callee-saved registers and so by default these will be pushed to the back
+ // of the allocation order for the ZPRStridedOrContiguous classes.
+ // If any of the instructions which define VirtReg are used by the
+ // FORM_TRANSPOSED_REG_TUPLE pseudo, we want to favour reducing copy
+ // instructions over reducing the number of clobbered callee-save registers,
+ // so we add the strided registers as a hint.
unsigned RegID = MRI.getRegClass(VirtReg)->getID();
// Look through uses of the register for FORM_TRANSPOSED_REG_TUPLE.
if ((RegID == AArch64::ZPR2StridedOrContiguousRegClassID ||
@@ -1116,8 +1116,6 @@ bool AArch64RegisterInfo::getRegAllocationHints(
AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
Use.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO;
})) {
- // Push the list of 2/4 ZPRStrided registers to Hints to ensure we try to
- // allocate these first.
const TargetRegisterClass *StridedRC =
RegID == AArch64::ZPR2StridedOrContiguousRegClassID
? &AArch64::ZPR2StridedRegClass