[llvm] [AArch64][SME2] Add FORM_STRIDED_TUPLE pseudo nodes (PR #116399)
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 06:25:04 PST 2024
https://github.com/kmclaughlin-arm updated https://github.com/llvm/llvm-project/pull/116399
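All of the new tests added by this patch exercise the same shape: a strided multi-vector load via @llvm.aarch64.sve.ld1.pn.x2/.x4, split with extractvalue, feeding the lane-indexed SME2 dot intrinsics so that each dot consumes the same-index result of every load. A condensed two-vector sketch of that pattern (the function and value names here are illustrative; the intrinsic signatures are taken from the tests in the patch below):

declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

define void @form_2x_tuple_sketch(ptr %ptr, i64 %stride) nounwind "target-features"="+sme2" {
entry:
  %pn = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
  ; Two strided pair loads: one at [%ptr], one at [%ptr + %stride].
  %ld0 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
  %a0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld0, 0
  %a1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld0, 1
  %addr1 = getelementptr inbounds i8, ptr %ptr, i64 %stride
  %ld1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %addr1)
  %b0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld1, 0
  %b1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %ld1, 1
  ; Each dot takes the i-th result of both loads as its multi-vector operand.
  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %a0, <vscale x 16 x i8> %b0, <vscale x 16 x i8> undef, i32 0)
  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %a1, <vscale x 16 x i8> %b1, <vscale x 16 x i8> undef, i32 0)
  ret void
}

The long runs of "mov z*.d" copies in the CHECK lines below are the cost of gathering the strided load results into consecutive register tuples for the dot instructions; the FORM_STRIDED_TUPLE pseudos this PR introduces appear to be aimed at exactly that pattern.
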
From fc0d2246b9d83ed00fd1d3a8661464a9f6de615c Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 15 Nov 2024 11:20:32 +0000
Subject: [PATCH 01/10] [AArch64][SME2] Add tests for strided load & contiguous
dot instructions
---
.../AArch64/sme2-intrinsics-int-dots.ll | 400 +++++++++++++++++-
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 348 ++++++++++++++-
2 files changed, 746 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 1e835c92ba9e4c..8a7406401e79a4 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -635,6 +635,106 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
+define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: udot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: udot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -703,6 +803,105 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
+define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == Multi, indexed (signed) ==
@@ -774,6 +973,106 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
ret void
}
+define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
@@ -844,11 +1143,110 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
ret void
}
+define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sudot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
+ %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: sudot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
-
; == Multi, multi (unsigned)
declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index a0d8c18f55c3a0..ea786a61b15f54 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
-
; == FVDOT ==
define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
@@ -89,6 +88,106 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
+define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: svdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x9]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
+define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: svdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == UVDOT ==
@@ -141,6 +240,106 @@ define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
ret void
}
+define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: uvdot_form_2x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: add x9, x0, x1
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
+; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x9]
+; CHECK-NEXT: mov z2.d, z16.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
+ %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
+ %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> undef, i32 0)
+ ret void
+}
+
+define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: uvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == SUVDOT ==
@@ -161,6 +360,78 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
+define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: suvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
; == USVDOT ==
@@ -181,6 +452,81 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
ret void
}
+define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
+; CHECK-LABEL: usvdot_form_4x_tuple:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
+; CHECK-NEXT: lsl x9, x1, #1
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: add x9, x9, x1
+; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z0.d, z17.d
+; CHECK-NEXT: mov z4.d, z21.d
+; CHECK-NEXT: mov z12.d, z25.d
+; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: mov z5.d, z20.d
+; CHECK-NEXT: mov z13.d, z24.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z6.d, z22.d
+; CHECK-NEXT: mov z14.d, z26.d
+; CHECK-NEXT: mov z3.d, z8.d
+; CHECK-NEXT: mov z7.d, z9.d
+; CHECK-NEXT: mov z15.d, z10.d
+; CHECK-NEXT: mov z8.d, z29.d
+; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z10.d, z30.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> undef, i32 0)
+ tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> undef, i32 0)
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2" }
+attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
; == FVDOT ==
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
From 6992da2c00cfc9d0652a1f3e3f0cc6a96c883564 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 15 Nov 2024 12:00:43 +0000
Subject: [PATCH 02/10] Enable subreg liveness in SME2 dot tests.
---
.../AArch64/sme2-intrinsics-int-dots.ll | 360 +++++++-----------
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 198 ++++------
2 files changed, 217 insertions(+), 341 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 8a7406401e79a4..7b35378c63fa9f 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
target triple="aarch64-linux-gnu"
@@ -26,18 +26,18 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -68,18 +68,18 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -110,18 +110,18 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -152,18 +152,18 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -197,18 +197,18 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -239,18 +239,18 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
-; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
+; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
; CHECK-NEXT: ret
<vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
@@ -281,18 +281,18 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z26.d, z7.d
-; CHECK-NEXT: mov z31.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z26.d, z7.d
; CHECK-NEXT: mov z25.d, z6.d
-; CHECK-NEXT: mov z30.d, z3.d
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z24.d, z5.d
-; CHECK-NEXT: mov z29.d, z2.d
; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1]
-; CHECK-NEXT: mov z28.d, z1.d
-; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
-; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
+; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
; CHECK-NEXT: ret
<vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
@@ -309,9 +309,7 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -324,11 +322,7 @@ define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -341,9 +335,7 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -356,11 +348,7 @@ define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: udot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -373,9 +361,7 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -388,11 +374,7 @@ define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: udot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -405,9 +387,7 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -420,11 +400,7 @@ define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -440,9 +416,7 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -455,11 +429,7 @@ define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -472,9 +442,7 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -487,11 +455,7 @@ define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -504,9 +468,7 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT: ret
@@ -519,11 +481,7 @@ define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT: ret
@@ -536,9 +494,7 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
; CHECK-NEXT: ret
@@ -551,11 +507,7 @@ define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
; CHECK-NEXT: ret
@@ -571,8 +523,8 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -585,11 +537,7 @@ define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
; CHECK-NEXT: ret
@@ -605,8 +553,8 @@ define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -620,8 +568,8 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -665,41 +613,36 @@ entry:
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -739,8 +682,8 @@ define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -754,8 +697,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -773,8 +716,8 @@ define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -788,8 +731,8 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -833,41 +776,36 @@ entry:
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -909,8 +847,8 @@ define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT: ret
@@ -924,8 +862,8 @@ define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
@@ -943,8 +881,8 @@ define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -958,8 +896,8 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -1003,41 +941,36 @@ entry:
define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1077,8 +1010,8 @@ define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
; CHECK-NEXT: ret
@@ -1092,8 +1025,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
@@ -1113,8 +1046,8 @@ define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z5.d, z2.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
; CHECK-NEXT: ret
@@ -1128,8 +1061,8 @@ define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z27.d, z4.d
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z26.d, z3.d
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov z25.d, z2.d
; CHECK-NEXT: mov z24.d, z1.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
@@ -1173,41 +1106,36 @@ entry:
define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index ea786a61b15f54..cf9311db99225b 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -1,14 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
; == FVDOT ==
define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -24,9 +22,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half>
define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -42,9 +38,7 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloa
define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -57,11 +51,7 @@ define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -74,11 +64,7 @@ define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -119,41 +105,36 @@ entry:
define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -194,9 +175,7 @@ entry:
define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
@@ -209,11 +188,7 @@ define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %
define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -226,11 +201,7 @@ define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %
define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
@@ -271,41 +242,36 @@ entry:
define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -346,11 +312,7 @@ entry:
define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -363,41 +325,36 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -438,11 +395,7 @@ entry:
define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
@@ -455,41 +408,36 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: ld1b { z8.b - z11.b }, pn8/z, [x0, x9]
; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z12.d, z25.d
; CHECK-NEXT: mov z1.d, z16.d
+; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: mov z4.d, z21.d
; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z13.d, z24.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z6.d, z22.d
-; CHECK-NEXT: mov z14.d, z26.d
-; CHECK-NEXT: mov z3.d, z8.d
-; CHECK-NEXT: mov z7.d, z9.d
-; CHECK-NEXT: mov z15.d, z10.d
-; CHECK-NEXT: mov z8.d, z29.d
-; CHECK-NEXT: mov z9.d, z28.d
+; CHECK-NEXT: mov z8.d, z25.d
+; CHECK-NEXT: mov z9.d, z24.d
+; CHECK-NEXT: mov z3.d, z16.d
+; CHECK-NEXT: mov z7.d, z17.d
+; CHECK-NEXT: mov z11.d, z18.d
+; CHECK-NEXT: mov z16.d, z29.d
+; CHECK-NEXT: mov z17.d, z28.d
+; CHECK-NEXT: mov z18.d, z14.d
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z10.d, z30.d
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
>From 0d4c931c3d5047878b9f82e48527a312acf5f4e3 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 15 Nov 2024 13:46:19 +0000
Subject: [PATCH 03/10] [AArch64][SME2] Add FORM_STRIDED_TUPLE pseudo nodes
This patch adds pseudo nodes to help improve register allocation of
multi-vector SME intrinsics.
The FORM_STRIDED_TUPLE node is emitted if each operand of a contiguous
multi-vector dot intrinsic is the result of a strided multi-vector
load. The operands of the pseudo are the subregisters at the same
index from each of these strided loads.
Follow-up patches will use this pseudo when adding register allocation
hints to remove unnecessary register copies in this scenario.
Subregister liveness is also required to achieve this and has been
enabled in the tests changed by this patch.
Patch contains changes by Matthew Devereau.
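To make the intent concrete, here is a minimal standalone sketch (plain
C++, not code from this patch; all names are illustrative): each strided
load defines one subregister per tuple, and the tuple consumed by the dot
instruction is built from the subregister at the same index of every
load, matching the z16/z24 and z17/z25 pairs in the updated x2 tests
below.

// Standalone illustration only: registers are plain strings, not
// MCRegisters, and nothing here touches LLVM data structures.
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Two strided x2 loads, as in udot_form_2x_tuple:
  // the first defines { z16, z24 }, the second { z17, z25 }.
  std::vector<std::vector<std::string>> StridedLoads = {{"z16", "z24"},
                                                        {"z17", "z25"}};

  // Each dot instruction consumes the subregister at index K from every
  // load; that list is exactly the operand list of FORM_STRIDED_TUPLE_X2.
  for (unsigned K = 0; K < 2; ++K) {
    std::printf("FORM_STRIDED_TUPLE_X2 operands for index %u:", K);
    for (const auto &Load : StridedLoads)
      std::printf(" %s", Load[K].c_str());
    std::printf("\n"); // index 0 -> z16 z17, index 1 -> z24 z25
  }
  return 0;
}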
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 32 +++
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 27 ++
.../Target/AArch64/AArch64ISelLowering.cpp | 63 +++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 11 +
llvm/lib/Target/AArch64/SMEInstrFormats.td | 12 +
.../AArch64/sme2-intrinsics-int-dots.ll | 244 +++++++++---------
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 226 ++++++++--------
8 files changed, 378 insertions(+), 240 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 055cb3cefcedf9..dabcaaf9f5c874 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -67,6 +67,10 @@ class AArch64ExpandPseudo : public MachineFunctionPass {
TargetRegisterClass ContiguousClass,
TargetRegisterClass StridedClass,
unsigned ContiguousOpc, unsigned StridedOpc);
+ bool expandFormTuplePseudo(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI,
+ unsigned Size);
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
@@ -1142,6 +1146,30 @@ bool AArch64ExpandPseudo::expandMultiVecPseudo(
return true;
}
+bool AArch64ExpandPseudo::expandFormTuplePseudo(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI, unsigned Size) {
+  assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
+ MachineInstr &MI = *MBBI;
+ Register ReturnTuple = MI.getOperand(0).getReg();
+
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ for (unsigned i = 0; i < Size; i++) {
+ Register FormTupleOpReg = MI.getOperand(i + 1).getReg();
+ Register ReturnTupleSubReg =
+ TRI->getSubReg(ReturnTuple, AArch64::zsub0 + i);
+ if (FormTupleOpReg != ReturnTupleSubReg)
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ))
+ .addReg(ReturnTupleSubReg, RegState::Define)
+ .addReg(FormTupleOpReg)
+ .addReg(FormTupleOpReg);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1724,6 +1752,10 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
+ case AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
+ case AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1969c830f4d312..d46bae07b3d4c5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -504,6 +504,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectAllActivePredicate(SDValue N);
bool SelectAnyPredicate(SDValue N);
+
+ void SelectFormTuplePseudo(SDNode *N, unsigned Size);
};
class AArch64DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
@@ -7181,6 +7183,14 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case AArch64ISD::FORM_STRIDED_TUPLE_X2: {
+ SelectFormTuplePseudo(Node, 2);
+ return;
+ }
+ case AArch64ISD::FORM_STRIDED_TUPLE_X4: {
+ SelectFormTuplePseudo(Node, 4);
+ return;
+ }
}
// Select the default instruction
@@ -7438,3 +7448,20 @@ bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
return true;
}
+
+void AArch64DAGToDAGISel::SelectFormTuplePseudo(SDNode *Node, unsigned Size) {
+ assert((Size == 2 || Size == 4) && "Invalid Tuple size");
+ EVT VT = Node->getValueType(0);
+ SmallVector<SDValue> Ops;
+ for (unsigned I = 0; I < Size; I++)
+ Ops.push_back(Node->getOperand(I));
+ SDLoc DL(Node);
+ unsigned Opc = Size == 2 ? AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO
+ : AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO;
+ SDNode *Tuple = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+ SDValue SuperReg = SDValue(Tuple, 0);
+ for (unsigned I = 0; I < Size; ++I)
+ ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + I, DL, VT, SuperReg));
+ CurDAG->RemoveDeadNode(Node);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9d1c3d4eddc880..b8c87b0ec2ea5f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2808,6 +2808,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FMUL_PRED)
MAKE_CASE(AArch64ISD::FSUB_PRED)
MAKE_CASE(AArch64ISD::RDSVL)
+ MAKE_CASE(AArch64ISD::FORM_STRIDED_TUPLE_X2)
+ MAKE_CASE(AArch64ISD::FORM_STRIDED_TUPLE_X4)
MAKE_CASE(AArch64ISD::BIC)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
@@ -5709,6 +5711,46 @@ SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
Mask);
}
+static unsigned getIntrinsicID(const SDNode *N);
+
+SDValue TryLowerMultiVecSMEDotIntrinsic(SDValue Op, SelectionDAG &DAG,
+ unsigned Size) {
+ assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
+ auto IsStridedLoad = [Size](SDValue Op) -> bool {
+ unsigned Intrinsic = getIntrinsicID(Op.getNode());
+ if (Size == 2)
+ return Intrinsic == Intrinsic::aarch64_sve_ld1_pn_x2;
+ else
+ return Intrinsic == Intrinsic::aarch64_sve_ld1_pn_x4;
+ };
+
+ SmallVector<SDValue> Ops;
+ unsigned LastLoadIdx = Size == 2 ? 5 : 7;
+ unsigned LoadResNo = Op.getOperand(3).getResNo();
+ for (unsigned I = 3; I < LastLoadIdx; I++) {
+ if (!IsStridedLoad(Op->getOperand(I)) ||
+ Op.getOperand(I).getResNo() != LoadResNo)
+ return SDValue();
+ Ops.push_back(Op->getOperand(I));
+ }
+
+ EVT VT = Op->getOperand(3).getValueType();
+ SDVTList VTList =
+ Size == 2 ? DAG.getVTList(VT, VT) : DAG.getVTList(VT, VT, VT, VT);
+ unsigned Opc = Size == 2 ? AArch64ISD::FORM_STRIDED_TUPLE_X2
+ : AArch64ISD::FORM_STRIDED_TUPLE_X4;
+ SDLoc DL(Op);
+ SDValue Pseudo = DAG.getNode(Opc, DL, VTList, Ops);
+
+ SmallVector<SDValue> DotOps = {Op.getOperand(0), Op->getOperand(1),
+ Op->getOperand(2)};
+ for (unsigned I = 0; I < Size; I++)
+ DotOps.push_back(Pseudo.getValue(I));
+ DotOps.push_back(Op->getOperand(DotOps.size()));
+ DotOps.push_back(Op->getOperand(DotOps.size()));
+ return DAG.getNode(Op->getOpcode(), DL, MVT::Other, DotOps);
+}
+
// Lower an SME LDR/STR ZA intrinsic
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
// folded into the instruction
@@ -5898,6 +5940,22 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getOperand(0), // Chain
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
+ case Intrinsic::aarch64_sme_uvdot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_suvdot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_usvdot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_svdot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_usdot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_udot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_sudot_lane_za32_vg1x4:
+ case Intrinsic::aarch64_sme_sdot_lane_za32_vg1x4:
+ return TryLowerMultiVecSMEDotIntrinsic(Op, DAG, 4);
+ case Intrinsic::aarch64_sme_uvdot_lane_za32_vg1x2:
+ case Intrinsic::aarch64_sme_sdot_lane_za32_vg1x2:
+ case Intrinsic::aarch64_sme_svdot_lane_za32_vg1x2:
+ case Intrinsic::aarch64_sme_usdot_lane_za32_vg1x2:
+ case Intrinsic::aarch64_sme_sudot_lane_za32_vg1x2:
+ case Intrinsic::aarch64_sme_udot_lane_za32_vg1x2:
+ return TryLowerMultiVecSMEDotIntrinsic(Op, DAG, 2);
}
}
@@ -7639,6 +7697,11 @@ static unsigned getIntrinsicID(const SDNode *N) {
return IID;
return Intrinsic::not_intrinsic;
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IID = N->getConstantOperandVal(1);
+    return IID < Intrinsic::num_intrinsics ? IID
+                                           : Intrinsic::not_intrinsic;
+ }
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d11da64d3f84eb..c7a70ab9f3c898 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -478,6 +478,9 @@ enum NodeType : unsigned {
SME_ZA_LDR,
SME_ZA_STR,
+ FORM_STRIDED_TUPLE_X2,
+ FORM_STRIDED_TUPLE_X4,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index a6ba6ddc30b277..5fb44fe5146d3c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -28,6 +28,17 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2,
def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
+def SDT_FORM_STRIDED_TUPLE_X2 : SDTypeProfile<4, 4,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+
+def SDT_FORM_STRIDED_TUPLE_X4 : SDTypeProfile<4, 4,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>,
+ SDTCisSameAs<0, 6>, SDTCisSameAs<0, 7>]>;
+
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 8c256b5818ee88..41508bce651c6b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -34,6 +34,18 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>;
+def FORM_STRIDED_TUPLE_X2_PSEUDO :
+ Pseudo<(outs ZPR2Mul2:$tup),
+ (ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
+ let hasSideEffects = 0;
+}
+
+def FORM_STRIDED_TUPLE_X4_PSEUDO :
+ Pseudo<(outs ZPR4Mul4:$tup),
+ (ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
+ let hasSideEffects = 0;
+}
+
def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
[SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index 7b35378c63fa9f..eddff238ace031 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -589,11 +589,12 @@ define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
@@ -613,36 +614,34 @@ entry:
define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: udot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -752,11 +751,12 @@ define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
@@ -776,36 +776,34 @@ entry:
define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -917,11 +915,12 @@ define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
@@ -941,36 +940,34 @@ entry:
define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1082,11 +1079,12 @@ define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z2.b, z3.b }, z0.b[0]
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
@@ -1106,36 +1104,34 @@ entry:
define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: sudot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index cf9311db99225b..dec2dfb6f687ca 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -81,11 +81,12 @@ define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
-; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
@@ -105,36 +106,34 @@ entry:
define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -218,11 +217,12 @@ define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
-; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z2.d, z16.d
-; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
+; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
@@ -242,36 +242,34 @@ entry:
define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -325,36 +323,34 @@ define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -408,36 +404,34 @@ define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8>
define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str d14, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: lsl x9, x1, #1
-; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
-; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0]
-; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: add x9, x9, x1
-; CHECK-NEXT: mov z0.d, z17.d
-; CHECK-NEXT: mov z1.d, z16.d
-; CHECK-NEXT: ld1b { z16.b - z19.b }, pn8/z, [x0, x9]
-; CHECK-NEXT: mov z4.d, z21.d
-; CHECK-NEXT: mov z5.d, z20.d
-; CHECK-NEXT: mov z8.d, z25.d
-; CHECK-NEXT: mov z9.d, z24.d
-; CHECK-NEXT: mov z3.d, z16.d
-; CHECK-NEXT: mov z7.d, z17.d
-; CHECK-NEXT: mov z11.d, z18.d
-; CHECK-NEXT: mov z16.d, z29.d
-; CHECK-NEXT: mov z17.d, z28.d
-; CHECK-NEXT: mov z18.d, z14.d
+; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
+; CHECK-NEXT: add x10, x9, x1
+; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
+; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
+; CHECK-NEXT: mov z0.d, z16.d
+; CHECK-NEXT: mov z1.d, z17.d
+; CHECK-NEXT: mov z2.d, z18.d
+; CHECK-NEXT: mov z3.d, z19.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z20.d
+; CHECK-NEXT: mov z1.d, z21.d
+; CHECK-NEXT: mov z2.d, z22.d
+; CHECK-NEXT: mov z3.d, z23.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z24.d
+; CHECK-NEXT: mov z1.d, z25.d
+; CHECK-NEXT: mov z2.d, z26.d
+; CHECK-NEXT: mov z3.d, z27.d
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: mov z0.d, z28.d
+; CHECK-NEXT: mov z1.d, z29.d
+; CHECK-NEXT: mov z2.d, z30.d
+; CHECK-NEXT: mov z3.d, z31.d
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
-; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d14, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
>From 7188a2da5f93eefb272e6fcc6f64e4a18f4e5fca Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 19 Nov 2024 14:14:01 +0000
Subject: [PATCH 04/10] - Removed the FORM_STRIDED_TUPLE_X# nodes, leaving only
the pseudo nodes. - Changed the tablegen patterns used by the dot intrinsics
to always output the FORM_STRIDED_TUPLE_X#_PSEUDO nodes. - Checked that the
operands to the pseudo are copies from a StridedOrContiguous register class
in AdjustInstrPostInstrSelection, falling back on creating a REG_SEQUENCE
node if not.
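As a rough standalone sketch of that fallback decision (plain C++ with
assumed field names, not the LLVM API; the in-tree check in
AdjustInstrPostInstrSelection differs in detail), the pseudo is only
worth keeping when every operand copies the same subregister index out
of a StridedOrContiguous load:

#include <cstdio>
#include <optional>
#include <vector>

struct TupleOperand {
  bool CopiesFromStridedLoad; // source is a ZPR[24]StridedOrContiguous copy
  unsigned SubRegIdx;         // zsub0..zsub3 index copied from that load
};

// Keep the FORM_STRIDED_TUPLE pseudo only when every operand is a copy of
// the same subregister index from a strided-or-contiguous load; otherwise
// a plain REG_SEQUENCE is emitted instead.
static bool shouldUseFormStridedTuple(const std::vector<TupleOperand> &Ops) {
  std::optional<unsigned> CommonIdx;
  for (const TupleOperand &Op : Ops) {
    if (!Op.CopiesFromStridedLoad)
      return false;
    if (CommonIdx && *CommonIdx != Op.SubRegIdx)
      return false;
    CommonIdx = Op.SubRegIdx;
  }
  return !Ops.empty();
}

int main() {
  // All four operands copy zsub1 from strided loads -> keep the pseudo.
  std::vector<TupleOperand> Good = {{true, 1}, {true, 1}, {true, 1}, {true, 1}};
  // One operand is an ordinary value -> fall back on REG_SEQUENCE.
  std::vector<TupleOperand> Mixed = {{true, 0}, {false, 0}, {true, 0}, {true, 0}};
  std::printf("%d %d\n", shouldUseFormStridedTuple(Good),
              shouldUseFormStridedTuple(Mixed)); // prints "1 0"
  return 0;
}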
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 2 +
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 27 +---
.../Target/AArch64/AArch64ISelLowering.cpp | 127 ++++++++++--------
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 20 ++-
5 files changed, 93 insertions(+), 85 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index dabcaaf9f5c874..1924812ca33c85 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1159,6 +1159,8 @@ bool AArch64ExpandPseudo::expandFormTuplePseudo(
Register FormTupleOpReg = MI.getOperand(i + 1).getReg();
Register ReturnTupleSubReg =
TRI->getSubReg(ReturnTuple, AArch64::zsub0 + i);
+ // Add copies to ensure the subregisters remain in the correct order
+    // for any contiguous operation they are used by.
if (FormTupleOpReg != ReturnTupleSubReg)
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORR_ZZZ))
.addReg(ReturnTupleSubReg, RegState::Define)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index d46bae07b3d4c5..827f5a5386b829 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -505,7 +505,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectAllActivePredicate(SDValue N);
bool SelectAnyPredicate(SDValue N);
- void SelectFormTuplePseudo(SDNode *N, unsigned Size);
+ void SelectFormTuplePseudo(SDNode *N);
};
class AArch64DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
@@ -7183,14 +7183,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
- case AArch64ISD::FORM_STRIDED_TUPLE_X2: {
- SelectFormTuplePseudo(Node, 2);
- return;
- }
- case AArch64ISD::FORM_STRIDED_TUPLE_X4: {
- SelectFormTuplePseudo(Node, 4);
- return;
- }
}
// Select the default instruction
@@ -7448,20 +7440,3 @@ bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
return true;
}
-
-void AArch64DAGToDAGISel::SelectFormTuplePseudo(SDNode *Node, unsigned Size) {
- assert((Size == 2 || Size == 4) && "Invalid Tuple size");
- EVT VT = Node->getValueType(0);
- SmallVector<SDValue> Ops;
- for (unsigned I = 0; I < Size; I++)
- Ops.push_back(Node->getOperand(I));
- SDLoc DL(Node);
- unsigned Opc = Size == 2 ? AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO
- : AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO;
- SDNode *Tuple = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
- SDValue SuperReg = SDValue(Tuple, 0);
- for (unsigned I = 0; I < Size; ++I)
- ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
- AArch64::zsub0 + I, DL, VT, SuperReg));
- CurDAG->RemoveDeadNode(Node);
-}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b8c87b0ec2ea5f..8856a5dff47cb3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5711,46 +5711,6 @@ SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
Mask);
}
-static unsigned getIntrinsicID(const SDNode *N);
-
-SDValue TryLowerMultiVecSMEDotIntrinsic(SDValue Op, SelectionDAG &DAG,
- unsigned Size) {
- assert((Size == 2 || Size == 4) && "Invalid Tuple Size");
- auto IsStridedLoad = [Size](SDValue Op) -> bool {
- unsigned Intrinsic = getIntrinsicID(Op.getNode());
- if (Size == 2)
- return Intrinsic == Intrinsic::aarch64_sve_ld1_pn_x2;
- else
- return Intrinsic == Intrinsic::aarch64_sve_ld1_pn_x4;
- };
-
- SmallVector<SDValue> Ops;
- unsigned LastLoadIdx = Size == 2 ? 5 : 7;
- unsigned LoadResNo = Op.getOperand(3).getResNo();
- for (unsigned I = 3; I < LastLoadIdx; I++) {
- if (!IsStridedLoad(Op->getOperand(I)) ||
- Op.getOperand(I).getResNo() != LoadResNo)
- return SDValue();
- Ops.push_back(Op->getOperand(I));
- }
-
- EVT VT = Op->getOperand(3).getValueType();
- SDVTList VTList =
- Size == 2 ? DAG.getVTList(VT, VT) : DAG.getVTList(VT, VT, VT, VT);
- unsigned Opc = Size == 2 ? AArch64ISD::FORM_STRIDED_TUPLE_X2
- : AArch64ISD::FORM_STRIDED_TUPLE_X4;
- SDLoc DL(Op);
- SDValue Pseudo = DAG.getNode(Opc, DL, VTList, Ops);
-
- SmallVector<SDValue> DotOps = {Op.getOperand(0), Op->getOperand(1),
- Op->getOperand(2)};
- for (unsigned I = 0; I < Size; I++)
- DotOps.push_back(Pseudo.getValue(I));
- DotOps.push_back(Op->getOperand(DotOps.size()));
- DotOps.push_back(Op->getOperand(DotOps.size()));
- return DAG.getNode(Op->getOpcode(), DL, MVT::Other, DotOps);
-}
-
// Lower an SME LDR/STR ZA intrinsic
// Case 1: If the vector number (vecnum) is an immediate in range, it gets
// folded into the instruction
@@ -5940,22 +5900,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getOperand(0), // Chain
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
- case Intrinsic::aarch64_sme_uvdot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_suvdot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_usvdot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_svdot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_usdot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_udot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_sudot_lane_za32_vg1x4:
- case Intrinsic::aarch64_sme_sdot_lane_za32_vg1x4:
- return TryLowerMultiVecSMEDotIntrinsic(Op, DAG, 4);
- case Intrinsic::aarch64_sme_uvdot_lane_za32_vg1x2:
- case Intrinsic::aarch64_sme_sdot_lane_za32_vg1x2:
- case Intrinsic::aarch64_sme_svdot_lane_za32_vg1x2:
- case Intrinsic::aarch64_sme_usdot_lane_za32_vg1x2:
- case Intrinsic::aarch64_sme_sudot_lane_za32_vg1x2:
- case Intrinsic::aarch64_sme_udot_lane_za32_vg1x2:
- return TryLowerMultiVecSMEDotIntrinsic(Op, DAG, 2);
}
}
@@ -8729,6 +8673,77 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
+ if (MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ||
+ MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO) {
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ bool UseFormStrided = false;
+ unsigned Size =
+ MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ? 2 : 4;
+
+ // The FORM_STRIDED_TUPLE pseudo should only be used if the input operands
+ // are copy nodes where the source register is in a StridedOrContiguous
+ // class. For example:
+ // %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+ // %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+ // %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+ // %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+ // %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+ // %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+ // %9:zpr2mul2 = FORM_STRIDED_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+
+ SmallVector<unsigned, 4> OpSubRegs;
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg())
+ continue;
+
+ MachineOperand *Def = MRI.getOneDef(MO.getReg());
+ if (!Def || !Def->isReg() || !Def->getParent()->isCopy())
+ continue;
+
+ MachineInstr *Cpy = Def->getParent();
+ MachineOperand CpyOp = Cpy->getOperand(1);
+ if (!CpyOp.isReg())
+ continue;
+
+ MachineOperand *Ld = MRI.getOneDef(CpyOp.getReg());
+ OpSubRegs.push_back(CpyOp.getSubReg());
+ if (!Ld || !Ld->isReg())
+ continue;
+
+ const TargetRegisterClass *RegClass =
+ Size == 2 ? &AArch64::ZPR2StridedOrContiguousRegClass
+ : &AArch64::ZPR4StridedOrContiguousRegClass;
+
+ if (MRI.getRegClass(Ld->getReg()) == RegClass)
+ UseFormStrided = true;
+ }
+
+ // Ensure the operands all use the same subreg index.
+    if (!llvm::all_equal(OpSubRegs))
+ UseFormStrided = false;
+
+ // If input values to the FORM_STRIDED_TUPLE pseudo aren't copies from a
+ // StridedOrContiguous class, fall back on REG_SEQUENCE node.
+ if (!UseFormStrided) {
+ static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::REG_SEQUENCE),
+ MI.getOperand(0).getReg());
+
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MIB.add(MI.getOperand(I));
+ MIB.addImm(SubRegs[I - 1]);
+ }
+
+ MI.eraseFromParent();
+ }
+ return;
+ }
+
// Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
// have nothing to do with VG, were it not that they are used to materialise a
// frame-address. If they contain a frame-index to a scalable vector, this
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 5fb44fe5146d3c..af04a7f3d08370 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -29,7 +29,7 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
-def SDT_FORM_STRIDED_TUPLE_X2 : SDTypeProfile<4, 4,
+def SDT_FORM_STRIDED_TUPLE_X2 : SDTypeProfile<2, 2,
[SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 41508bce651c6b..510140cc402d92 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -38,12 +38,14 @@ def FORM_STRIDED_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2Mul2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
let hasSideEffects = 0;
+ let hasPostISelHook = 1;
}
def FORM_STRIDED_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4Mul4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
let hasSideEffects = 0;
+ let hasPostISelHook = 1;
}
def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
@@ -186,6 +188,12 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
+class SME2_ZA_TwoOp_VG2_Multi_Index_FormStrided_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
+ Operand imm_ty, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (FORM_STRIDED_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
+
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
@@ -194,6 +202,14 @@ class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic
(REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
zpr_ty:$Zm, imm_ty:$i)>;
+class SME2_ZA_TwoOp_VG4_Multi_Index_FormStrided_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
+ Operand imm_ty, ComplexPattern tileslice>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (FORM_STRIDED_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ zpr_ty:$Zm, imm_ty:$i)>;
+
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
: Pat<(out_vt (intrinsic in_vt:$Zn1, in_vt:$Zn2, (i32 imm_ty:$i))),
(!cast<Instruction>(name) (REG_SEQUENCE ZPR2Mul2, in_vt:$Zn1, zsub0, in_vt:$Zn2, zsub1), imm_ty:$i)>;
@@ -2635,7 +2651,7 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
}
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
- def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_FormStrided_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
@@ -2778,7 +2794,7 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
- def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_FormStrided_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
>From ebc97b7cdbeb0f054688330e74b8a2b21b3d8b01 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 15 Nov 2024 15:43:35 +0000
Subject: [PATCH 05/10] [AArch64][SME2] Add register allocation hints for
ZPRMulReg
This patch implements getRegAllocationHints to improve register
allocation for the ZPR2Mul2Reg & ZPR4Mul4Reg classes.
If a FORM_STRIDED_TUPLE is found, getRegAllocationHints will try
to find a contiguous ZPRMulReg beginning with the same subregister
as the first operand of the pseudo.
For example, if the first strided load has been assigned $z16_z20_z24_z28
and the operands of the pseudo are each accessing subregister zsub2, the
correct register to use would be $z24_z25_z26_z27.
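A minimal standalone sketch of that hint computation (illustrative
register numbering only, not MCRegister/VirtRegMap handling): take the
unit of the strided tuple selected by the subregister index of the
pseudo's operands and prefer the contiguous tuple starting there.

#include <cstdio>
#include <vector>

int main() {
  // First strided load was assigned z16_z20_z24_z28.
  std::vector<unsigned> StridedUnits = {16, 20, 24, 28};
  unsigned SubRegIdx = 2; // the operands of the pseudo access zsub2

  // The preferred contiguous ZPRMul tuple begins at the selected unit.
  unsigned Start = StridedUnits[SubRegIdx]; // z24
  std::printf("hint: z%u_z%u_z%u_z%u\n", Start, Start + 1, Start + 2,
              Start + 3); // prints z24_z25_z26_z27
  return 0;
}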
---
.../Target/AArch64/AArch64RegisterInfo.cpp | 75 ++++++++++
llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 5 +
.../AArch64/sme2-intrinsics-int-dots.ll | 128 ++++--------------
.../CodeGen/AArch64/sme2-intrinsics-vdot.ll | 112 +++------------
4 files changed, 124 insertions(+), 196 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index f13c162776a9b1..73184ddb081ae2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1107,6 +1107,81 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+// FORM_STRIDED_TUPLE nodes are created to improve register allocation where
+// a consecutive multi-vector tuple is constructed from the same indices of
+// multiple strided loads. This may still result in unnecessary copies between
+// the loads and the tuple. Here we try to return a hint to assign the
+// contiguous ZPRMulReg starting at the same register as the first operand of
+// the pseudo, which should be a subregister of the first strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
+//
+bool AArch64RegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool DefaultHints =
+ TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+
+ unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+ if (RegID != AArch64::ZPR2Mul2RegClassID &&
+ RegID != AArch64::ZPR4Mul4RegClassID)
+ return DefaultHints;
+
+ for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+ if (MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO &&
+ MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO)
+ continue;
+
+ // Look up the physical register mapped to the first load of the pseudo.
+ Register FirstLoadVirtReg = MI.getOperand(1).getReg();
+ if (!VRM->hasPhys(FirstLoadVirtReg))
+ continue;
+
+ unsigned SubRegIdx = 0;
+ MCRegister FirstLoadPhysReg = VRM->getPhys(FirstLoadVirtReg);
+
+ // The subreg number is used to access the correct unit of the
+ // strided register found in the map above.
+ switch (MI.getOperand(1).getSubReg()) {
+ case AArch64::zsub0:
+ break;
+ case AArch64::zsub1:
+ SubRegIdx = 1;
+ break;
+ case AArch64::zsub2:
+ SubRegIdx = 2;
+ break;
+ case AArch64::zsub3:
+ SubRegIdx = 3;
+ break;
+ default:
+ continue;
+ }
+
+ SmallVector<Register, 4> RegUnits;
+ for (MCRegUnit Unit : TRI->regunits(FirstLoadPhysReg))
+ RegUnits.push_back(Unit);
+
+ // Find the contiguous ZPRMul register which starts with the
+ // same register unit as the strided register and add to Hints.
+ Register StartReg = RegUnits[SubRegIdx];
+ for (unsigned I = 0; I < Order.size(); ++I) {
+ Register Reg = *TRI->regunits(Order[I]).begin();
+ if (Reg == StartReg)
+ Hints.push_back(Order[I]);
+ }
+ }
+
+ return DefaultHints;
+}
+
unsigned AArch64RegisterInfo::getLocalAddressRegister(
const MachineFunction &MF) const {
const auto &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 5c8a5e029584fc..11da624af4881b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -134,6 +134,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index eddff238ace031..ef569e480ea3d6 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -590,12 +590,8 @@ define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -622,26 +618,10 @@ define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -752,12 +732,8 @@ define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -784,26 +760,10 @@ define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -916,12 +876,8 @@ define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -948,26 +904,10 @@ define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1080,12 +1020,8 @@ define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1112,26 +1048,10 @@ define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index dec2dfb6f687ca..49106e12378bea 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -82,12 +82,8 @@ define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -114,26 +110,10 @@ define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -218,12 +198,8 @@ define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -250,26 +226,10 @@ define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -331,26 +291,10 @@ define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -412,26 +356,10 @@ define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
>From 43939b9cba7416b4fdd1d5f42735bc83b826e163 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 29 Nov 2024 15:14:03 +0000
Subject: [PATCH 06/10] - Removed unused references to FORM_STRIDED_TUPLE -
Removed switch from getRegAllocationHints
---
.../lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 2 --
.../lib/Target/AArch64/AArch64ISelLowering.cpp | 2 --
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 ---
.../lib/Target/AArch64/AArch64RegisterInfo.cpp | 18 +++---------------
llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 10 ----------
5 files changed, 3 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 827f5a5386b829..1969c830f4d312 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -504,8 +504,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectAllActivePredicate(SDValue N);
bool SelectAnyPredicate(SDValue N);
-
- void SelectFormTuplePseudo(SDNode *N);
};
class AArch64DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8856a5dff47cb3..777d093c9dec82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2808,8 +2808,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::FMUL_PRED)
MAKE_CASE(AArch64ISD::FSUB_PRED)
MAKE_CASE(AArch64ISD::RDSVL)
- MAKE_CASE(AArch64ISD::FORM_STRIDED_TUPLE_X2)
- MAKE_CASE(AArch64ISD::FORM_STRIDED_TUPLE_X4)
MAKE_CASE(AArch64ISD::BIC)
MAKE_CASE(AArch64ISD::CBZ)
MAKE_CASE(AArch64ISD::CBNZ)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c7a70ab9f3c898..d11da64d3f84eb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -478,9 +478,6 @@ enum NodeType : unsigned {
SME_ZA_LDR,
SME_ZA_STR,
- FORM_STRIDED_TUPLE_X2,
- FORM_STRIDED_TUPLE_X4,
-
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 73184ddb081ae2..040afa19657d9f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1144,26 +1144,14 @@ bool AArch64RegisterInfo::getRegAllocationHints(
if (!VRM->hasPhys(FirstLoadVirtReg))
continue;
- unsigned SubRegIdx = 0;
+ int64_t SubRegIdx = -1;
MCRegister FirstLoadPhysReg = VRM->getPhys(FirstLoadVirtReg);
// The subreg number is used to access the correct unit of the
// strided register found in the map above.
- switch (MI.getOperand(1).getSubReg()) {
- case AArch64::zsub0:
- break;
- case AArch64::zsub1:
- SubRegIdx = 1;
- break;
- case AArch64::zsub2:
- SubRegIdx = 2;
- break;
- case AArch64::zsub3:
- SubRegIdx = 3;
- break;
- default:
+ SubRegIdx = MI.getOperand(1).getSubReg() - AArch64::zsub0;
+ if (SubRegIdx < 0 || SubRegIdx > 3)
continue;
- }
SmallVector<Register, 4> RegUnits;
for (MCRegUnit Unit : TRI->regunits(FirstLoadPhysReg))
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index af04a7f3d08370..06d7b38a90f47c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -29,16 +29,6 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
-def SDT_FORM_STRIDED_TUPLE_X2 : SDTypeProfile<2, 2,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
-
-def SDT_FORM_STRIDED_TUPLE_X4 : SDTypeProfile<4, 4,
- [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
- SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>,
- SDTCisSameAs<0, 6>, SDTCisSameAs<0, 7>]>;
-
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
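The switch removed from getRegAllocationHints in this patch is replaced by a subtraction, which relies on zsub0..zsub3 being consecutive subregister-index enumerators in the generated AArch64 register info. A small standalone sketch of that assumption, using hypothetical enumerator values rather than the real generated ones:

#include <cassert>

// Hypothetical, consecutive enumerator values standing in for the
// generated AArch64::zsub0..zsub3 indices.
enum SubRegIndex { zsub0 = 10, zsub1, zsub2, zsub3 };

// Mirrors the patched logic: derive the tuple position by subtraction and
// reject anything outside zsub0..zsub3.
int subRegToIndex(int SubReg) {
  int Idx = SubReg - zsub0;
  return (Idx >= 0 && Idx <= 3) ? Idx : -1;
}

int main() {
  assert(subRegToIndex(zsub0) == 0);
  assert(subRegToIndex(zsub2) == 2);
  assert(subRegToIndex(42) == -1); // an unrelated subregister index
  return 0;
}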
>From 426253cf2bc994fda37caedd9db4b478b9fcd2ff Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Mon, 2 Dec 2024 13:58:44 +0000
Subject: [PATCH 07/10] - Removed Multi_Index_FormStrided_Pat classes and
reused existing classes - Removed isReg() check for operand 1 of a copy -
Moved loop over FORM_STRIDED_TUPLE operands to new function.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 99 ++++++++++---------
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 1 -
llvm/lib/Target/AArch64/SMEInstrFormats.td | 18 +---
3 files changed, 52 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 777d093c9dec82..431f90399e5707 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8646,6 +8646,55 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
+bool shouldUseFormStridedPseudo(MachineInstr &MI) {
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ bool UseFormStrided = false;
+ unsigned NumOperands =
+ MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ? 2 : 4;
+
+ // The FORM_STRIDED_TUPLE pseudo should only be used if the input operands
+ // are copy nodes where the source register is in a StridedOrContiguous
+ // class. For example:
+ // %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+ // %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+ // %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+ // %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+ // %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+ // %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+ // %9:zpr2mul2 = FORM_STRIDED_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+
+ MCRegister SubReg = MCRegister::NoRegister;
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ assert(MO.isReg() && "Unexpected operand to FORM_STRIDED_TUPLE");
+
+ MachineOperand *Def = MRI.getOneDef(MO.getReg());
+ if (!Def || !Def->isReg() || !Def->getParent()->isCopy()) {
+ UseFormStrided = false;
+ break;
+ }
+
+ MachineOperand CpyOp = Def->getParent()->getOperand(1);
+ MachineOperand *Ld = MRI.getOneDef(CpyOp.getReg());
+ unsigned OpSubReg = CpyOp.getSubReg();
+ if (SubReg == MCRegister::NoRegister)
+ SubReg = OpSubReg;
+ if (!Ld || !Ld->isReg() || OpSubReg != SubReg) {
+ UseFormStrided = false;
+ break;
+ }
+
+ const TargetRegisterClass *RegClass =
+ NumOperands == 2 ? &AArch64::ZPR2StridedOrContiguousRegClass
+ : &AArch64::ZPR4StridedOrContiguousRegClass;
+
+ if (MRI.getRegClass(Ld->getReg()) == RegClass)
+ UseFormStrided = true;
+ }
+
+ return UseFormStrided;
+}
+
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
// Live-in physreg copies that are glued to SMSTART are applied as
@@ -8673,57 +8722,9 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ||
MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO) {
- MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- bool UseFormStrided = false;
- unsigned Size =
- MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ? 2 : 4;
-
- // The FORM_STRIDED_TUPLE pseudo should only be used if the input operands
- // are copy nodes where the source register is in a StridedOrContiguous
- // class. For example:
- // %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
- // %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
- // %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
- // %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
- // %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
- // %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
- // %9:zpr2mul2 = FORM_STRIDED_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
-
- SmallVector<unsigned, 4> OpSubRegs;
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MachineOperand &MO = MI.getOperand(I);
- if (!MO.isReg())
- continue;
-
- MachineOperand *Def = MRI.getOneDef(MO.getReg());
- if (!Def || !Def->isReg() || !Def->getParent()->isCopy())
- continue;
-
- MachineInstr *Cpy = Def->getParent();
- MachineOperand CpyOp = Cpy->getOperand(1);
- if (!CpyOp.isReg())
- continue;
-
- MachineOperand *Ld = MRI.getOneDef(CpyOp.getReg());
- OpSubRegs.push_back(CpyOp.getSubReg());
- if (!Ld || !Ld->isReg())
- continue;
-
- const TargetRegisterClass *RegClass =
- Size == 2 ? &AArch64::ZPR2StridedOrContiguousRegClass
- : &AArch64::ZPR4StridedOrContiguousRegClass;
-
- if (MRI.getRegClass(Ld->getReg()) == RegClass)
- UseFormStrided = true;
- }
-
- // Ensure the operands all use the same subreg index.
- if (!std::equal(OpSubRegs.begin(), OpSubRegs.end(), OpSubRegs.begin()))
- UseFormStrided = false;
-
// If input values to the FORM_STRIDED_TUPLE pseudo aren't copies from a
// StridedOrContiguous class, fall back on REG_SEQUENCE node.
- if (!UseFormStrided) {
+ if (!shouldUseFormStridedPseudo(MI)) {
static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 06d7b38a90f47c..a6ba6ddc30b277 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -28,7 +28,6 @@ def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2,
def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
-
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 510140cc402d92..efee1edcf52404 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -184,26 +184,12 @@ class SME2_ZA_TwoOp_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Op
class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
- : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), zpr_ty:$Zm, imm_ty:$i)>;
-
-class SME2_ZA_TwoOp_VG2_Multi_Index_FormStrided_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
- Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
(FORM_STRIDED_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
- : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
- vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
- zpr_ty:$Zm, imm_ty:$i)>;
-
-class SME2_ZA_TwoOp_VG4_Multi_Index_FormStrided_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
- Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
@@ -2651,7 +2637,7 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
}
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
- def : SME2_ZA_TwoOp_VG2_Multi_Index_FormStrided_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+ def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
@@ -2794,7 +2780,7 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexS32b_timm, SMEMatrixArray>;
- def : SME2_ZA_TwoOp_VG4_Multi_Index_FormStrided_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
+ def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexS32b_timm, tileslice16>;
def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
(!cast<Instruction>(NAME) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
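The check factored out into shouldUseFormStridedPseudo accepts the pseudo only when every input operand is defined by a copy from the same subregister index of a value in a StridedOrContiguous register class; otherwise the caller falls back on REG_SEQUENCE. A simplified standalone model of that predicate (the struct and class-name strings below are stand-ins for the MachineOperand and register-class machinery, not LLVM types):

#include <optional>
#include <string>
#include <vector>

struct CopySrc {
  std::string RegClass; // class of the copied-from register
  unsigned SubRegIdx;   // zsub index used by the copy
};

// One entry per pseudo operand; nullopt models an operand that is not
// defined by a copy at all.
bool shouldUseFormStridedPseudo(const std::vector<std::optional<CopySrc>> &Ops,
                                const std::string &RequiredClass) {
  std::optional<unsigned> CommonSubReg;
  for (const auto &Op : Ops) {
    if (!Op)
      return false; // not a copy: fall back on REG_SEQUENCE
    if (!CommonSubReg)
      CommonSubReg = Op->SubRegIdx;
    if (Op->SubRegIdx != *CommonSubReg || Op->RegClass != RequiredClass)
      return false;
  }
  return true;
}

int main() {
  std::vector<std::optional<CopySrc>> Ops = {
      CopySrc{"ZPR2StridedOrContiguous", 0},
      CopySrc{"ZPR2StridedOrContiguous", 0}};
  return shouldUseFormStridedPseudo(Ops, "ZPR2StridedOrContiguous") ? 0 : 1;
}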
>From 645e30b34899f12696ac2d4dbd8d57ec9f82cd27 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 10 Dec 2024 12:21:38 +0000
Subject: [PATCH 08/10] - shouldUseFormStridedPseudo sets RegClass using a
switch statement - Removed INTRINSIC_W_CHAIN from getIntrinsicID - Renamed
FORM_STRIDED_TUPLE_X#_PSEUDO -> FORM_TRANSPOSED_REG_TUPLE_X#_PSEUDO - Add
switch statement back into getRegAllocationHints - Use getSubReg in
getRegAllocationHints and remove index into RegUnits
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 4 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 77 +++++++++----------
.../Target/AArch64/AArch64RegisterInfo.cpp | 63 ++++++---------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 8 +-
4 files changed, 68 insertions(+), 84 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 1924812ca33c85..3afde85d891e67 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1754,9 +1754,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
- case AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO:
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
- case AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO:
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
}
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 431f90399e5707..831f1bf8347da8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7639,11 +7639,6 @@ static unsigned getIntrinsicID(const SDNode *N) {
return IID;
return Intrinsic::not_intrinsic;
}
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IID = N->getConstantOperandVal(1);
- if (IID < Intrinsic::num_intrinsics)
- return IID;
- }
}
}
@@ -8646,53 +8641,55 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
return ZExtBool;
}
+// The FORM_TRANSPOSED_REG_TUPLE pseudo should only be used if the
+// input operands are copy nodes where the source register is in a
+// StridedOrContiguous class. For example:
+//
+// %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
+// %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
+// %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
+// %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
+// %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
+// %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
+// %9:zpr2mul2 = FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
+//
bool shouldUseFormStridedPseudo(MachineInstr &MI) {
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- bool UseFormStrided = false;
- unsigned NumOperands =
- MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ? 2 : 4;
-
- // The FORM_STRIDED_TUPLE pseudo should only be used if the input operands
- // are copy nodes where the source register is in a StridedOrContiguous
- // class. For example:
- // %3:zpr2stridedorcontiguous = LD1B_2Z_IMM_PSEUDO ..
- // %4:zpr = COPY %3.zsub1:zpr2stridedorcontiguous
- // %5:zpr = COPY %3.zsub0:zpr2stridedorcontiguous
- // %6:zpr2stridedorcontiguous = LD1B_2Z_PSEUDO ..
- // %7:zpr = COPY %6.zsub1:zpr2stridedorcontiguous
- // %8:zpr = COPY %6.zsub0:zpr2stridedorcontiguous
- // %9:zpr2mul2 = FORM_STRIDED_TUPLE_X2_PSEUDO %5:zpr, %8:zpr
MCRegister SubReg = MCRegister::NoRegister;
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MachineOperand &MO = MI.getOperand(I);
- assert(MO.isReg() && "Unexpected operand to FORM_STRIDED_TUPLE");
+ assert(MO.isReg() && "Unexpected operand to FORM_TRANSPOSED_REG_TUPLE");
MachineOperand *Def = MRI.getOneDef(MO.getReg());
- if (!Def || !Def->isReg() || !Def->getParent()->isCopy()) {
- UseFormStrided = false;
- break;
- }
+ if (!Def || !Def->getParent()->isCopy())
+ return false;
- MachineOperand CpyOp = Def->getParent()->getOperand(1);
- MachineOperand *Ld = MRI.getOneDef(CpyOp.getReg());
- unsigned OpSubReg = CpyOp.getSubReg();
+ const MachineOperand &CpySrc = Def->getParent()->getOperand(1);
+ MachineOperand *CopySrcOp = MRI.getOneDef(CpySrc.getReg());
+ unsigned OpSubReg = CpySrc.getSubReg();
if (SubReg == MCRegister::NoRegister)
SubReg = OpSubReg;
- if (!Ld || !Ld->isReg() || OpSubReg != SubReg) {
- UseFormStrided = false;
+ if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg)
+ return false;
+
+ const TargetRegisterClass *RegClass = nullptr;
+ switch (MI.getNumOperands() - 1) {
+ case 2:
+ RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
+ break;
+ case 4:
+ RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
break;
+ default:
+ llvm_unreachable("Unexpected number of operands to pseudo.");
}
- const TargetRegisterClass *RegClass =
- NumOperands == 2 ? &AArch64::ZPR2StridedOrContiguousRegClass
- : &AArch64::ZPR4StridedOrContiguousRegClass;
-
- if (MRI.getRegClass(Ld->getReg()) == RegClass)
- UseFormStrided = true;
+ if (MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
+ return false;
}
- return UseFormStrided;
+ return true;
}
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -8720,10 +8717,10 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
- if (MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO ||
- MI.getOpcode() == AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO) {
- // If input values to the FORM_STRIDED_TUPLE pseudo aren't copies from a
- // StridedOrContiguous class, fall back on REG_SEQUENCE node.
+ if (MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO ||
+ MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
+ // If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
+ // from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
if (!shouldUseFormStridedPseudo(MI)) {
static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
AArch64::zsub2, AArch64::zsub3};
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 040afa19657d9f..3659b13bd2013c 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1107,10 +1107,10 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
-// FORM_STRIDED_TUPLE nodes are created to improve register allocation where
-// a consecutive multi-vector tuple is constructed from the same indices of
-// multiple strided loads. This may still result in unnecessary copies between
-// the loads and the tuple. Here we try to return a hint to assign the
+// FORM_TRANSPOSED_REG_TUPLE nodes are created to improve register allocation
+// where a consecutive multi-vector tuple is constructed from the same indices
+// of multiple strided loads. This may still result in unnecessary copies
+// between the loads and the tuple. Here we try to return a hint to assign the
// contiguous ZPRMulReg starting at the same register as the first operand of
// the pseudo, which should be a subregister of the first strided load.
//
@@ -1123,51 +1123,38 @@ bool AArch64RegisterInfo::getRegAllocationHints(
Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
- const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
- const TargetRegisterInfo *TRI = STI.getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- bool DefaultHints =
- TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
-
- unsigned RegID = MRI.getRegClass(VirtReg)->getID();
- if (RegID != AArch64::ZPR2Mul2RegClassID &&
- RegID != AArch64::ZPR4Mul4RegClassID)
- return DefaultHints;
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
- if (MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO &&
- MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO)
+ if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
+ MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
continue;
- // Look up the physical register mapped to the first load of the pseudo.
- Register FirstLoadVirtReg = MI.getOperand(1).getReg();
- if (!VRM->hasPhys(FirstLoadVirtReg))
+ switch (MI.getOperand(1).getSubReg()) {
+ case AArch64::zsub0:
+ case AArch64::zsub1:
+ case AArch64::zsub2:
+ case AArch64::zsub3:
+ break;
+ default:
continue;
+ }
- int64_t SubRegIdx = -1;
- MCRegister FirstLoadPhysReg = VRM->getPhys(FirstLoadVirtReg);
-
- // The subreg number is used to access the correct unit of the
- // strided register found in the map above.
- SubRegIdx = MI.getOperand(1).getSubReg() - AArch64::zsub0;
- if (SubRegIdx < 0 || SubRegIdx > 3)
+ // Look up the physical register mapped to the first operand of the pseudo.
+ Register FirstOpVirtReg = MI.getOperand(1).getReg();
+ if (!VRM->hasPhys(FirstOpVirtReg))
continue;
- SmallVector<Register, 4> RegUnits;
- for (MCRegUnit Unit : TRI->regunits(FirstLoadPhysReg))
- RegUnits.push_back(Unit);
-
- // Find the contiguous ZPRMul register which starts with the
- // same register unit as the strided register and add to Hints.
- Register StartReg = RegUnits[SubRegIdx];
- for (unsigned I = 0; I < Order.size(); ++I) {
- Register Reg = *TRI->regunits(Order[I]).begin();
- if (Reg == StartReg)
- Hints.push_back(Order[I]);
- }
+ MCRegister TupleStartReg =
+ getSubReg(VRM->getPhys(FirstOpVirtReg), MI.getOperand(1).getSubReg());
+ for (unsigned I = 0; I < Order.size(); ++I)
+ if (MCRegister R = getSubReg(Order[I], AArch64::zsub0))
+ if (R == TupleStartReg)
+ Hints.push_back(Order[I]);
}
- return DefaultHints;
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
}
unsigned AArch64RegisterInfo::getLocalAddressRegister(
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index efee1edcf52404..49a208849b2658 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -34,14 +34,14 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>;
-def FORM_STRIDED_TUPLE_X2_PSEUDO :
+def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2Mul2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
let hasSideEffects = 0;
let hasPostISelHook = 1;
}
-def FORM_STRIDED_TUPLE_X4_PSEUDO :
+def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
Pseudo<(outs ZPR4Mul4:$tup),
(ins ZPR:$zn0, ZPR:$zn1, ZPR:$zn2, ZPR:$zn3), []>, Sched<[]>{
let hasSideEffects = 0;
@@ -186,14 +186,14 @@ class SME2_ZA_TwoOp_VG2_Multi_Index_Pat<string name, SDPatternOperator intrinsic
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_STRIDED_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
+ (FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO vt:$Zn1,vt:$Zn2), zpr_ty:$Zm, imm_ty:$i)>;
class SME2_ZA_TwoOp_VG4_Multi_Index_Pat<string name, SDPatternOperator intrinsic, Operand index_ty, ZPRRegOp zpr_ty, ValueType vt,
Operand imm_ty, ComplexPattern tileslice>
: Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)),
vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4, vt:$Zm, (i32 imm_ty:$i)),
(!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (FORM_STRIDED_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
+ (FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
zpr_ty:$Zm, imm_ty:$i)>;
class SME2_Sat_Shift_VG2_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, ValueType in_vt, Operand imm_ty>
>From d7ccfe1e247e28ee1f91ac48b163191dc0d88e80 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 10 Dec 2024 18:15:34 +0000
Subject: [PATCH 09/10] - Renamed CpySrc -> CopySrc - Moved switch in
shouldUseFormStridedPseudo outside of loop - Added a description of the
pseudo nodes to SMEInstrFormats.td
---
.../Target/AArch64/AArch64ISelLowering.cpp | 60 ++++++++++---------
.../Target/AArch64/AArch64RegisterInfo.cpp | 5 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 10 ++++
3 files changed, 44 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 831f1bf8347da8..c363adbd35b98d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8656,6 +8656,18 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
bool shouldUseFormStridedPseudo(MachineInstr &MI) {
MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const TargetRegisterClass *RegClass = nullptr;
+ switch (MI.getOpcode()) {
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
+ RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
+ break;
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
+ RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode.");
+ }
+
MCRegister SubReg = MCRegister::NoRegister;
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
MachineOperand &MO = MI.getOperand(I);
@@ -8665,26 +8677,15 @@ bool shouldUseFormStridedPseudo(MachineInstr &MI) {
if (!Def || !Def->getParent()->isCopy())
return false;
- const MachineOperand &CpySrc = Def->getParent()->getOperand(1);
- MachineOperand *CopySrcOp = MRI.getOneDef(CpySrc.getReg());
- unsigned OpSubReg = CpySrc.getSubReg();
+ const MachineOperand &CopySrc = Def->getParent()->getOperand(1);
+ unsigned OpSubReg = CopySrc.getSubReg();
if (SubReg == MCRegister::NoRegister)
SubReg = OpSubReg;
+
+ MachineOperand *CopySrcOp = MRI.getOneDef(CopySrc.getReg());
if (!CopySrcOp || !CopySrcOp->isReg() || OpSubReg != SubReg)
return false;
- const TargetRegisterClass *RegClass = nullptr;
- switch (MI.getNumOperands() - 1) {
- case 2:
- RegClass = &AArch64::ZPR2StridedOrContiguousRegClass;
- break;
- case 4:
- RegClass = &AArch64::ZPR4StridedOrContiguousRegClass;
- break;
- default:
- llvm_unreachable("Unexpected number of operands to pseudo.");
- }
-
if (MRI.getRegClass(CopySrcOp->getReg()) != RegClass)
return false;
}
@@ -8721,22 +8722,23 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MI.getOpcode() == AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO) {
// If input values to the FORM_TRANSPOSED_REG_TUPLE pseudo aren't copies
// from a StridedOrContiguous class, fall back on REG_SEQUENCE node.
- if (!shouldUseFormStridedPseudo(MI)) {
- static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
- AArch64::zsub2, AArch64::zsub3};
-
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- TII->get(TargetOpcode::REG_SEQUENCE),
- MI.getOperand(0).getReg());
-
- for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
- MIB.add(MI.getOperand(I));
- MIB.addImm(SubRegs[I - 1]);
- }
+ if (shouldUseFormStridedPseudo(MI))
+ return;
- MI.eraseFromParent();
+ static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::REG_SEQUENCE),
+ MI.getOperand(0).getReg());
+
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ MIB.add(MI.getOperand(I));
+ MIB.addImm(SubRegs[I - 1]);
}
+
+ MI.eraseFromParent();
return;
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 3659b13bd2013c..547f1f0a9fa99b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1130,7 +1130,8 @@ bool AArch64RegisterInfo::getRegAllocationHints(
MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
continue;
- switch (MI.getOperand(1).getSubReg()) {
+ unsigned FirstOpSubReg = MI.getOperand(1).getSubReg();
+ switch (FirstOpSubReg) {
case AArch64::zsub0:
case AArch64::zsub1:
case AArch64::zsub2:
@@ -1146,7 +1147,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
continue;
MCRegister TupleStartReg =
- getSubReg(VRM->getPhys(FirstOpVirtReg), MI.getOperand(1).getSubReg());
+ getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg);
for (unsigned I = 0; I < Order.size(); ++I)
if (MCRegister R = getSubReg(Order[I], AArch64::zsub0))
if (R == TupleStartReg)
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 49a208849b2658..9874593a4936e5 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -34,6 +34,16 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>;
def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>;
+// The FORM_TRANSPOSED_REG_TUPLE pseudos defined below are intended to
+// improve register allocation for intrinsics which use strided and contiguous
+// multi-vector registers, avoiding unnecessary copies.
+// If the operands of the pseudo are copies where the source register is in
+// the StridedOrContiguous class, the pseudo is used to provide a hint to the
+// register allocator suggesting a contiguous multi-vector register which
+// matches the subregister sequence used by the operands.
+// If the operands do not match this pattern, the pseudos are expanded
+// to a REG_SEQUENCE using the post-isel hook.
+
def FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO :
Pseudo<(outs ZPR2Mul2:$tup),
(ins ZPR:$zn0, ZPR:$zn1), []>, Sched<[]>{
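For the REG_SEQUENCE fallback restructured in this patch, each pseudo operand N is simply paired with subregister index zsub(N-1) of the destination tuple. A tiny standalone sketch of the shape of the resulting instruction, with made-up virtual register numbers for illustration:

#include <cstdio>
#include <vector>

int main() {
  const char *SubRegs[] = {"zsub0", "zsub1", "zsub2", "zsub3"};
  // Hypothetical vregs feeding a FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO.
  std::vector<int> Operands = {5, 8, 11, 14};

  // Print something close to the MIR form of the expanded REG_SEQUENCE.
  std::printf("%%9:zpr4mul4 = REG_SEQUENCE");
  for (unsigned I = 0; I < Operands.size(); ++I)
    std::printf(" %%%d:zpr, %s%s", Operands[I], SubRegs[I],
                I + 1 < Operands.size() ? "," : "\n");
  return 0;
}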
>From 0f0bc8425ea5f58f95868085d7db12d885bf3ea3 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 11 Dec 2024 09:58:19 +0000
Subject: [PATCH 10/10] - Return early from getRegAllocationHints if the opcode
is not a FORM_TRANSPOSED pseudo or the subreg indices are not in the range
zsub0-zsub3.
---
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 547f1f0a9fa99b..876ca0004ade65 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1128,7 +1128,7 @@ bool AArch64RegisterInfo::getRegAllocationHints(
for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
if (MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO &&
MI.getOpcode() != AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO)
- continue;
+ break;
unsigned FirstOpSubReg = MI.getOperand(1).getSubReg();
switch (FirstOpSubReg) {
@@ -1138,13 +1138,14 @@ bool AArch64RegisterInfo::getRegAllocationHints(
case AArch64::zsub3:
break;
default:
- continue;
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+ MF, VRM);
}
// Look up the physical register mapped to the first operand of the pseudo.
Register FirstOpVirtReg = MI.getOperand(1).getReg();
if (!VRM->hasPhys(FirstOpVirtReg))
- continue;
+ break;
MCRegister TupleStartReg =
getSubReg(VRM->getPhys(FirstOpVirtReg), FirstOpSubReg);