[llvm] [AArch64][SME] Make getRegAllocationHints stricter for multi-vector loads (PR #123081)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 17 01:16:10 PST 2025
================
@@ -0,0 +1,183 @@
+# RUN: llc -force-streaming -verify-machineinstrs -enable-subreg-liveness -start-before=greedy %s -o - | FileCheck %s
+
+# No group of four strided x4 registers is available, so fall back on the default allocation order
+--- |
+ ; ModuleID = '<stdin>'
+ source_filename = "<stdin>"
+ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+ target triple = "aarch64-linux-gnu"
+
+ ; Function Attrs: nounwind
+ define void @form_4x_tuple_many_live(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
+ entry:
+ %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
+ %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
+ %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
+ %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
+ %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
+ %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
+ %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
+ %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
+ %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
+ %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
+ %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
+ %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
+ %mul3 = shl i64 %stride, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
+ %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
+ %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
+ %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
+ %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
+ %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
+ %mul5 = mul i64 %stride, 3
+ %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
+ %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
+ %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
+ %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
+ %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
+ %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
+ tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
+ store <vscale x 16 x i8> %scalable_arg, ptr %ptr, align 16
+ ret void
+ }
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn
+ declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32 immarg) #1
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
+ declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) #2
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() #3
+
+ attributes #0 = { nounwind "target-features"="+sme2" }
+ attributes #1 = { nocallback nofree nosync nounwind willreturn }
+ attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+ attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+...
+---
+name: form_4x_tuple_many_live
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: true
+isSSA: false
+noVRegs: false
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+liveins:
+ - { reg: '$x0', virtual-reg: '%0' }
+ - { reg: '$x1', virtual-reg: '%1' }
+ - { reg: '$z0', virtual-reg: '%2' }
+ - { reg: '$z17', virtual-reg: '%3' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x0, $x1, $z0, $z17
+
+ ; CHECK-LABEL: form_4x_tuple_many_live
+ ; CHECK: stp d11, d10, [sp, #-32]!
+ ; CHECK-NEXT: lsl x9, x1, #1
+ ; CHECK-NEXT: stp d9, d8, [sp, #16]
+ ; CHECK-NEXT: ptrue pn8.b
+ ; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
+ ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x1]
+ ; CHECK-NEXT: mov w8, wzr
+ ; CHECK-NEXT: add x10, x9, x1
+ ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x9]
+ ; CHECK-NEXT: ptrue p0.b
+ ; CHECK-NEXT: ld1b { z4.b - z7.b }, pn8/z, [x0, x10]
+ ; CHECK-NEXT: mov z8.d, z16.d
+ ; CHECK-NEXT: mov z9.d, z18.d
+ ; CHECK-NEXT: mov z21.d, z22.d
+ ; CHECK-NEXT: mov z10.d, z19.d
+ ; CHECK-NEXT: mov z22.d, z23.d
+ ; CHECK-NEXT: mov z25.d, z26.d
+ ; CHECK-NEXT: mov z11.d, z4.d
+ ; CHECK-NEXT: mov z23.d, z5.d
+ ; CHECK-NEXT: mov z26.d, z27.d
+ ; CHECK-NEXT: mov z27.d, z6.d
+ ; CHECK-NEXT: mov z29.d, z30.d
+ ; CHECK-NEXT: mov z30.d, z31.d
+ ; CHECK-NEXT: mov z31.d, z7.d
+ ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
+ ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+ ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+ ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
+ ; CHECK-NEXT: ldp d9, d8, [sp, #16]
+ ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
+ ; CHECK-NEXT: st1b { z17.b }, p0, [x0]
+ ; CHECK-NEXT: ldp d11, d10, [sp], #32
+ ; CHECK-NEXT: ret
+
+ %2:zpr = COPY $z0
+ %3:zpr = COPY $z17
+ %1:gpr64 = COPY $x1
+ %0:gpr64common = COPY $x0
+ %27:matrixindexgpr32_8_11 = COPY $wzr
+ %14:gpr64 = UBFMXri %1, 63, 62
+ %pred:pnr_p8to15 = PTRUE_C_B implicit $vg
+ %4:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0
+ %20:gpr64 = ADDXrr %14, %1
+ %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1
+ %15:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %14
+ %21:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %20
+ %26:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub0, %9.zsub0, %15.zsub0, %21.zsub0
+ $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %26, undef %28:zpr_4b, 0
+ %29:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub1, %9.zsub1, %15.zsub1, %21.zsub1
+ $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %29, undef %30:zpr_4b, 0
+ %31:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub2, %9.zsub2, %15.zsub2, %21.zsub2
+ %35:ppr_3b = PTRUE_B 31, implicit $vg
+ $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %31, undef %32:zpr_4b, 0
+ %33:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub3, %9.zsub3, %15.zsub3, %21.zsub3
----------------
sdesmalen-arm wrote:
Could you do some manual re-scheduling of these instructions to make the test easier to read?
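
For illustration only, one possible reordering (a sketch of what such a cleanup might look like, not something taken from the patch or prescribed by the review) groups the argument copies, the address computations and the four strided loads, and keeps each tuple-forming pseudo next to the UDOT that consumes it:

    ; Argument copies first
    %2:zpr = COPY $z0
    %3:zpr = COPY $z17
    %1:gpr64 = COPY $x1
    %0:gpr64common = COPY $x0
    %27:matrixindexgpr32_8_11 = COPY $wzr
    ; Address computations next
    %14:gpr64 = UBFMXri %1, 63, 62
    %20:gpr64 = ADDXrr %14, %1
    ; The four strided loads as one group
    %pred:pnr_p8to15 = PTRUE_C_B implicit $vg
    %4:zpr4stridedorcontiguous = LD1B_4Z_IMM_PSEUDO %pred, %0, 0
    %9:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %1
    %15:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %14
    %21:zpr4stridedorcontiguous = LD1B_4Z_PSEUDO %pred, %0, %20
    ; Each tuple-forming pseudo immediately before the UDOT that uses it
    %26:zpr4mul4 = FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO %4.zsub0, %9.zsub0, %15.zsub0, %21.zsub0
    $za = UDOT_VG4_M4ZZI_BtoS $za, %27, 0, %26, undef %28:zpr_4b, 0

The remaining zsub1/zsub2/zsub3 tuple/UDOT pairs would follow the same pattern, with the %35:ppr_3b = PTRUE_B moved out of that sequence.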
https://github.com/llvm/llvm-project/pull/123081