[llvm] [AArch64] Match constants in SelectSMETileSlice (PR #151494)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 03:57:31 PDT 2025


https://github.com/sdesmalen-arm created https://github.com/llvm/llvm-project/pull/151494

If the slice is a constant, instruction selection should try to use the
`WZR + <imm>` addressing mode when the constant fits the immediate range.

From ca2bea536c110bf2b39e3b0742b992ed0a108040 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Thu, 31 Jul 2025 10:49:52 +0000
Subject: [PATCH 1/2] Pre-commit test

---
 .../AArch64/sme-tileslice-addrmodes.ll        | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll

diff --git a/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll b/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
new file mode 100644
index 0000000000000..aaee521a8bd16
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sme2 -enable-subreg-liveness < %s| FileCheck %s
+
+target triple = "aarch64"
+
+define void @sme_tileslice_addrmode_zero_base_plus_constant_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_zero_base_plus_constant_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    fdot za.s[w9, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    mov w8, #8 // =0x8
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 1, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 7, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 8, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+define void @sme_tileslice_addrmode_base_plus_constant_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_base_plus_constant_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    add w9, w0, #8
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w9, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  %slice0 = add i32 %slice, 0
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice0, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice1 = add i32 %slice, 1
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice1, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice7 = add i32 %slice, 7
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice7, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  %slice8 = add i32 %slice, 8
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice8, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+define void @sme_tileslice_addrmode_base_plus_zero_offset(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: sme_tileslice_addrmode_base_plus_zero_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
+  ret void
+}
+
+declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)

From 722d30afba0b5aba825d5b4cfec7fe953e190bcb Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Thu, 31 Jul 2025 10:39:23 +0000
Subject: [PATCH 2/2] [AArch64] Match constants in SelectSMETileSlice

If the slice is a constant, instruction selection should try to use the
WZR + <imm> addressing mode when the constant fits the immediate range.
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 29 ++++++++++++++-----
 .../AArch64/sme-tileslice-addrmodes.ll        |  8 ++---
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ad42f4b56caf2..bc786f415b554 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -7617,16 +7617,29 @@ bool AArch64DAGToDAGISel::SelectAnyPredicate(SDValue N) {
 bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned MaxSize,
                                              SDValue &Base, SDValue &Offset,
                                              unsigned Scale) {
-  // Try to untangle an ADD node into a 'reg + offset'
-  if (CurDAG->isBaseWithConstantOffset(N))
-    if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+  auto MatchConstantOffset = [&](SDValue CN) -> SDValue {
+    if (auto *C = dyn_cast<ConstantSDNode>(CN)) {
       int64_t ImmOff = C->getSExtValue();
-      if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0))) {
-        Base = N.getOperand(0);
-        Offset = CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
-        return true;
-      }
+      if ((ImmOff > 0 && ImmOff <= MaxSize && (ImmOff % Scale == 0)))
+        return CurDAG->getTargetConstant(ImmOff / Scale, SDLoc(N), MVT::i64);
     }
+    return SDValue();
+  };
+
+  if (SDValue C = MatchConstantOffset(N)) {
+    Base = CurDAG->getConstant(0, SDLoc(N), MVT::i32);
+    Offset = C;
+    return true;
+  }
+
+  // Try to untangle an ADD node into a 'reg + offset'
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (SDValue C = MatchConstantOffset(N.getOperand(1))) {
+      Base = N.getOperand(0);
+      Offset = C;
+      return true;
+    }
+  }
 
   // By default, just match reg + 0.
   Base = N;
diff --git a/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll b/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
index aaee521a8bd16..cfe8e9ec4a0b6 100644
--- a/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
+++ b/llvm/test/CodeGen/AArch64/sme-tileslice-addrmodes.ll
@@ -7,13 +7,11 @@ define void @sme_tileslice_addrmode_zero_base_plus_constant_offset(i32 %slice, <
 ; CHECK-LABEL: sme_tileslice_addrmode_zero_base_plus_constant_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
-; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    fdot za.s[w8, 1, vgx4], { z0.h - z3.h }, z4.h[0]
+; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[0]
 ; CHECK-NEXT:    fdot za.s[w9, 0, vgx4], { z0.h - z3.h }, z4.h[0]
-; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
-; CHECK-NEXT:    mov w8, #8 // =0x8
-; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[0]
 ; CHECK-NEXT:    ret
   tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)
   tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 1, <vscale x 8 x half> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %3, <vscale x 8 x half> %4, i32 0)



More information about the llvm-commits mailing list