[llvm] [AArch64][SME] Preserve `Chain` argument in SelectMultiVectorLutiLane (PR #161494)

Wed Oct 1 02:20:55 PDT 2025

https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/161494

Previously, the `Chain` was dropped leading to two identical LUTI4 nodes that would be incorrectly CSE'd.

Fixes: #161420

>From b07ff80a54ce5535247c7467f4bc943f2bb7ceef Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 1 Oct 2025 09:16:00 +0000
Subject: [PATCH] [AArch64][SME] Preserve `Chain` argument in
 SelectMultiVectorLutiLane

Previously, the `Chain` was dropped leading to two identical LUTI4 nodes
that would be incorrectly CSE'd.

Fixes: #161420
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    |  3 +-
 llvm/test/CodeGen/AArch64/pr161420.ll         | 56 +++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/pr161420.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6a1b06eea4309..8d872eb85da22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
   if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
     return;
 
-  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
 
diff --git a/llvm/test/CodeGen/AArch64/pr161420.ll b/llvm/test/CodeGen/AArch64/pr161420.ll
new file mode 100644
index 0000000000000..ebab7e0f42e77
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr161420.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx15.0.0"
+
+; From https://github.com/llvm/llvm-project/issues/161420. This test checks that
+; two `luti4` instructions are emitted. Previously, we would incorrectly CSE
+; one as the `Chain` argument was dropped in SelectionDAG.
+
+define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 {
+; CHECK-LABEL: pluto:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    mov w8, #0 ; =0x0
+; CHECK-NEXT:    ldr zt0, [x1]
+; CHECK-NEXT:    ldr z4, [x3]
+; CHECK-NEXT:    ptrue pn8.h
+; CHECK-NEXT:    ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT:    luti4 { z16.h - z19.h }, zt0, z4[0]
+; CHECK-NEXT:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z16.h - z19.h }
+; CHECK-NEXT:    ldr zt0, [x2]
+; CHECK-NEXT:    luti4 { z4.h - z7.h }, zt0, z4[0]
+; CHECK-NEXT:    fmla za.h[w8, 2, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK-NEXT:    ret
+bb:
+  tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1)
+  %load = load <vscale x 16 x i8>, ptr %arg3, align 16
+  %call = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16()
+  %call4 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %call, ptr %arg)
+  %extractvalue = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 0
+  %extractvalue5 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 1
+  %extractvalue6 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 2
+  %extractvalue7 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call4, 3
+  %call8 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %load, i32 0)
+  %extractvalue9 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 0
+  %extractvalue10 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 1
+  %extractvalue11 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 2
+  %extractvalue12 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call8, 3
+  tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 0, <vscale x 8 x half> %extractvalue, <vscale x 8 x half> %extractvalue5, <vscale x 8 x half> %extractvalue6, <vscale x 8 x half> %extractvalue7, <vscale x 8 x half> %extractvalue9, <vscale x 8 x half> %extractvalue10, <vscale x 8 x half> %extractvalue11, <vscale x 8 x half> %extractvalue12)
+  tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg2)
+  %call13 = tail call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, <vscale x 16 x i8> %load, i32 0)
+  %extractvalue14 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 0
+  %extractvalue15 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 1
+  %extractvalue16 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 2
+  %extractvalue17 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %call13, 3
+  tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 2, <vscale x 8 x half> %extractvalue, <vscale x 8 x half> %extractvalue5, <vscale x 8 x half> %extractvalue6, <vscale x 8 x half> %extractvalue7, <vscale x 8 x half> %extractvalue14, <vscale x 8 x half> %extractvalue15, <vscale x 8 x half> %extractvalue16, <vscale x 8 x half> %extractvalue17)
+  ret void
+}
+
+declare void @llvm.aarch64.sme.ldr.zt(i32, ptr)
+declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16()
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr)
+declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 immarg, <vscale x 16 x i8>, i32 immarg)
+declare void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
+
+attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn uwtable(sync) "aarch64_inout_za" "aarch64_inout_zt0" "aarch64_pstate_sm_enabled" "target-cpu"="apple-m1" "target-features"="+fp-armv8,+lse,+neon,+sme,+sme-f16f16,+sme2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a" }