[llvm] [InstCombine, AArch64] Avoid vector Ext when the by-element operation variant applies to all elements (PR #140733)

Ahmad Yasin via llvm-commits llvm-commits at lists.llvm.org
Tue May 20 07:04:53 PDT 2025


https://github.com/ayasin-a created https://github.com/llvm/llvm-project/pull/140733

This patch permits the use of by-element operations (also known as the indexed variant) in vector code when the shuffles are splats and all elements of the extended vector are used.

This is achieved by skipping the combine of `shuffle(s/zext)` into `s/zext(shuffle)` in `performBuildShuffleExtendCombine` when the shuffles are splats (the same index is used for all elements of the mask) and all vector elements are consumed by such shuffles in the DAG. The latter condition is meant to preserve the existing combine (which enables a single vector s/zext) in the remaining cases, and may be tuned further.

For example, this pattern:

    %lanes = sext <4 x i16> %l to <4 x i32>
    %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
    %mul0 = mul <4 x i32> %shf0, %a
    %add0 = add <4 x i32> %mul0, %b
    %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    %mul1 = mul <4 x i32> %shf1, %a
    %add1 = add <4 x i32> %mul1, %b
    %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
    %mul2 = mul <4 x i32> %shf2, %a
    %add2 = add <4 x i32> %mul2, %b
    %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    %mul3 = mul <4 x i32> %shf3, %a
    %add3 = add <4 x i32> %mul3, %b

After this patch, it is lowered to:

    sshll v0.4s, v0.4h, #0
    mla v3.4s, v1.4s, v0.s[0]
    mla v4.4s, v1.4s, v0.s[1]
    mla v2.4s, v1.4s, v0.s[3]
    mla v5.4s, v1.4s, v0.s[2]

Before this patch, it was lowered to (this listing is in the Apple-style assembly syntax):

    dup.4h   v3, v0[0]
    dup.4h   v4, v0[1]
    dup.4h   v5, v0[2]
    dup.4h   v0, v0[3]
    mov.16b  v6, v2
    mov.16b  v7, v2
    sshll.4s v3, v3, #0
    sshll.4s v4, v4, #0
    sshll.4s v5, v5, #0
    sshll.4s v0, v0, #0
    mla.4s   v6, v3, v1
    mov.16b  v3, v2
    mla.4s   v7, v4, v1
    mla.4s   v2, v0, v1
    mla.4s   v3, v5, v1
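
For context, this kind of IR can arise when a small multiply-accumulate kernel is vectorized and one operand is a splat of a widened lane. The C++ kernel below is only a hypothetical illustration of such a source; its name and parameters are made up and not taken from the patch or its tests:

    #include <cstdint>

    // Each output row is l[i] * a[j] + b[j], so the vectorized body splats one
    // sign-extended lane of `l` per row -- the sext + splat-shuffle + mul + add
    // pattern shown in the IR above.
    void mla_rows(int32_t out[4][4], const int16_t l[4], const int32_t a[4],
                  const int32_t b[4]) {
      for (int i = 0; i < 4; ++i)   // one splatted lane of `l` per row
        for (int j = 0; j < 4; ++j) // maps to one <4 x i32> mul + add per row
          out[i][j] = (int32_t)l[i] * a[j] + b[j];
    }

With the patch, each row is formed with a single `mla ..., v0.s[i]` against the once-widened `l`, instead of duplicating and then widening every lane separately as in the second listing.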


From 039e713b860391cb8b70c32a26489c613d59afe0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 20 May 2025 15:08:15 +0300
Subject: [PATCH] Avoid the vector SExt/ZExt when the indexed operation
 applies to all elements of the vector

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  27 +++
 .../CodeGen/AArch64/aarch64-ext-shuffle.ll    | 166 ++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4e45162a687f8..31aef63eebefc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18602,6 +18602,33 @@ static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
     SeenZExtOrSExt = true;
   }
 
+  // Skip the combine into a vector SExt/ZExt when all vector elements are
+  // consumed and every shuffle's mask uses the same index (i.e. a splat), so
+  // that the indexed op (e.g. MLA, MUL) variants can be used.
+  EVT ExtendType = Extend->getValueType(0);
+  if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
+    const int NumElements = ExtendType.getVectorNumElements();
+    SmallBitVector UsedElements(NumElements, false);
+    for (auto UI = Extend.getNode()->use_begin(),
+              UE = Extend.getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = UI->getUser();
+      if (User->getOpcode() == ISD::VECTOR_SHUFFLE &&
+          User->getOperand(0) == Extend) {
+        ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
+        const int Idx = Mask[0];
+        if (Idx < 0 || Idx >= NumElements)
+          continue;
+        if (llvm::all_of(Mask, [Idx](int M) { return M == Idx; }))
+          UsedElements.set(Idx);
+        else
+          break; // Non-splat shuffle user; stop scanning.
+      }
+    }
+    if (UsedElements.all())
+      return SDValue();
+  }
+
   SDValue NBV;
   SDLoc DL(BV);
   if (BV.getOpcode() == ISD::BUILD_VECTOR) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
new file mode 100644
index 0000000000000..6aacf7b7709b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK-GI
+define <4 x i32> @ext_shuffle_v4i16_v4i32(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    mov v3.16b, v2.16b
+; CHECK-SD-NEXT:    mov v4.16b, v2.16b
+; CHECK-SD-NEXT:    mov v5.16b, v2.16b
+; CHECK-SD-NEXT:    mla v3.4s, v1.4s, v0.s[0]
+; CHECK-SD-NEXT:    mla v4.4s, v1.4s, v0.s[1]
+; CHECK-SD-NEXT:    mla v2.4s, v1.4s, v0.s[3]
+; CHECK-SD-NEXT:    mla v5.4s, v1.4s, v0.s[2]
+; CHECK-SD-NEXT:    sub v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    sub v1.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v3.16b, v2.16b
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    mov v5.16b, v2.16b
+; CHECK-GI-NEXT:    mla v3.4s, v1.4s, v0.s[0]
+; CHECK-GI-NEXT:    mla v4.4s, v1.4s, v0.s[1]
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.s[3]
+; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.s[2]
+; CHECK-GI-NEXT:    sub v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    sub v1.4s, v5.4s, v2.4s
+; CHECK-GI-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
+  %lanes = sext <4 x i16> %l to <4 x i32>
+  %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul0 = mul <4 x i32> %shf0, %a
+  %add0 = add <4 x i32> %mul0, %b
+  %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul1 = mul <4 x i32> %shf1, %a
+  %add1 = add <4 x i32> %mul1, %b
+  %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %mul2 = mul <4 x i32> %shf2, %a
+  %add2 = add <4 x i32> %mul2, %b
+  %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul3 = mul <4 x i32> %shf3, %a
+  %add3 = add <4 x i32> %mul3, %b
+  %sub1 = sub <4 x i32> %add0, %add1
+  %sub2 = sub <4 x i32> %add2, %add3
+  %sub3 = sub <4 x i32> %sub1, %sub2
+  ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_partial(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_partial:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v3.4h, v0.h[0]
+; CHECK-SD-NEXT:    dup v4.4h, v0.h[1]
+; CHECK-SD-NEXT:    mov v5.16b, v2.16b
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-SD-NEXT:    mov v6.16b, v2.16b
+; CHECK-SD-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    mla v5.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT:    mla v6.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    sub v0.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_partial:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v3.16b, v2.16b
+; CHECK-GI-NEXT:    mov v4.16b, v2.16b
+; CHECK-GI-NEXT:    mla v3.4s, v1.4s, v0.s[0]
+; CHECK-GI-NEXT:    mla v4.4s, v1.4s, v0.s[1]
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.s[2]
+; CHECK-GI-NEXT:    sub v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    ret
+  %lanes = sext <4 x i16> %l to <4 x i32>
+  %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul0 = mul <4 x i32> %shf0, %a
+  %add0 = add <4 x i32> %mul0, %b
+  %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul1 = mul <4 x i32> %shf1, %a
+  %add1 = add <4 x i32> %mul1, %b
+  %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %mul2 = mul <4 x i32> %shf2, %a
+  %add2 = add <4 x i32> %mul2, %b
+  %sub1 = sub <4 x i32> %add0, %add1
+  %sub3 = sub <4 x i32> %sub1, %add2
+  ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_add(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_add:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    dup v1.4s, v0.s[0]
+; CHECK-SD-NEXT:    dup v3.4s, v0.s[1]
+; CHECK-SD-NEXT:    dup v4.4s, v0.s[2]
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[3]
+; CHECK-SD-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT:    add v3.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    add v4.4s, v4.4s, v2.4s
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    sub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    sub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_add:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    dup v1.4s, v0.s[0]
+; CHECK-GI-NEXT:    dup v3.4s, v0.s[1]
+; CHECK-GI-NEXT:    dup v4.4s, v0.s[2]
+; CHECK-GI-NEXT:    dup v0.4s, v0.s[3]
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    add v3.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    add v4.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    sub v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    sub v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
+  %lanes = sext <4 x i16> %l to <4 x i32>
+  %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add0 = add <4 x i32> %shf0, %b
+  %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %add1 = add <4 x i32> %shf1, %b
+  %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %add2 = add <4 x i32> %shf2, %b
+  %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %add3 = add <4 x i32> %shf3, %b
+  %sub1 = sub <4 x i32> %add0, %add1
+  %sub2 = sub <4 x i32> %add2, %add3
+  %sub3 = sub <4 x i32> %sub1, %sub2
+  ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_one(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_one:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[3]
+; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_one:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    mla v0.4s, v1.4s, v3.s[3]
+; CHECK-GI-NEXT:    ret
+  %lanes = sext <4 x i16> %l to <4 x i32>
+  %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul3 = mul <4 x i32> %shf3, %a
+  %add3 = add <4 x i32> %mul3, %b
+  ret <4 x i32> %add3
+}
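
As a side note, the new check in `performBuildShuffleExtendCombine` could equivalently be phrased with a range-based loop over the extend's users. The fragment below is only a sketch, written under the assumption that `SDNode::users()` is available in this tree; it is not part of the patch, and unlike the patch it keeps scanning past a non-splat shuffle user instead of breaking out early:

    // Sketch only: an equivalent scan of the extend's users.
    EVT ExtendType = Extend->getValueType(0);
    if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
      const int NumElements = ExtendType.getVectorNumElements();
      SmallBitVector UsedElements(NumElements, false);
      for (SDNode *User : Extend.getNode()->users()) {
        if (User->getOpcode() != ISD::VECTOR_SHUFFLE ||
            User->getOperand(0) != Extend)
          continue;
        ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
        const int Idx = Mask[0];
        // A splat mask repeats one in-range index; undef (-1) lanes disqualify.
        if (Idx >= 0 && Idx < NumElements &&
            llvm::all_of(Mask, [Idx](int M) { return M == Idx; }))
          UsedElements.set(Idx);
      }
      // Every lane of the extend is consumed as a splat: keep the shuffles so
      // the indexed MLA/MUL forms can be selected.
      if (UsedElements.all())
        return SDValue();
    }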


