[llvm] [InstCombine, AArch64] Avoid vector Ext when the by-element operation variant applies for all elements (PR #140733)
Ahmad Yasin via llvm-commits
llvm-commits at lists.llvm.org
Thu May 22 05:15:00 PDT 2025
https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/140733
>From 039e713b860391cb8b70c32a26489c613d59afe0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 20 May 2025 15:08:15 +0300
Subject: [PATCH 1/2] Avoid the vector SExt/ZExt when the indexed operation
 variant applies to all elements of the vector
---
.../Target/AArch64/AArch64ISelLowering.cpp | 27 +++
.../CodeGen/AArch64/aarch64-ext-shuffle.ll | 166 ++++++++++++++++++
2 files changed, 193 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4e45162a687f8..31aef63eebefc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18602,6 +18602,33 @@ static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
SeenZExtOrSExt = true;
}
+ // Avoid the above use of vector SExt/ZExt when all vector elements are
+ // consumed and each shuffle's mask uses the same index, in order to permit
+ // use of indexed OP (e.g. MLA, MUL) variants.
+ EVT ExtendType = Extend->getValueType(0);
+ if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
+ const int NumElements = ExtendType.getVectorNumElements();
+ SmallBitVector UsedElements(NumElements, false);
+ for (auto UI = Extend.getNode()->use_begin(),
+ UE = Extend.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = UI->getUser();
+ if (User->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ User->getOperand(0) == Extend) {
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
+ const int Idx = Mask[0];
+ if (Idx >= NumElements)
+ continue;
+ if (llvm::all_of(Mask, [Idx](int M) { return M == Idx; }))
+ UsedElements.set(Idx);
+ else
+ break; // early loop exit to help performance
+ }
+ }
+ if (UsedElements.all())
+ return SDValue();
+ }
+
SDValue NBV;
SDLoc DL(BV);
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
new file mode 100644
index 0000000000000..6aacf7b7709b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK-GI
+define <4 x i32> @ext_shuffle_v4i16_v4i32(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: mov v3.16b, v2.16b
+; CHECK-SD-NEXT: mov v4.16b, v2.16b
+; CHECK-SD-NEXT: mov v5.16b, v2.16b
+; CHECK-SD-NEXT: mla v3.4s, v1.4s, v0.s[0]
+; CHECK-SD-NEXT: mla v4.4s, v1.4s, v0.s[1]
+; CHECK-SD-NEXT: mla v2.4s, v1.4s, v0.s[3]
+; CHECK-SD-NEXT: mla v5.4s, v1.4s, v0.s[2]
+; CHECK-SD-NEXT: sub v0.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT: sub v1.4s, v2.4s, v5.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v3.16b, v2.16b
+; CHECK-GI-NEXT: mov v4.16b, v2.16b
+; CHECK-GI-NEXT: mov v5.16b, v2.16b
+; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
+; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[3]
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.s[2]
+; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: sub v1.4s, v5.4s, v2.4s
+; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
+ %lanes = sext <4 x i16> %l to <4 x i32>
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul0 = mul <4 x i32> %shf0, %a
+ %add0 = add <4 x i32> %mul0, %b
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul1 = mul <4 x i32> %shf1, %a
+ %add1 = add <4 x i32> %mul1, %b
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %mul2 = mul <4 x i32> %shf2, %a
+ %add2 = add <4 x i32> %mul2, %b
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul3 = mul <4 x i32> %shf3, %a
+ %add3 = add <4 x i32> %mul3, %b
+ %sub1 = sub <4 x i32> %add0, %add1
+ %sub2 = sub <4 x i32> %add2, %add3
+ %sub3 = sub <4 x i32> %sub1, %sub2
+ ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_partial(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_partial:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: dup v3.4h, v0.h[0]
+; CHECK-SD-NEXT: dup v4.4h, v0.h[1]
+; CHECK-SD-NEXT: mov v5.16b, v2.16b
+; CHECK-SD-NEXT: dup v0.4h, v0.h[2]
+; CHECK-SD-NEXT: mov v6.16b, v2.16b
+; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT: sshll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: mla v5.4s, v3.4s, v1.4s
+; CHECK-SD-NEXT: mla v6.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: sub v0.4s, v5.4s, v6.4s
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_partial:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v3.16b, v2.16b
+; CHECK-GI-NEXT: mov v4.16b, v2.16b
+; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
+; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[2]
+; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: sub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: ret
+ %lanes = sext <4 x i16> %l to <4 x i32>
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul0 = mul <4 x i32> %shf0, %a
+ %add0 = add <4 x i32> %mul0, %b
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul1 = mul <4 x i32> %shf1, %a
+ %add1 = add <4 x i32> %mul1, %b
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %mul2 = mul <4 x i32> %shf2, %a
+ %add2 = add <4 x i32> %mul2, %b
+ %sub1 = sub <4 x i32> %add0, %add1
+ %sub3 = sub <4 x i32> %sub1, %add2
+ ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_add(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_add:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: dup v1.4s, v0.s[0]
+; CHECK-SD-NEXT: dup v3.4s, v0.s[1]
+; CHECK-SD-NEXT: dup v4.4s, v0.s[2]
+; CHECK-SD-NEXT: dup v0.4s, v0.s[3]
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: add v3.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: add v4.4s, v4.4s, v2.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: sub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_add:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: dup v1.4s, v0.s[0]
+; CHECK-GI-NEXT: dup v3.4s, v0.s[1]
+; CHECK-GI-NEXT: dup v4.4s, v0.s[2]
+; CHECK-GI-NEXT: dup v0.4s, v0.s[3]
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v3.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: add v4.4s, v4.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: sub v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ret
+ %lanes = sext <4 x i16> %l to <4 x i32>
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %add0 = add <4 x i32> %shf0, %b
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %add1 = add <4 x i32> %shf1, %b
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %add2 = add <4 x i32> %shf2, %b
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %add3 = add <4 x i32> %shf3, %b
+ %sub1 = sub <4 x i32> %add0, %add1
+ %sub2 = sub <4 x i32> %add2, %add3
+ %sub3 = sub <4 x i32> %sub1, %sub2
+ ret <4 x i32> %sub3
+}
+
+define <4 x i32> @ext_shuffle_v4i16_v4i32_one(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_one:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: dup v0.4h, v0.h[3]
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_one:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: mla v0.4s, v1.4s, v3.s[3]
+; CHECK-GI-NEXT: ret
+ %lanes = sext <4 x i16> %l to <4 x i32>
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul3 = mul <4 x i32> %shf3, %a
+ %add3 = add <4 x i32> %mul3, %b
+ ret <4 x i32> %add3
+}
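To make the intent of the guard above concrete, here is a small standalone sketch of the "every lane is consumed by a splat shuffle" check that PATCH 1/2 adds. It is plain C++ rather than the LLVM sources, and the helper name allLanesCoveredBySplats is made up for illustration; it models only the bit-vector bookkeeping, whereas the real code also stops scanning at the first non-splat mask.

#include <cstdio>
#include <vector>

// Returns true when every lane 0..NumElements-1 appears as the splat index
// of some shuffle mask, i.e. the lane-indexed MLA/MUL form could cover all
// lanes and the vector extend should be kept as-is.
static bool allLanesCoveredBySplats(int NumElements,
                                    const std::vector<std::vector<int>> &Masks) {
  std::vector<bool> Used(NumElements, false);
  for (const auto &Mask : Masks) {
    if (Mask.empty())
      continue;
    const int Idx = Mask[0];
    if (Idx < 0 || Idx >= NumElements)
      continue; // not a lane taken from the first shuffle operand
    bool IsSplat = true;
    for (int M : Mask)
      IsSplat &= (M == Idx);
    if (IsSplat)
      Used[Idx] = true;
  }
  for (bool B : Used)
    if (!B)
      return false;
  return true;
}

int main() {
  // Four splat shuffles of a <4 x i32> extend, one per lane, mirroring the
  // ext_shuffle_v4i16_v4i32 test above: the combine should bail out here.
  std::vector<std::vector<int>> Masks = {
      {0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}};
  std::printf("all lanes covered: %d\n", allLanesCoveredBySplats(4, Masks));
  return 0;
}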
>From 87d224f9bc18b56144196adf7962b3225742c08e Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 22 May 2025 15:14:29 +0300
Subject: [PATCH 2/2] Simplify the loop over Extend users + avoid undef use in
 tests
---
.../Target/AArch64/AArch64ISelLowering.cpp | 5 +---
.../CodeGen/AArch64/aarch64-ext-shuffle.ll | 24 +++++++++----------
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 31aef63eebefc..c10fb7751329e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18609,10 +18609,7 @@ static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
const int NumElements = ExtendType.getVectorNumElements();
SmallBitVector UsedElements(NumElements, false);
- for (auto UI = Extend.getNode()->use_begin(),
- UE = Extend.getNode()->use_end();
- UI != UE; ++UI) {
- SDNode *User = UI->getUser();
+ for (SDNode *User : Extend.getNode()->users()) {
if (User->getOpcode() == ISD::VECTOR_SHUFFLE &&
User->getOperand(0) == Extend) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
index 6aacf7b7709b3..5d2d3f4082593 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
@@ -32,16 +32,16 @@ define <4 x i32> @ext_shuffle_v4i16_v4i32(<4 x i16> %l, <4 x i32> %a, <4 x i32>
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
%lanes = sext <4 x i16> %l to <4 x i32>
- %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%mul0 = mul <4 x i32> %shf0, %a
%add0 = add <4 x i32> %mul0, %b
- %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul1 = mul <4 x i32> %shf1, %a
%add1 = add <4 x i32> %mul1, %b
- %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%mul2 = mul <4 x i32> %shf2, %a
%add2 = add <4 x i32> %mul2, %b
- %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul3 = mul <4 x i32> %shf3, %a
%add3 = add <4 x i32> %mul3, %b
%sub1 = sub <4 x i32> %add0, %add1
@@ -81,13 +81,13 @@ define <4 x i32> @ext_shuffle_v4i16_v4i32_partial(<4 x i16> %l, <4 x i32> %a, <4
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: ret
%lanes = sext <4 x i16> %l to <4 x i32>
- %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%mul0 = mul <4 x i32> %shf0, %a
%add0 = add <4 x i32> %mul0, %b
- %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul1 = mul <4 x i32> %shf1, %a
%add1 = add <4 x i32> %mul1, %b
- %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%mul2 = mul <4 x i32> %shf2, %a
%add2 = add <4 x i32> %mul2, %b
%sub1 = sub <4 x i32> %add0, %add1
@@ -128,13 +128,13 @@ define <4 x i32> @ext_shuffle_v4i16_v4i32_add(<4 x i16> %l, <4 x i32> %a, <4 x i
; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: ret
%lanes = sext <4 x i16> %l to <4 x i32>
- %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer
+ %shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%add0 = add <4 x i32> %shf0, %b
- %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%add1 = add <4 x i32> %shf1, %b
- %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ %shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%add2 = add <4 x i32> %shf2, %b
- %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%add3 = add <4 x i32> %shf3, %b
%sub1 = sub <4 x i32> %add0, %add1
%sub2 = sub <4 x i32> %add2, %add3
@@ -159,7 +159,7 @@ define <4 x i32> @ext_shuffle_v4i16_v4i32_one(<4 x i16> %l, <4 x i32> %a, <4 x i
; CHECK-GI-NEXT: mla v0.4s, v1.4s, v3.s[3]
; CHECK-GI-NEXT: ret
%lanes = sext <4 x i16> %l to <4 x i32>
- %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul3 = mul <4 x i32> %shf3, %a
%add3 = add <4 x i32> %mul3, %b
ret <4 x i32> %add3
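For reference, this is how the new guard in performBuildShuffleExtendCombine() reads once both patches are applied; it is reconstructed from the two diffs above as a summary of the end state, not compiled or tested here.

  // Avoid the above use of vector SExt/ZExt when all vector elements are
  // consumed and each shuffle's mask uses the same index, in order to permit
  // use of indexed OP (e.g. MLA, MUL) variants.
  EVT ExtendType = Extend->getValueType(0);
  if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
    const int NumElements = ExtendType.getVectorNumElements();
    SmallBitVector UsedElements(NumElements, false);
    for (SDNode *User : Extend.getNode()->users()) {
      if (User->getOpcode() == ISD::VECTOR_SHUFFLE &&
          User->getOperand(0) == Extend) {
        ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
        const int Idx = Mask[0];
        if (Idx >= NumElements)
          continue;
        if (llvm::all_of(Mask, [Idx](int M) { return M == Idx; }))
          UsedElements.set(Idx);
        else
          break; // early loop exit to help performance
      }
    }
    if (UsedElements.all())
      return SDValue();
  }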