[llvm] [SDAG] Match BUILD_VECTOR in INSERT_SUBVECTOR to SPLAT_VECTOR fold (PR #163984)

Fri Oct 17 09:29:37 PDT 2025

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/163984

>From 1e605fc103c40ea227314cd2cd24ab86d9cdc8a3 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 17 Oct 2025 15:50:51 +0000
Subject: [PATCH 1/4] Precommit tests

---
 .../fixed-subvector-insert-into-scalable.ll   | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll

diff --git a/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
new file mode 100644
index 0000000000000..41a3270966f22
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 4 x i32> @insert_div() {
+; CHECK-LABEL: insert_div:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
+; CHECK-NEXT:    movi v0.4s, #9
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    movk w8, #43690, lsl #16
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    lsr z0.s, z0.s, #1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 9), i64 0)
+  %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
+  ret <vscale x 4 x i32> %div
+}
+
+define <vscale x 4 x i32> @insert_mul() {
+; CHECK-LABEL: insert_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.4s, #1
+; CHECK-NEXT:    mul z0.s, z0.s, #7
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 1), i64 0)
+  %mul = mul <vscale x 4 x i32> %0, splat (i32 7)
+  ret <vscale x 4 x i32> %mul
+}
+
+define <vscale x 4 x i32> @insert_add() {
+; CHECK-LABEL: insert_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.4s, #5
+; CHECK-NEXT:    add z0.s, z0.s, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 5), i64 0)
+  %add = add <vscale x 4 x i32> %0, splat (i32 11)
+  ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @insert_sub() {
+; CHECK-LABEL: insert_sub:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.4s, #11
+; CHECK-NEXT:    sub z0.s, z0.s, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 11), i64 0)
+  %sub = add <vscale x 4 x i32> %0, splat (i32 -11)
+  ret <vscale x 4 x i32> %sub
+}

>From 76bfc78b0570929581be4e6fbe547e28b413c11e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 17 Oct 2025 15:52:08 +0000
Subject: [PATCH 2/4] [SDAG] Match BUILD_VECTOR in INSERT_SUBVECTOR to
 SPLAT_VECTOR fold

This allows for more constant folding when inserting fixed-length vector
splats into scalable vectors.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   |  8 ++++++--
 .../fixed-subvector-insert-into-scalable.ll     | 17 ++++-------------
 llvm/test/CodeGen/AArch64/vecreduce-add.ll      |  2 +-
 llvm/test/CodeGen/X86/pr35443.ll                |  2 +-
 4 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..8e74bcb25da63 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -28014,9 +28014,13 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
 
   // Simplify scalar inserts into an undef vector:
   // insert_subvector undef, (splat X), N2 -> splat X
-  if (N0.isUndef() && N1.getOpcode() == ISD::SPLAT_VECTOR)
-    if (DAG.isConstantValueOfAnyType(N1.getOperand(0)) || N1.hasOneUse())
+  auto *BV0 = dyn_cast<BuildVectorSDNode>(N1);
+  if (N0.isUndef() && (N1.getOpcode() == ISD::SPLAT_VECTOR || BV0)) {
+    SDValue Splat = BV0 ? BV0->getSplatValue() : N1.getOperand(0);
+    if (Splat &&
+        (N1.hasOneUse() || (!BV0 && DAG.isConstantValueOfAnyType(Splat))))
       return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0));
+  }
 
   // insert_subvector (splat X), (splat X), N2 -> splat X
   if (N0.getOpcode() == ISD::SPLAT_VECTOR && N0.getOpcode() == N1.getOpcode() &&
diff --git a/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
index 41a3270966f22..5ce5baf42f5f5 100644
--- a/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
@@ -4,13 +4,7 @@
 define <vscale x 4 x i32> @insert_div() {
 ; CHECK-LABEL: insert_div:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #43691 // =0xaaab
-; CHECK-NEXT:    movi v0.4s, #9
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movk w8, #43690, lsl #16
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
+; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 9), i64 0)
@@ -21,8 +15,7 @@ entry:
 define <vscale x 4 x i32> @insert_mul() {
 ; CHECK-LABEL: insert_mul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.4s, #1
-; CHECK-NEXT:    mul z0.s, z0.s, #7
+; CHECK-NEXT:    mov z0.s, #7 // =0x7
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 1), i64 0)
@@ -33,8 +26,7 @@ entry:
 define <vscale x 4 x i32> @insert_add() {
 ; CHECK-LABEL: insert_add:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.4s, #5
-; CHECK-NEXT:    add z0.s, z0.s, #11 // =0xb
+; CHECK-NEXT:    mov z0.s, #16 // =0x10
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 5), i64 0)
@@ -45,8 +37,7 @@ entry:
 define <vscale x 4 x i32> @insert_sub() {
 ; CHECK-LABEL: insert_sub:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.4s, #11
-; CHECK-NEXT:    sub z0.s, z0.s, #11 // =0xb
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 11), i64 0)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 2d0df562b9a4b..f56e1680f79c6 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4778,7 +4778,7 @@ entry:
 define i64 @extract_scalable(<2 x i32> %0) "target-features"="+sve2" {
 ; CHECK-SD-LABEL: extract_scalable:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    movi v1.2s, #1
+; CHECK-SD-NEXT:    mov z1.s, #1 // =0x1
 ; CHECK-SD-NEXT:    ptrue p0.s, vl2
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-SD-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
diff --git a/llvm/test/CodeGen/X86/pr35443.ll b/llvm/test/CodeGen/X86/pr35443.ll
index 430a1380c7c8c..8438b73acac23 100644
--- a/llvm/test/CodeGen/X86/pr35443.ll
+++ b/llvm/test/CodeGen/X86/pr35443.ll
@@ -8,7 +8,7 @@
 define void @pr35443() {
 ; CHECK-LABEL: pr35443:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastb ac+4(%rip), %xmm0
+; CHECK-NEXT:    vpbroadcastb ac+4(%rip), %ymm0
 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsubq %ymm0, %ymm1, %ymm0

>From 67fb3ea85dc805832478eacf39f5797b145b6169 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 17 Oct 2025 16:04:08 +0000
Subject: [PATCH 3/4] undef > poison

---
 .../AArch64/fixed-subvector-insert-into-scalable.ll       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
index 5ce5baf42f5f5..8758f5a4e244d 100644
--- a/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-subvector-insert-into-scalable.ll
@@ -7,7 +7,7 @@ define <vscale x 4 x i32> @insert_div() {
 ; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 9), i64 0)
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 9), i64 0)
   %div = udiv <vscale x 4 x i32> %0, splat (i32 3)
   ret <vscale x 4 x i32> %div
 }
@@ -18,7 +18,7 @@ define <vscale x 4 x i32> @insert_mul() {
 ; CHECK-NEXT:    mov z0.s, #7 // =0x7
 ; CHECK-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 1), i64 0)
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 1), i64 0)
   %mul = mul <vscale x 4 x i32> %0, splat (i32 7)
   ret <vscale x 4 x i32> %mul
 }
@@ -29,7 +29,7 @@ define <vscale x 4 x i32> @insert_add() {
 ; CHECK-NEXT:    mov z0.s, #16 // =0x10
 ; CHECK-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 5), i64 0)
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 5), i64 0)
   %add = add <vscale x 4 x i32> %0, splat (i32 11)
   ret <vscale x 4 x i32> %add
 }
@@ -40,7 +40,7 @@ define <vscale x 4 x i32> @insert_sub() {
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ret
 entry:
-  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> splat (i32 11), i64 0)
+  %0 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> splat (i32 11), i64 0)
   %sub = add <vscale x 4 x i32> %0, splat (i32 -11)
   ret <vscale x 4 x i32> %sub
 }

>From 52e5a38e3a226230a7894a3eee2f509f1c36efef Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 17 Oct 2025 16:28:55 +0000
Subject: [PATCH 4/4] Relax one use check for constants

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8e74bcb25da63..1ad4bff165607 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -28017,8 +28017,9 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   auto *BV0 = dyn_cast<BuildVectorSDNode>(N1);
   if (N0.isUndef() && (N1.getOpcode() == ISD::SPLAT_VECTOR || BV0)) {
     SDValue Splat = BV0 ? BV0->getSplatValue() : N1.getOperand(0);
-    if (Splat &&
-        (N1.hasOneUse() || (!BV0 && DAG.isConstantValueOfAnyType(Splat))))
+    bool SplatLegal = TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR, VT);
+    if (Splat && (N1.hasOneUse() || (DAG.isConstantValueOfAnyType(Splat) &&
+                                     (!BV0 || SplatLegal))))
       return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0));
   }