[llvm-branch-commits] [llvm] [SelectionDAG] Fold subvector inserts into concat operands (PR #200937)

Mon Jun 1 16:58:11 PDT 2026

https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/200937

>From ef3893c5e1ba73c2828533856292ba7a71ef9a63 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Sat, 30 May 2026 02:18:03 +0000
Subject: [PATCH 1/2] [SelectionDAG] Fold subvector inserts into concat
 operands

Push an INSERT_SUBVECTOR into the CONCAT_VECTORS operand that contains it when the inserted subvector lies wholly within that single operand.

AI note: an LLM generated the code and the test; I have read and reviewed both.

Co-Authored-By: OpenAI Codex <codex at openai.com>
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 +++++++++---
 .../AMDGPU/dagcombine-insert-concat.ll        | 72 +++++++++++++++++++
 2 files changed, 107 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 58fc5ece9f3d3..2b93d2236df25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29534,16 +29534,41 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
     }
   }
 
-  // If the input vector is a concatenation, and the insert replaces
-  // one of the pieces, we can optimize into a single concat_vectors.
-  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
-      N0.getOperand(0).getValueType() == N1.getValueType() &&
-      N0.getOperand(0).getValueType().isScalableVector() ==
-          N1.getValueType().isScalableVector()) {
-    unsigned Factor = N1.getValueType().getVectorMinNumElements();
-    SmallVector<SDValue, 8> Ops(N0->ops());
-    Ops[InsIdx / Factor] = N1;
-    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+  // If the input vector is a concatenation and the insert is wholly contained
+  // in one of its operands, push the insertion into that operand.
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse()) {
+    EVT ConcatOpVT = N0.getOperand(0).getValueType();
+    EVT InsVT = N1.getValueType();
+    unsigned Factor = ConcatOpVT.getVectorMinNumElements();
+    unsigned ConcatOpIdx = InsIdx / Factor;
+    unsigned RelativeIdx = InsIdx - ConcatOpIdx * Factor;
+    if (ConcatOpIdx < N0.getNumOperands()) {
+      // If the insert replaces a whole concat operand, optimize into a single
+      // concat_vectors.
+      if (ConcatOpVT == InsVT &&
+          ConcatOpVT.isScalableVector() == InsVT.isScalableVector() &&
+          RelativeIdx == 0) {
+        SmallVector<SDValue, 8> Ops(N0->ops());
+        Ops[ConcatOpIdx] = N1;
+        return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+      }
+
+      if (VT.isFixedLengthVector() && ConcatOpVT.isFixedLengthVector() &&
+          InsVT.isFixedLengthVector() &&
+          ConcatOpVT.getVectorElementType() == InsVT.getVectorElementType() &&
+          hasOperation(ISD::INSERT_SUBVECTOR, ConcatOpVT)) {
+        unsigned NumConcatOpElts = ConcatOpVT.getVectorNumElements();
+        unsigned NumInsElts = InsVT.getVectorNumElements();
+        if (RelativeIdx % NumInsElts == 0 &&
+            RelativeIdx + NumInsElts <= NumConcatOpElts) {
+          SmallVector<SDValue, 8> Ops(N0->ops());
+          Ops[ConcatOpIdx] = DAG.getNode(
+              ISD::INSERT_SUBVECTOR, SDLoc(N), ConcatOpVT, Ops[ConcatOpIdx],
+              N1, DAG.getVectorIdxConstant(RelativeIdx, SDLoc(N)));
+          return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+        }
+      }
+    }
   }
 
   // Simplify source operands based on insertion.
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll
new file mode 100644
index 0000000000000..5d53859b16952
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-insert-concat.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \
+; RUN:   -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s \
+; RUN:   --check-prefix=COMBINE \
+; RUN:   --implicit-check-not=REG_SEQUENCE --implicit-check-not=INSERT_SUBREG
+; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \
+; RUN:   -verify-machineinstrs -combiner-disabled -stop-after=amdgpu-isel < %s \
+; RUN:   | FileCheck %s --check-prefix=NOCOMBINE
+
+declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64 immarg)
+
+define <8 x i32> @insert_into_concat_operand(<4 x i32> %a, <4 x i32> %b, <2 x i32> %sub) nounwind {
+  ; COMBINE-LABEL: name: insert_into_concat_operand
+  ; COMBINE: bb.0 (%ir-block.0):
+  ; COMBINE-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr8, $vgpr9
+  ; COMBINE-NEXT: {{  $}}
+  ; COMBINE-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; COMBINE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; COMBINE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; COMBINE-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; COMBINE-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; COMBINE-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; COMBINE-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; COMBINE-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; COMBINE-NEXT:   $vgpr0 = COPY [[COPY7]]
+  ; COMBINE-NEXT:   $vgpr1 = COPY [[COPY6]]
+  ; COMBINE-NEXT:   $vgpr2 = COPY [[COPY5]]
+  ; COMBINE-NEXT:   $vgpr3 = COPY [[COPY4]]
+  ; COMBINE-NEXT:   $vgpr4 = COPY [[COPY3]]
+  ; COMBINE-NEXT:   $vgpr5 = COPY [[COPY2]]
+  ; COMBINE-NEXT:   $vgpr6 = COPY [[COPY1]]
+  ; COMBINE-NEXT:   $vgpr7 = COPY [[COPY]]
+  ; COMBINE-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  ;
+  ; NOCOMBINE-LABEL: name: insert_into_concat_operand
+  ; NOCOMBINE: bb.0 (%ir-block.0):
+  ; NOCOMBINE-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+  ; NOCOMBINE-NEXT: {{  $}}
+  ; NOCOMBINE-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+  ; NOCOMBINE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; NOCOMBINE-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+  ; NOCOMBINE-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+  ; NOCOMBINE-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; NOCOMBINE-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; NOCOMBINE-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; NOCOMBINE-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; NOCOMBINE-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; NOCOMBINE-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; NOCOMBINE-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_256_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3, [[COPY5]], %subreg.sub4, [[COPY4]], %subreg.sub5, [[COPY3]], %subreg.sub6, [[COPY2]], %subreg.sub7
+  ; NOCOMBINE-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:av_256_align2 = INSERT_SUBREG [[REG_SEQUENCE]], [[COPY1]], %subreg.sub6
+  ; NOCOMBINE-NEXT:   [[INSERT_SUBREG1:%[0-9]+]]:av_256_align2 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY]], %subreg.sub7
+  ; NOCOMBINE-NEXT:   [[COPY10:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub0
+  ; NOCOMBINE-NEXT:   [[COPY11:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub1
+  ; NOCOMBINE-NEXT:   [[COPY12:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub2
+  ; NOCOMBINE-NEXT:   [[COPY13:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub3
+  ; NOCOMBINE-NEXT:   [[COPY14:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub4
+  ; NOCOMBINE-NEXT:   [[COPY15:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub5
+  ; NOCOMBINE-NEXT:   [[COPY16:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub6
+  ; NOCOMBINE-NEXT:   [[COPY17:%[0-9]+]]:av_32 = COPY [[INSERT_SUBREG1]].sub7
+  ; NOCOMBINE-NEXT:   $vgpr0 = COPY [[COPY10]]
+  ; NOCOMBINE-NEXT:   $vgpr1 = COPY [[COPY11]]
+  ; NOCOMBINE-NEXT:   $vgpr2 = COPY [[COPY12]]
+  ; NOCOMBINE-NEXT:   $vgpr3 = COPY [[COPY13]]
+  ; NOCOMBINE-NEXT:   $vgpr4 = COPY [[COPY14]]
+  ; NOCOMBINE-NEXT:   $vgpr5 = COPY [[COPY15]]
+  ; NOCOMBINE-NEXT:   $vgpr6 = COPY [[COPY16]]
+  ; NOCOMBINE-NEXT:   $vgpr7 = COPY [[COPY17]]
+  ; NOCOMBINE-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
+  %wide = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %ins = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %wide, <2 x i32> %sub, i64 6)
+  ret <8 x i32> %ins
+}

>From 5f851f164a3b59e9a8b322bdf16ed0c7de9b340f Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 1 Jun 2026 23:57:48 +0000
Subject: [PATCH 2/2] clang-format, newer API

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2b93d2236df25..f5b81b337fd57 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29562,9 +29562,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
         if (RelativeIdx % NumInsElts == 0 &&
             RelativeIdx + NumInsElts <= NumConcatOpElts) {
           SmallVector<SDValue, 8> Ops(N0->ops());
-          Ops[ConcatOpIdx] = DAG.getNode(
-              ISD::INSERT_SUBVECTOR, SDLoc(N), ConcatOpVT, Ops[ConcatOpIdx],
-              N1, DAG.getVectorIdxConstant(RelativeIdx, SDLoc(N)));
+          Ops[ConcatOpIdx] = DAG.getInsertSubvector(SDLoc(N), Ops[ConcatOpIdx],
+                                                    N1, RelativeIdx);
           return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
         }
       }