[llvm] Port `NVPTXTargetLowering::LowerCONCAT_VECTORS` to SelectionDAG (PR #120030)

Wed Mar 5 08:44:31 PST 2025

https://github.com/Esan5 updated https://github.com/llvm/llvm-project/pull/120030

>From 3b031ee5746065c358d0fad48d2b2786c3853e43 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Sun, 15 Dec 2024 19:31:53 -0600
Subject: [PATCH 01/15] first try

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 26 ++++++++++++++++---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   | 24 -----------------
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  1 -
 3 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2b8818482a333..0d83e0b509b94 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -191,6 +191,7 @@ class SelectionDAGLegalize {
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+  SDValue ExpandConcatVectors(SDNode* Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1517,10 +1518,27 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
                      BaseVecAlignment);
 }
 
+SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
+  assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
+  SDLoc Dl(Node);
+  SmallVector<SDValue, 0> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned I = 0; I < NumOperands; ++I) {
+    SDValue SubOp = Node->getOperand(I);
+    EVT VectorValueType =
+        SubOp->getValueType(0);
+    EVT ElementValueType = VectorValueType.getVectorElementType();
+    unsigned NumSubElem = VectorValueType.getVectorNumElements();
+    for (unsigned J = 0; J < NumSubElem; ++J) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ElementValueType,
+                                SubOp, DAG.getIntPtrConstant(J, Dl)));
+    }
+  }
+  return DAG.getBuildVector(Node->getValueType(0), Dl, Ops);
+}
+
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
-  assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
-          Node->getOpcode() == ISD::CONCAT_VECTORS) &&
-         "Unexpected opcode!");
+  assert(Node->getOpcode() == ISD::BUILD_VECTOR && "Unexpected opcode!");
 
   // We can't handle this case efficiently.  Allocate a sufficiently
   // aligned object on the stack, store each operand into it, then load
@@ -3375,7 +3393,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    Results.push_back(ExpandVectorBuildThroughStack(Node));
+    Results.push_back(ExpandConcatVectors(Node));
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3e755c25fd91a..f6793d53967b1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2110,28 +2110,6 @@ SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
   return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
 }
 
-// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
-// (see LegalizeDAG.cpp). This is slow and uses local memory.
-// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
-SDValue
-NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  SDNode *Node = Op.getNode();
-  SDLoc dl(Node);
-  SmallVector<SDValue, 8> Ops;
-  unsigned NumOperands = Node->getNumOperands();
-  for (unsigned i = 0; i < NumOperands; ++i) {
-    SDValue SubOp = Node->getOperand(i);
-    EVT VVT = SubOp.getNode()->getValueType(0);
-    EVT EltVT = VVT.getVectorElementType();
-    unsigned NumSubElem = VVT.getVectorNumElements();
-    for (unsigned j = 0; j < NumSubElem; ++j) {
-      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
-                                DAG.getIntPtrConstant(j, dl)));
-    }
-  }
-  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
-}
-
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
@@ -2777,8 +2755,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
-  case ISD::CONCAT_VECTORS:
-    return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index f41c569a65544..f8c17d7118bbb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -279,7 +279,6 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;

>From 39a8dc733eec34aa1855660c65184938a27121b7 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Mon, 16 Dec 2024 19:17:29 -0600
Subject: [PATCH 02/15] fix tests

---
 .../sve-streaming-mode-fixed-length-concat.ll | 82 +++++++++++++------
 ...ve-streaming-mode-fixed-length-int-mulh.ll | 72 +++++++---------
 ...treaming-mode-fixed-length-trunc-stores.ll |  4 +-
 3 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 6ec2b837eed2a..1b5a37bfb932b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -70,9 +70,12 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -181,9 +184,12 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -279,9 +285,14 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
@@ -441,9 +452,12 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
@@ -539,9 +553,14 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -754,12 +773,15 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -781,12 +803,15 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -805,12 +830,15 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i32_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 97f2e7a1e66cb..6510fb4818ef8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -1132,17 +1132,15 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
@@ -1179,23 +1177,19 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
 ; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
@@ -2331,17 +2325,15 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
@@ -2378,23 +2370,19 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
 ; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 13fcd94ea8a26..ae87128b5c3f9 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -142,9 +142,7 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-NEXT:    ldr x9, [x0]
 ; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret

>From 0fb31cc998cbeb248c5ec7def884506cd04a999c Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Mon, 16 Dec 2024 22:07:02 -0600
Subject: [PATCH 03/15] comments

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 0d83e0b509b94..55378fdee25b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1523,15 +1523,16 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   SDLoc Dl(Node);
   SmallVector<SDValue, 0> Ops;
   unsigned NumOperands = Node->getNumOperands();
+  MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
     EVT VectorValueType =
         SubOp->getValueType(0);
     EVT ElementValueType = VectorValueType.getVectorElementType();
     unsigned NumSubElem = VectorValueType.getVectorNumElements();
-    for (unsigned J = 0; J < NumSubElem; ++J) {
+    for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ElementValueType,
-                                SubOp, DAG.getIntPtrConstant(J, Dl)));
+                                SubOp, DAG.getConstant(Idx, Dl, VectorIdxType)));
     }
   }
   return DAG.getBuildVector(Node->getValueType(0), Dl, Ops);

>From 4ef0702d782d2b827e04e7805bf45a07ea863e4b Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Mon, 16 Dec 2024 22:49:07 -0600
Subject: [PATCH 04/15] prefer size 16 in small vector

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 55378fdee25b0..52818f848c0d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1521,7 +1521,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
 SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
   SDLoc Dl(Node);
-  SmallVector<SDValue, 0> Ops;
+  SmallVector<SDValue, 16> Ops;
   unsigned NumOperands = Node->getNumOperands();
   MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
   for (unsigned I = 0; I < NumOperands; ++I) {

>From f4d50559bdf770be710757447d89fc85481efec7 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Tue, 17 Dec 2024 15:54:49 -0600
Subject: [PATCH 05/15] trigger builds


>From 2746c506c97d0f9b6b11a21a539ea88ea3c57559 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Tue, 17 Dec 2024 20:36:49 -0600
Subject: [PATCH 06/15] comments

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 52818f848c0d6..18d5ecd00ccdb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1520,22 +1520,21 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
 
 SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
-  SDLoc Dl(Node);
+  SDLoc DL(Node);
   SmallVector<SDValue, 16> Ops;
   unsigned NumOperands = Node->getNumOperands();
   MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
-    EVT VectorValueType =
-        SubOp->getValueType(0);
+    EVT VectorValueType = SubOp.getValueType();
     EVT ElementValueType = VectorValueType.getVectorElementType();
     unsigned NumSubElem = VectorValueType.getVectorNumElements();
     for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
-      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ElementValueType,
-                                SubOp, DAG.getConstant(Idx, Dl, VectorIdxType)));
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
+                                SubOp, DAG.getConstant(Idx, DL, VectorIdxType)));
     }
   }
-  return DAG.getBuildVector(Node->getValueType(0), Dl, Ops);
+  return DAG.getBuildVector(Node->getValueType(0), DL, Ops);
 }
 
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {

>From 0a3a108869ac0412eb1745225968dc57a75c5bb4 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Thu, 19 Dec 2024 18:04:23 -0600
Subject: [PATCH 07/15] extract to legal type

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 18d5ecd00ccdb..ad917b938fe78 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -191,7 +191,7 @@ class SelectionDAGLegalize {
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
-  SDValue ExpandConcatVectors(SDNode* Node);
+  SDValue ExpandConcatVectors(SDNode *Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1527,11 +1527,13 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
     EVT VectorValueType = SubOp.getValueType();
-    EVT ElementValueType = VectorValueType.getVectorElementType();
+    EVT ElementValueType = TLI.getTypeToTransformTo(
+        *DAG.getContext(), VectorValueType.getVectorElementType());
     unsigned NumSubElem = VectorValueType.getVectorNumElements();
     for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
-                                SubOp, DAG.getConstant(Idx, DL, VectorIdxType)));
+                                SubOp,
+                                DAG.getConstant(Idx, DL, VectorIdxType)));
     }
   }
   return DAG.getBuildVector(Node->getValueType(0), DL, Ops);
@@ -1545,8 +1547,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
   // the result as a vector.
   // Create the stack frame object.
   EVT VT = Node->getValueType(0);
-  EVT MemVT = isa<BuildVectorSDNode>(Node) ? VT.getVectorElementType()
-                                           : Node->getOperand(0).getValueType();
+  EVT MemVT = VT.getVectorElementType();
   SDLoc dl(Node);
   SDValue FIPtr = DAG.CreateStackTemporary(VT);
   int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
@@ -1560,8 +1561,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
 
   // If the destination vector element type of a BUILD_VECTOR is narrower than
   // the source element type, only store the bits necessary.
-  bool Truncate = isa<BuildVectorSDNode>(Node) &&
-                  MemVT.bitsLT(Node->getOperand(0).getValueType());
+  bool Truncate = MemVT.bitsLT(Node->getOperand(0).getValueType());
 
   // Store (in the right endianness) the elements to memory.
   for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {

>From 963f0478430456b203a9ed55874b3687b780fc87 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Fri, 20 Dec 2024 14:11:12 -0600
Subject: [PATCH 08/15] use build through stack if type changes

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 19 ++++++---
 .../sve-streaming-mode-fixed-length-concat.ll | 40 +++++++------------
 2 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ad917b938fe78..59a0fc48b67a0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1524,11 +1524,14 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   SmallVector<SDValue, 16> Ops;
   unsigned NumOperands = Node->getNumOperands();
   MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
+  EVT VectorValueType = Node->getOperand(0).getValueType();
+  EVT ElementValueType = VectorValueType.getVectorElementType();
+  if (ElementValueType !=
+      TLI.getTypeToTransformTo(*DAG.getContext(), ElementValueType)) {
+    return ExpandVectorBuildThroughStack(Node);
+  }
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
-    EVT VectorValueType = SubOp.getValueType();
-    EVT ElementValueType = TLI.getTypeToTransformTo(
-        *DAG.getContext(), VectorValueType.getVectorElementType());
     unsigned NumSubElem = VectorValueType.getVectorNumElements();
     for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
@@ -1540,14 +1543,17 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
 }
 
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
-  assert(Node->getOpcode() == ISD::BUILD_VECTOR && "Unexpected opcode!");
+  assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
+          Node->getOpcode() == ISD::CONCAT_VECTORS) &&
+         "Unexpected opcode!");
 
   // We can't handle this case efficiently.  Allocate a sufficiently
   // aligned object on the stack, store each operand into it, then load
   // the result as a vector.
   // Create the stack frame object.
   EVT VT = Node->getValueType(0);
-  EVT MemVT = VT.getVectorElementType();
+  EVT MemVT = isa<BuildVectorSDNode>(Node) ? VT.getVectorElementType()
+                                           : Node->getOperand(0).getValueType();
   SDLoc dl(Node);
   SDValue FIPtr = DAG.CreateStackTemporary(VT);
   int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
@@ -1561,7 +1567,8 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
 
   // If the destination vector element type of a BUILD_VECTOR is narrower than
   // the source element type, only store the bits necessary.
-  bool Truncate = MemVT.bitsLT(Node->getOperand(0).getValueType());
+  bool Truncate = isa<BuildVectorSDNode>(Node) &&
+                  MemVT.bitsLT(Node->getOperand(0).getValueType());
 
   // Store (in the right endianness) the elements to memory.
   for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 1b5a37bfb932b..3997bf16c2aaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -70,12 +70,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
-; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -184,12 +181,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
-; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -773,15 +767,12 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
-; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -803,15 +794,12 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
-; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From ec4f649e8cf114b326513d172577890177fefa77 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Fri, 20 Dec 2024 14:50:17 -0600
Subject: [PATCH 09/15] just checking if the type changed is not sufficient

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  8 +---
 .../sve-streaming-mode-fixed-length-concat.ll | 40 ++++++++++++-------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 59a0fc48b67a0..ad29a104bcfad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1525,14 +1525,10 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   unsigned NumOperands = Node->getNumOperands();
   MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
   EVT VectorValueType = Node->getOperand(0).getValueType();
-  EVT ElementValueType = VectorValueType.getVectorElementType();
-  if (ElementValueType !=
-      TLI.getTypeToTransformTo(*DAG.getContext(), ElementValueType)) {
-    return ExpandVectorBuildThroughStack(Node);
-  }
+  unsigned NumSubElem = VectorValueType.getVectorNumElements();
+  EVT ElementValueType = TLI.getTypeToTransformTo(*DAG.getContext(), VectorValueType.getVectorElementType());
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
-    unsigned NumSubElem = VectorValueType.getVectorNumElements();
     for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
                                 SubOp,
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 3997bf16c2aaf..1b5a37bfb932b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -70,9 +70,12 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -181,9 +184,12 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -767,12 +773,15 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -794,12 +803,15 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From 1920fc0a21dc02033c89a28cba0f42cb65b32b94 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Sat, 21 Dec 2024 15:50:34 -0600
Subject: [PATCH 10/15] use ExpandVectorBuildThroughStack if extract vector elt
 is not legal or custom.

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 12 +++++-
 .../sve-streaming-mode-fixed-length-concat.ll | 40 +++++++------------
 2 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ad29a104bcfad..1b702e083ad82 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1526,7 +1526,8 @@ SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
   MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
   EVT VectorValueType = Node->getOperand(0).getValueType();
   unsigned NumSubElem = VectorValueType.getVectorNumElements();
-  EVT ElementValueType = TLI.getTypeToTransformTo(*DAG.getContext(), VectorValueType.getVectorElementType());
+  EVT ElementValueType = TLI.getTypeToTransformTo(
+      *DAG.getContext(), VectorValueType.getVectorElementType());
   for (unsigned I = 0; I < NumOperands; ++I) {
     SDValue SubOp = Node->getOperand(I);
     for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
@@ -3396,7 +3397,14 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    Results.push_back(ExpandConcatVectors(Node));
+    if (EVT ElementValueType =
+            Node->getOperand(0).getValueType().getVectorElementType();
+        TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT,
+                                     ElementValueType)) {
+      Results.push_back(ExpandConcatVectors(Node));
+    } else {
+      Results.push_back(ExpandVectorBuildThroughStack(Node));
+    }
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 1b5a37bfb932b..3997bf16c2aaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -70,12 +70,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
-; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -184,12 +181,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
-; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -773,15 +767,12 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
-; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -803,15 +794,12 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
-; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

>From 1654877406e5e5acd46a96aa5a716037c5673756 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <97700380+Esan5 at users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:49:54 -0500
Subject: [PATCH 11/15] try ExpandConcatVectors if extract_vector_elt is not an
 Expand operation

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  5 +-
 .../sve-streaming-mode-fixed-length-concat.ll | 42 ++++-------
 ...ve-streaming-mode-fixed-length-int-mulh.ll | 72 +++++++++++--------
 ...treaming-mode-fixed-length-trunc-stores.ll |  4 +-
 4 files changed, 59 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 1b702e083ad82..0d04d7198917a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3397,10 +3397,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    if (EVT ElementValueType =
-            Node->getOperand(0).getValueType().getVectorElementType();
-        TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT,
-                                     ElementValueType)) {
+    if (!TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT, Node->getOperand(0).getValueType())) {
       Results.push_back(ExpandConcatVectors(Node));
     } else {
       Results.push_back(ExpandVectorBuildThroughStack(Node));
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 3997bf16c2aaf..6ec2b837eed2a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -279,14 +279,9 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
@@ -446,12 +441,9 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
-; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
@@ -547,14 +539,9 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
-; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
-; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
-; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -818,15 +805,12 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i32_4op:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #32
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
-; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #32
+; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index 6510fb4818ef8..97f2e7a1e66cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -1132,15 +1132,17 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
@@ -1177,19 +1179,23 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
 ; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
 ; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
@@ -2325,15 +2331,17 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    sub sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
+; NONEON-NOSVE-NEXT:    add sp, sp, #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
@@ -2370,19 +2378,23 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
 ; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
 ; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
+; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index ae87128b5c3f9..13fcd94ea8a26 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -142,7 +142,9 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-NEXT:    ldr x9, [x0]
 ; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret

>From d95555ea26ccd17fd408d7f7208cb03b279e9554 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <97700380+Esan5 at users.noreply.github.com>
Date: Thu, 23 Jan 2025 11:59:16 -0500
Subject: [PATCH 12/15] remove braces negate and swap order

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 0d04d7198917a..0ef6fa4daac42 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3397,11 +3397,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    if (!TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT, Node->getOperand(0).getValueType())) {
-      Results.push_back(ExpandConcatVectors(Node));
-    } else {
+    if (Node->getValueType().isScalableVector() ||
+        TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT,
+                              Node->getOperand(0).getValueType()))
       Results.push_back(ExpandVectorBuildThroughStack(Node));
-    }
+    else
+      Results.push_back(ExpandConcatVectors(Node));
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));

>From 1c9fa8e98f8921a51a9364bbfd8bc181e831c6f4 Mon Sep 17 00:00:00 2001
From: Ethan Kaji <97700380+Esan5 at users.noreply.github.com>
Date: Mon, 27 Jan 2025 13:58:52 -0500
Subject: [PATCH 13/15] scalable vector guard with vector value type

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 0ef6fa4daac42..1dd1461335b59 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3397,9 +3397,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    if (Node->getValueType().isScalableVector() ||
-        TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT,
-                              Node->getOperand(0).getValueType()))
+    if (EVT VectorValueType = Node->getOperand(0).getValueType();
+        VectorValueType.isScalableVector() ||
+        TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT, VectorValueType))
       Results.push_back(ExpandVectorBuildThroughStack(Node));
     else
       Results.push_back(ExpandConcatVectors(Node));

>From d1bab4d606aebd486bf4fca759d8ada23da816de Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Tue, 4 Mar 2025 18:14:42 -0500
Subject: [PATCH 14/15] remove dead code

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 29 +------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 1dd1461335b59..2b8818482a333 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -191,7 +191,6 @@ class SelectionDAGLegalize {
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
-  SDValue ExpandConcatVectors(SDNode *Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1518,27 +1517,6 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
                      BaseVecAlignment);
 }
 
-SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
-  assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
-  SDLoc DL(Node);
-  SmallVector<SDValue, 16> Ops;
-  unsigned NumOperands = Node->getNumOperands();
-  MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
-  EVT VectorValueType = Node->getOperand(0).getValueType();
-  unsigned NumSubElem = VectorValueType.getVectorNumElements();
-  EVT ElementValueType = TLI.getTypeToTransformTo(
-      *DAG.getContext(), VectorValueType.getVectorElementType());
-  for (unsigned I = 0; I < NumOperands; ++I) {
-    SDValue SubOp = Node->getOperand(I);
-    for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
-      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
-                                SubOp,
-                                DAG.getConstant(Idx, DL, VectorIdxType)));
-    }
-  }
-  return DAG.getBuildVector(Node->getValueType(0), DL, Ops);
-}
-
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
   assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
           Node->getOpcode() == ISD::CONCAT_VECTORS) &&
@@ -3397,12 +3375,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    if (EVT VectorValueType = Node->getOperand(0).getValueType();
-        VectorValueType.isScalableVector() ||
-        TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT, VectorValueType))
-      Results.push_back(ExpandVectorBuildThroughStack(Node));
-    else
-      Results.push_back(ExpandConcatVectors(Node));
+    Results.push_back(ExpandVectorBuildThroughStack(Node));
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));

>From 4c4083fe24b95a72a1346ebfd31ca046de98caed Mon Sep 17 00:00:00 2001
From: Ethan Kaji <ethan.kaji at gmail.com>
Date: Wed, 5 Mar 2025 11:44:05 -0500
Subject: [PATCH 15/15] Revert "remove dead code"

This reverts commit d1bab4d606aebd486bf4fca759d8ada23da816de.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2b8818482a333..1dd1461335b59 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -191,6 +191,7 @@ class SelectionDAGLegalize {
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+  SDValue ExpandConcatVectors(SDNode *Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1517,6 +1518,27 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
                      BaseVecAlignment);
 }
 
+SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
+  assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
+  SDLoc DL(Node);
+  SmallVector<SDValue, 16> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  MVT VectorIdxType = TLI.getVectorIdxTy(DAG.getDataLayout());
+  EVT VectorValueType = Node->getOperand(0).getValueType();
+  unsigned NumSubElem = VectorValueType.getVectorNumElements();
+  EVT ElementValueType = TLI.getTypeToTransformTo(
+      *DAG.getContext(), VectorValueType.getVectorElementType());
+  for (unsigned I = 0; I < NumOperands; ++I) {
+    SDValue SubOp = Node->getOperand(I);
+    for (unsigned Idx = 0; Idx < NumSubElem; ++Idx) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElementValueType,
+                                SubOp,
+                                DAG.getConstant(Idx, DL, VectorIdxType)));
+    }
+  }
+  return DAG.getBuildVector(Node->getValueType(0), DL, Ops);
+}
+
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
   assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
           Node->getOpcode() == ISD::CONCAT_VECTORS) &&
@@ -3375,7 +3397,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    Results.push_back(ExpandVectorBuildThroughStack(Node));
+    if (EVT VectorValueType = Node->getOperand(0).getValueType();
+        VectorValueType.isScalableVector() ||
+        TLI.isOperationExpand(ISD::EXTRACT_VECTOR_ELT, VectorValueType))
+      Results.push_back(ExpandVectorBuildThroughStack(Node));
+    else
+      Results.push_back(ExpandConcatVectors(Node));
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));