[llvm] Port NVPTXTargetLowering::LowerCONCAT_VECTORS to llvm/lib/CodeGen/SelectionDAG (PR #120030)

Mon Dec 16 17:31:32 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-selectiondag

Author: Ethan Kaji (Esan5)

<details>
<summary>Changes</summary>

Ports `NVPTXTargetLowering::LowerCONCAT_VECTORS` to `llvm/lib/CodeGen/SelectionDAG` as requested in https://github.com/llvm/llvm-project/issues/116695.

---
Full diff: https://github.com/llvm/llvm-project/pull/120030.diff


6 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp (+22-4) 
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (-24) 
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+55-27) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll (+30-42) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll (+1-3) 


``````````diff

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ca87168929f964..09025ae4d71ab9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -191,6 +191,7 @@ class SelectionDAGLegalize {
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+  SDValue ExpandConcatVectors(SDNode* Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
   SDValue ExpandConstant(ConstantSDNode *CP);
@@ -1517,10 +1518,27 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
                      BaseVecAlignment);
 }
 
+SDValue SelectionDAGLegalize::ExpandConcatVectors(SDNode *Node) {
+  assert(Node->getOpcode() == ISD::CONCAT_VECTORS && "Unexpected opcode!");
+  SDLoc Dl(Node);
+  SmallVector<SDValue, 0> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned I = 0; I < NumOperands; ++I) {
+    SDValue SubOp = Node->getOperand(I);
+    EVT VectorValueType =
+        SubOp->getValueType(0);
+    EVT ElementValueType = VectorValueType.getVectorElementType();
+    unsigned NumSubElem = VectorValueType.getVectorNumElements();
+    for (unsigned J = 0; J < NumSubElem; ++J) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ElementValueType,
+                                SubOp, DAG.getIntPtrConstant(J, Dl)));
+    }
+  }
+  return DAG.getBuildVector(Node->getValueType(0), Dl, Ops);
+}
+
 SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
-  assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
-          Node->getOpcode() == ISD::CONCAT_VECTORS) &&
-         "Unexpected opcode!");
+  assert(Node->getOpcode() == ISD::BUILD_VECTOR && "Unexpected opcode!");
 
   // We can't handle this case efficiently.  Allocate a sufficiently
   // aligned object on the stack, store each operand into it, then load
@@ -3371,7 +3389,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
     break;
   case ISD::CONCAT_VECTORS:
-    Results.push_back(ExpandVectorBuildThroughStack(Node));
+    Results.push_back(ExpandConcatVectors(Node));
     break;
   case ISD::SCALAR_TO_VECTOR:
     Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a02607efb7fc28..de06763da4dbbe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2001,28 +2001,6 @@ SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op,
   return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
 }
 
-// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
-// (see LegalizeDAG.cpp). This is slow and uses local memory.
-// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
-SDValue
-NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  SDNode *Node = Op.getNode();
-  SDLoc dl(Node);
-  SmallVector<SDValue, 8> Ops;
-  unsigned NumOperands = Node->getNumOperands();
-  for (unsigned i = 0; i < NumOperands; ++i) {
-    SDValue SubOp = Node->getOperand(i);
-    EVT VVT = SubOp.getNode()->getValueType(0);
-    EVT EltVT = VVT.getVectorElementType();
-    unsigned NumSubElem = VVT.getVectorNumElements();
-    for (unsigned j = 0; j < NumSubElem; ++j) {
-      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
-                                DAG.getIntPtrConstant(j, dl)));
-    }
-  }
-  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
-}
-
 SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   // Handle bitcasting from v2i8 without hitting the default promotion
   // strategy which goes through stack memory.
@@ -2565,8 +2543,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
-  case ISD::CONCAT_VECTORS:
-    return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::LOAD:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 0244a0c5bec9d5..6523f22777693e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -266,7 +266,6 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 619840fc6afb28..ff6e1dbb3baa02 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -70,9 +70,12 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                                                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -181,9 +184,12 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -279,9 +285,14 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
@@ -441,9 +452,12 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp x8, x9, [sp]
+; NONEON-NOSVE-NEXT:    stp x8, x9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x half> %res
@@ -539,9 +553,14 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v4f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp], #16
+; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #-32]!
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #24]
+; NONEON-NOSVE-NEXT:    ldp s0, s1, [sp]
+; NONEON-NOSVE-NEXT:    stp s0, s1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %res
@@ -754,12 +773,15 @@ define void @concat_v32i8_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v32i8_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
   %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -781,12 +803,15 @@ define void @concat_v16i16_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v16i16_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldr x8, [sp, #8]
+; NONEON-NOSVE-NEXT:    str x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
   %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -805,12 +830,15 @@ define void @concat_v8i32_4op(ptr %a, ptr %b)  {
 ;
 ; NONEON-NOSVE-LABEL: concat_v8i32_4op:
 ; NONEON-NOSVE:       // %bb.0:
+; NONEON-NOSVE-NEXT:    sub sp, sp, #32
+; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr d0, [x0]
-; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr q0, [sp]
+; NONEON-NOSVE-NEXT:    str d0, [sp, #8]
+; NONEON-NOSVE-NEXT:    ldp w8, w9, [sp, #8]
+; NONEON-NOSVE-NEXT:    stp w8, w9, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
   %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index b0fdce9a93bd3b..fbdfc4b7d96cf3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -1138,17 +1138,15 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: smulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = sext <2 x i64> %op1 to <2 x i128>
   %2 = sext <2 x i64> %op2 to <2 x i128>
@@ -1185,23 +1183,19 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    smulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
 ; NONEON-NOSVE-NEXT:    smulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    smulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    smulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
@@ -2339,17 +2333,15 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ;
 ; NONEON-NOSVE-LABEL: umulh_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #-64]!
+; NONEON-NOSVE-NEXT:    sub sp, sp, #64
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp q0, q1, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x10
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x11
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #32]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #48]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
-; NONEON-NOSVE-NEXT:    add sp, sp, #64
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp], #64
 ; NONEON-NOSVE-NEXT:    ret
   %1 = zext <2 x i64> %op1 to <2 x i128>
   %2 = zext <2 x i64> %op2 to <2 x i128>
@@ -2386,23 +2378,19 @@ define void @umulh_v4i64(ptr %a, ptr %b) {
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 128
 ; NONEON-NOSVE-NEXT:    ldp q1, q0, [x0]
 ; NONEON-NOSVE-NEXT:    ldp q2, q3, [x1]
-; NONEON-NOSVE-NEXT:    stp q1, q2, [sp]
-; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp]
-; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #32]
-; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q1, q2, [sp, #32]
+; NONEON-NOSVE-NEXT:    ldp x11, x10, [sp, #32]
+; NONEON-NOSVE-NEXT:    stp q0, q3, [sp, #64]
+; NONEON-NOSVE-NEXT:    ldp x13, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x9, x8, [sp, #64]
 ; NONEON-NOSVE-NEXT:    umulh x10, x10, x12
-; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #48]
+; NONEON-NOSVE-NEXT:    ldp x14, x12, [sp, #80]
 ; NONEON-NOSVE-NEXT:    umulh x11, x11, x13
 ; NONEON-NOSVE-NEXT:    umulh x8, x8, x12
 ; NONEON-NOSVE-NEXT:    umulh x9, x9, x14
-; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #80]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #80]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #112]
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp, #64]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #96]
-; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
+; NONEON-NOSVE-NEXT:    stp x11, x10, [sp, #16]
+; NONEON-NOSVE-NEXT:    stp x9, x8, [sp]
+; NONEON-NOSVE-NEXT:    ldp q1, q0, [sp]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #128
 ; NONEON-NOSVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 13fcd94ea8a260..ae87128b5c3f9d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -142,9 +142,7 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; NONEON-NOSVE-NEXT:    ldr x9, [x0]
 ; NONEON-NOSVE-NEXT:    stp x9, x8, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
-; NONEON-NOSVE-NEXT:    ldp d0, d1, [sp]
-; NONEON-NOSVE-NEXT:    stp d0, d1, [sp, #16]
-; NONEON-NOSVE-NEXT:    ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT:    ldr q0, [sp]
 ; NONEON-NOSVE-NEXT:    str q0, [x1]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
 ; NONEON-NOSVE-NEXT:    ret

``````````

</details>


https://github.com/llvm/llvm-project/pull/120030