[llvm] [MIPS] [MSA] Fix ILVOD operand order; Widen v2 vectors for MIPS64 (PR #123040)

Wed Jan 15 05:02:40 PST 2025

https://github.com/Cyanoxygen updated https://github.com/llvm/llvm-project/pull/123040

>From 5dcdaa5fabcc9ed2ebf80d30731ffcc71707bacb Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Wed, 15 Jan 2025 10:57:33 +0800
Subject: [PATCH 1/4] [MIPS][MSA] Invert operand order of ILVOD

The result was <y[1], x[1]> where it should be <x[1], y[1]>.

This affects the result of vector shuffles:

    %result = shufflevector <4 x i32> %x, <4 x i32> %y,
              <4 x 132> <i32 1, i32 5, i32 3, i32 7>

The result is <%y[1], %x[1], %y[3], %x[3]>, where it should be
<%x[1], %y[1], %x[3], %y[3]>.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 818b1683bb867e..1d1b0f9c6ae2a9 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -2736,7 +2736,7 @@ static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
   else
     return SDValue();
 
-  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws);
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into ILVR (if possible).

>From 3101e25f98aa1d252a2a85f639c063493a382593 Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Tue, 14 Jan 2025 23:56:32 +0800
Subject: [PATCH 2/4] [MIPS][MSA] Widen v2 vectors to the register length for
 MIPS64 w/ MSA

Currently v2i8, v2i16 and v2i32 are being promoted to v2i64 which casts
the vector back and forth. Widening them to avoid unnecessary bitcasts,
loads and stores.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 23 +++++++++++++++++++++
 llvm/lib/Target/Mips/MipsSEISelLowering.h   |  3 +++
 2 files changed, 26 insertions(+)

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 1d1b0f9c6ae2a9..b40d93cee42fe3 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -59,6 +59,29 @@ static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
                                             "stores to their single precision "
                                             "counterparts"));
 
+// Widen the v2 vectors to the register width, i.e. v2i16 -> v8i16,
+// v2i32 -> v4i32, etc, to ensure the correct rail size is used.
+// We will deal with incorrect pattern being matched problem later.
+TargetLoweringBase::LegalizeTypeAction
+MipsSETargetLowering::getPreferredVectorAction(MVT VT) const {
+  if (this->Subtarget.hasMSA() && this->Subtarget.isGP64bit()) {
+    switch (VT.SimpleTy) {
+    // Leave v2i1s to be promoted to larger ones.
+    case MVT::v2i1:
+      return TypePromoteInteger;
+    case MVT::v2i8:
+    case MVT::v2i16:
+    case MVT::v2i32:
+      return TypeWidenVector;
+      break;
+    // v2i64 is already 128-bit wide.
+    default:
+      break;
+    }
+  }
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
 MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
                                            const MipsSubtarget &STI)
     : MipsTargetLowering(TM, STI) {
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.h b/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 43b88a9f095226..675131aefb6dd9 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -45,6 +45,9 @@ class TargetRegisterClass;
         MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
         unsigned *Fast = nullptr) const override;
 
+    TargetLoweringBase::LegalizeTypeAction
+    getPreferredVectorAction(MVT VT) const override;
+
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

>From 74affabf55af5f2e79b3c3d76a9321ef239e18ef Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Wed, 15 Jan 2025 12:00:24 +0800
Subject: [PATCH 3/4] [MIPS][MSA] Handle UNDEFs in shuffle indices for VSHF

Currently VSHF does not handle UNDEF indices. However isSPLATI() is able
to handle undefs, which may pass indices with undefs to this function.
Adding a check to handle undefs in shuffle indices.

Also, shuffle mask widened from v2 vector types are guranteed to contain
UNDEFs. These shuffle lower logics can handle UNDEFs, so we just leave
it as is.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 22 ++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index b40d93cee42fe3..3e6f8cb1ffdc40 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/TargetParser/Triple.h"
 #include <algorithm>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <iterator>
 #include <utility>
@@ -2952,8 +2953,12 @@ static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
 // if the type is v8i16 and all the indices are less than 8 then the second
 // operand is unused and can be replaced with anything. We choose to replace it
 // with the used operand since this reduces the number of instructions overall.
+//
+// NOTE: SPLATI shuffle masks may contain UNDEFs, since isSPLATI() treats
+//       UNDEFs as the SPLATI index.
 static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
                                         const SmallVector<int, 16> &Indices,
+                                        const bool isSPLATI,
                                         SelectionDAG &DAG) {
   SmallVector<SDValue, 16> Ops;
   SDValue Op0;
@@ -2965,6 +2970,9 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
   SDLoc DL(Op);
   int ResTyNumElts = ResTy.getVectorNumElements();
 
+  assert(Indices[0] >= 0 &&
+         "shuffle mask starts at a UNDEF, which is not expected");
+
   for (int i = 0; i < ResTyNumElts; ++i) {
     // Idx == -1 means UNDEF
     int Idx = Indices[i];
@@ -2975,8 +2983,16 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
       Using2ndVec = true;
   }
 
-  for (int Idx : Indices)
+  for (size_t i = 0; i < Indices.size(); i++) {
+    int Idx = Indices[i];
+    if (isSPLATI && Indices[i] < 0) {
+      Idx = Indices[0];
+    }
+    if (!isSPLATI && Indices[i] < 0) {
+      Idx = Indices[i - 1];
+    }
     Ops.push_back(DAG.getTargetConstant(Idx, DL, MaskEltTy));
+  }
 
   SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
 
@@ -3019,7 +3035,7 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   // splati.[bhwd] is preferable to the others but is matched from
   // MipsISD::VSHF.
   if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
-    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, true, DAG);
   SDValue Result;
   if ((Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG)))
     return Result;
@@ -3035,7 +3051,7 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG)))
     return Result;
-  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, false, DAG);
 }
 
 MachineBasicBlock *

>From 6643b64986c57ec377c92c7807f84cc79b11d873 Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Wed, 15 Jan 2025 18:16:02 +0800
Subject: [PATCH 4/4] [MIPS][MSA] Update tests for v2 widening and ILVOD
 operand order fix

---
 llvm/test/CodeGen/Mips/cconv/vector.ll | 320 +++++++++++++------------
 llvm/test/CodeGen/Mips/msa/shuffle.ll  |  12 +-
 2 files changed, 172 insertions(+), 160 deletions(-)

diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 28a7dc046139b2..82139dd352a4c4 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -84,30 +84,6 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sh $5, 8($sp)
-; MIPS64R5-NEXT:    sh $4, 12($sp)
-; MIPS64R5-NEXT:    lb $1, 9($sp)
-; MIPS64R5-NEXT:    lb $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lb $1, 13($sp)
-; MIPS64R5-NEXT:    lb $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sb $2, 5($sp)
-; MIPS64R5-NEXT:    sb $1, 4($sp)
-; MIPS64R5-NEXT:    lh $2, 4($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
-;
 ; MIPS32R5EL-LABEL: i8_2:
 ; MIPS32R5EL:       # %bb.0:
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, -64
@@ -144,6 +120,38 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 64
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EB-LABEL: i8_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EB-NEXT:    sh $5, 16($sp)
+; MIPS64R5EB-NEXT:    sh $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EB-NEXT:    sh $1, 44($sp)
+; MIPS64R5EB-NEXT:    lh $2, 44($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EB-NEXT:    jr $ra
+;
+; MIPS64R5EL-LABEL: i8_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EL-NEXT:    sh $5, 16($sp)
+; MIPS64R5EL-NEXT:    sh $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EL-NEXT:    sh $1, 44($sp)
+; MIPS64R5EL-NEXT:    lh $2, 44($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
   %1 = add <2 x i8> %a, %b
   ret <2 x i8> %1
 }
@@ -297,59 +305,68 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8x2_7:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sh $5, 24($sp)
-; MIPS64R5-NEXT:    sh $4, 28($sp)
-; MIPS64R5-NEXT:    lb $1, 25($sp)
-; MIPS64R5-NEXT:    lb $2, 24($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lb $1, 29($sp)
-; MIPS64R5-NEXT:    lb $2, 28($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    sh $6, 20($sp)
-; MIPS64R5-NEXT:    lb $1, 21($sp)
-; MIPS64R5-NEXT:    lb $2, 20($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $7, 16($sp)
-; MIPS64R5-NEXT:    lb $1, 17($sp)
-; MIPS64R5-NEXT:    lb $2, 16($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $8, 12($sp)
-; MIPS64R5-NEXT:    lb $1, 13($sp)
-; MIPS64R5-NEXT:    lb $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $9, 8($sp)
-; MIPS64R5-NEXT:    lb $1, 9($sp)
-; MIPS64R5-NEXT:    lb $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $10, 4($sp)
-; MIPS64R5-NEXT:    lb $1, 5($sp)
-; MIPS64R5-NEXT:    lb $2, 4($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sb $2, 1($sp)
-; MIPS64R5-NEXT:    sb $1, 0($sp)
-; MIPS64R5-NEXT:    lh $2, 0($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i8x2_7:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -128
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 128
+; MIPS64R5EB-NEXT:    sh $5, 16($sp)
+; MIPS64R5EB-NEXT:    sh $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    sh $6, 32($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 32($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $7, 48($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 48($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $8, 64($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 64($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $9, 80($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 80($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $10, 96($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 96($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EB-NEXT:    sh $1, 124($sp)
+; MIPS64R5EB-NEXT:    lh $2, 124($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 128
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i8x2_7:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -128
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 128
+; MIPS64R5EL-NEXT:    sh $5, 16($sp)
+; MIPS64R5EL-NEXT:    sh $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    sh $6, 32($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 32($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $7, 48($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 48($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $8, 64($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 64($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $9, 80($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 80($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $10, 96($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 96($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EL-NEXT:    sh $1, 124($sp)
+; MIPS64R5EL-NEXT:    lh $2, 124($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 128
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 ;
 ; MIPS32EL-LABEL: i8x2_7:
 ; MIPS32EL:       # %bb.0: # %entry
@@ -1257,30 +1274,6 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i16_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sw $5, 8($sp)
-; MIPS64R5-NEXT:    sw $4, 12($sp)
-; MIPS64R5-NEXT:    lh $1, 10($sp)
-; MIPS64R5-NEXT:    lh $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lh $1, 14($sp)
-; MIPS64R5-NEXT:    lh $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sh $2, 6($sp)
-; MIPS64R5-NEXT:    sh $1, 4($sp)
-; MIPS64R5-NEXT:    lw $2, 4($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
-;
 ; MIPS32R5EL-LABEL: i16_2:
 ; MIPS32R5EL:       # %bb.0:
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, -64
@@ -1317,6 +1310,38 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 64
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EB-LABEL: i16_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sll $1, $5, 0
+; MIPS64R5EB-NEXT:    sw $1, 16($sp)
+; MIPS64R5EB-NEXT:    sll $1, $4, 0
+; MIPS64R5EB-NEXT:    sw $1, 0($sp)
+; MIPS64R5EB-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.h $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i16_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sll $1, $5, 0
+; MIPS64R5EL-NEXT:    sw $1, 16($sp)
+; MIPS64R5EL-NEXT:    sll $1, $4, 0
+; MIPS64R5EL-NEXT:    sw $1, 0($sp)
+; MIPS64R5EL-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
   %1 = add <2 x i16> %a, %b
   ret <2 x i16> %1
 }
@@ -1751,18 +1776,13 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS64R5EB:       # %bb.0:
 ; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    dsrl $1, $5, 32
-; MIPS64R5EB-NEXT:    insert.d $w0[0], $1
-; MIPS64R5EB-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EB-NEXT:    dsrl $1, $4, 32
-; MIPS64R5EB-NEXT:    insert.d $w1[0], $1
-; MIPS64R5EB-NEXT:    insert.d $w1[1], $4
-; MIPS64R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5EB-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5EB-NEXT:    sw $2, 12($sp)
-; MIPS64R5EB-NEXT:    sw $1, 8($sp)
-; MIPS64R5EB-NEXT:    ld $2, 8($sp)
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.w $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
@@ -1800,19 +1820,11 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS64R5EL-NEXT:    sd $5, 16($sp)
-; MIPS64R5EL-NEXT:    sd $4, 24($sp)
-; MIPS64R5EL-NEXT:    lw $1, 20($sp)
-; MIPS64R5EL-NEXT:    insert.d $w0[0], $5
-; MIPS64R5EL-NEXT:    insert.d $w0[1], $1
-; MIPS64R5EL-NEXT:    lw $1, 28($sp)
-; MIPS64R5EL-NEXT:    insert.d $w1[0], $4
-; MIPS64R5EL-NEXT:    insert.d $w1[1], $1
-; MIPS64R5EL-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5EL-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5EL-NEXT:    sw $2, 12($sp)
-; MIPS64R5EL-NEXT:    sw $1, 8($sp)
-; MIPS64R5EL-NEXT:    ld $2, 8($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.w $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
@@ -4078,10 +4090,10 @@ define void @calli16_2() {
 ;
 ; MIPS64R5EB-LABEL: calli16_2:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4096,9 +4108,9 @@ define void @calli16_2() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv2i16)($gp)
 ; MIPS64R5EB-NEXT:    sw $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -4168,10 +4180,10 @@ define void @calli16_2() {
 ;
 ; MIPS64R5EL-LABEL: calli16_2:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4186,9 +4198,9 @@ define void @calli16_2() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv2i16)($gp)
 ; MIPS64R5EL-NEXT:    sw $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
@@ -4807,10 +4819,10 @@ define void @calli32_2() {
 ;
 ; MIPS64R5EB-LABEL: calli32_2:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4826,9 +4838,9 @@ define void @calli32_2() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv2i32)($gp)
 ; MIPS64R5EB-NEXT:    sd $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -4862,10 +4874,10 @@ define void @calli32_2() {
 ;
 ; MIPS64R5EL-LABEL: calli32_2:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4882,9 +4894,9 @@ define void @calli32_2() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv2i32)($gp)
 ; MIPS64R5EL-NEXT:    sd $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
diff --git a/llvm/test/CodeGen/Mips/msa/shuffle.ll b/llvm/test/CodeGen/Mips/msa/shuffle.ll
index e93bb7cdd11283..37eefd695c2eda 100644
--- a/llvm/test/CodeGen/Mips/msa/shuffle.ll
+++ b/llvm/test/CodeGen/Mips/msa/shuffle.ll
@@ -533,7 +533,7 @@ define void @ilvod_v16i8_0(ptr %c, ptr %a, ptr %b) nounwind {
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, ptr %c
   ; CHECK-DAG: st.b [[R3]], 0($4)
 
@@ -548,7 +548,7 @@ define void @ilvod_v8i16_0(ptr %c, ptr %a, ptr %b) nounwind {
   %2 = load <8 x i16>, ptr %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, ptr %c
   ; CHECK-DAG: st.h [[R3]], 0($4)
 
@@ -563,7 +563,7 @@ define void @ilvod_v4i32_0(ptr %c, ptr %a, ptr %b) nounwind {
   %2 = load <4 x i32>, ptr %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, ptr %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
@@ -578,7 +578,7 @@ define void @ilvod_v2i64_0(ptr %c, ptr %a, ptr %b) nounwind {
   %2 = load <2 x i64>, ptr %b
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
-  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <2 x i64> %3, ptr %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
@@ -934,7 +934,7 @@ define void @ilvl_v2i64_0(ptr %c, ptr %a, ptr %b) nounwind {
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
   ; ilvl.d and ilvod.d are equivalent for v2i64
-  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <2 x i64> %3, ptr %c
   ; CHECK-DAG: st.d [[R3]], 0($4)
 
@@ -1290,7 +1290,7 @@ define void @pckod_v2i64_0(ptr %c, ptr %a, ptr %b) nounwind {
   ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
   ; pckod.d and ilvod.d are equivalent for v2i64
-  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <2 x i64> %3, ptr %c
   ; CHECK-DAG: st.d [[R3]], 0($4)