[llvm] [MIPS] [MSA] Widen v2i8, v2i16 and v2i32 vectors (PR #123040)

Cinhi Young via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 22 19:50:43 PST 2025


https://github.com/Cyanoxygen updated https://github.com/llvm/llvm-project/pull/123040

>From c8dbae8a3c0eed153c442186c5a19670c649eeb4 Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Tue, 14 Jan 2025 23:56:32 +0800
Subject: [PATCH 1/2] [MIPS][MSA] Widen v2 vectors to the register length for
 MSA

Currently v2i8, v2i16 and v2i32 are promoted to v2i64, which casts
the vector back and forth and causes instructions with the wrong format
to be used. Widen them instead to avoid unnecessary bitcasts, loads and
stores, and to ensure the correct element size is used.

* tests/CodeGen/Mips: Update tests after widening of v2 vectors.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp   |   39 +
 llvm/lib/Target/Mips/MipsSEISelLowering.h     |    3 +
 llvm/test/CodeGen/Mips/cconv/vector.ll        | 1435 ++++++-----------
 .../test/CodeGen/Mips/msa/basic_operations.ll |   40 +-
 4 files changed, 571 insertions(+), 946 deletions(-)

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 1d1b0f9c6ae2a9..edfa6efd66972f 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -59,6 +59,45 @@ static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
                                             "stores to their single precision "
                                             "counterparts"));
 
+// Widen the v2 vectors to the register width, e.g. v2i16 -> v8i16,
+// v2i32 -> v4i32, etc, to ensure the correct element size is used, i.e.
+// INST.h for i16 elements, INST.w for i32 elements, INST.d for i64 elements.
+TargetLoweringBase::LegalizeTypeAction
+MipsSETargetLowering::getPreferredVectorAction(MVT VT) const {
+  if (this->Subtarget.hasMSA()) {
+    switch (VT.SimpleTy) {
+    // Leave v2i1 vectors to be promoted to larger ones.
+    // Other i1 types will be promoted by default.
+    case MVT::v2i1:
+      return TypePromoteInteger;
+      break;
+    // 16-bit vector types (v2 and longer)
+    case MVT::v2i8:
+    // 32-bit vector types (v2 and longer)
+    case MVT::v2i16:
+    case MVT::v4i8:
+    // 64-bit vector types (v2 and longer)
+    case MVT::v2i32:
+    case MVT::v4i16:
+    case MVT::v8i8:
+      return TypeWidenVector;
+      break;
+    // Only word (.w) and doubleword (.d) are available for floating point
+    // vectors. That means floating point vectors should be either v2f64
+    // or v4f32.
+    // Here we only explicitly widen the f32 types - f16 will be promoted
+    // by default.
+    case MVT::v2f32:
+    case MVT::v3f32:
+      return TypeWidenVector;
+    // v2i64 is already 128-bit wide.
+    default:
+      break;
+    }
+  }
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
 MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
                                            const MipsSubtarget &STI)
     : MipsTargetLowering(TM, STI) {
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.h b/llvm/lib/Target/Mips/MipsSEISelLowering.h
index 43b88a9f095226..675131aefb6dd9 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -45,6 +45,9 @@ class TargetRegisterClass;
         MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
         unsigned *Fast = nullptr) const override;
 
+    TargetLoweringBase::LegalizeTypeAction
+    getPreferredVectorAction(MVT VT) const override;
+
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 28a7dc046139b2..383e5ef19cebf1 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -48,102 +48,86 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
 ;
 ; MIPS32R5EB-LABEL: i8_2:
 ; MIPS32R5EB:       # %bb.0:
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -64
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 64
-; MIPS32R5EB-NEXT:    sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -48
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS32R5EB-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    sw $fp, 40($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EB-NEXT:    move $fp, $sp
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sh $5, 48($sp)
-; MIPS32R5EB-NEXT:    sh $4, 52($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 49($sp)
-; MIPS32R5EB-NEXT:    sw $1, 28($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 48($sp)
-; MIPS32R5EB-NEXT:    sw $1, 20($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 53($sp)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 52($sp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
-; MIPS32R5EB-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT:    sb $2, 45($sp)
-; MIPS32R5EB-NEXT:    sb $1, 44($sp)
-; MIPS32R5EB-NEXT:    lhu $2, 44($sp)
+; MIPS32R5EB-NEXT:    sh $5, 16($sp)
+; MIPS32R5EB-NEXT:    sh $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS32R5EB-NEXT:    copy_u.h $2, $w0[0]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 64
+; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sh $5, 8($sp)
-; MIPS64R5-NEXT:    sh $4, 12($sp)
-; MIPS64R5-NEXT:    lb $1, 9($sp)
-; MIPS64R5-NEXT:    lb $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lb $1, 13($sp)
-; MIPS64R5-NEXT:    lb $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sb $2, 5($sp)
-; MIPS64R5-NEXT:    sb $1, 4($sp)
-; MIPS64R5-NEXT:    lh $2, 4($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
-;
 ; MIPS32R5EL-LABEL: i8_2:
 ; MIPS32R5EL:       # %bb.0:
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -64
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 64
-; MIPS32R5EL-NEXT:    sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -48
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS32R5EL-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    sw $fp, 40($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EL-NEXT:    move $fp, $sp
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sh $5, 48($sp)
-; MIPS32R5EL-NEXT:    sh $4, 52($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 49($sp)
-; MIPS32R5EL-NEXT:    sw $1, 24($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 48($sp)
-; MIPS32R5EL-NEXT:    sw $1, 16($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 53($sp)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 52($sp)
-; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT:    sb $2, 45($sp)
-; MIPS32R5EL-NEXT:    sb $1, 44($sp)
-; MIPS32R5EL-NEXT:    lhu $2, 44($sp)
+; MIPS32R5EL-NEXT:    sh $5, 16($sp)
+; MIPS32R5EL-NEXT:    sh $4, 0($sp)
+; MIPS32R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EL-NEXT:    copy_u.h $2, $w0[0]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 64
+; MIPS32R5EL-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EB-LABEL: i8_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EB-NEXT:    sh $5, 16($sp)
+; MIPS64R5EB-NEXT:    sh $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EB-NEXT:    sh $1, 44($sp)
+; MIPS64R5EB-NEXT:    lh $2, 44($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EB-NEXT:    jr $ra
+;
+; MIPS64R5EL-LABEL: i8_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -48
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS64R5EL-NEXT:    sh $5, 16($sp)
+; MIPS64R5EL-NEXT:    sh $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EL-NEXT:    sh $1, 44($sp)
+; MIPS64R5EL-NEXT:    lh $2, 44($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 48
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
   %1 = add <2 x i8> %a, %b
   ret <2 x i8> %1
 }
@@ -229,127 +213,110 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
 ;
 ; MIPS32R5EB-LABEL: i8x2_7:
 ; MIPS32R5EB:       # %bb.0: # %entry
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -144
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 144
-; MIPS32R5EB-NEXT:    sw $ra, 140($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 136($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -128
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 128
+; MIPS32R5EB-NEXT:    sw $ra, 124($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    sw $fp, 120($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EB-NEXT:    move $fp, $sp
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sh $5, 128($sp)
-; MIPS32R5EB-NEXT:    sh $4, 132($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 129($sp)
-; MIPS32R5EB-NEXT:    sw $1, 76($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 128($sp)
-; MIPS32R5EB-NEXT:    sw $1, 68($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 133($sp)
-; MIPS32R5EB-NEXT:    sw $1, 60($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 132($sp)
-; MIPS32R5EB-NEXT:    sw $1, 52($sp)
-; MIPS32R5EB-NEXT:    ld.d $w0, 64($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 48($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    sh $6, 124($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 125($sp)
-; MIPS32R5EB-NEXT:    sw $1, 92($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 124($sp)
-; MIPS32R5EB-NEXT:    sw $1, 84($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 80($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EB-NEXT:    sh $7, 120($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 121($sp)
-; MIPS32R5EB-NEXT:    sw $1, 108($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 120($sp)
-; MIPS32R5EB-NEXT:    sw $1, 100($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 96($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EB-NEXT:    lbu $1, 163($fp)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 162($fp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EB-NEXT:    lbu $1, 167($fp)
-; MIPS32R5EB-NEXT:    sw $1, 28($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 166($fp)
-; MIPS32R5EB-NEXT:    sw $1, 20($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 16($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EB-NEXT:    lbu $1, 171($fp)
-; MIPS32R5EB-NEXT:    sw $1, 44($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 170($fp)
-; MIPS32R5EB-NEXT:    sw $1, 36($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 32($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT:    sb $2, 117($sp)
-; MIPS32R5EB-NEXT:    sb $1, 116($sp)
-; MIPS32R5EB-NEXT:    lhu $2, 116($sp)
+; MIPS32R5EB-NEXT:    sh $5, 16($sp)
+; MIPS32R5EB-NEXT:    sh $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    sh $6, 32($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 32($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    sh $7, 48($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 48($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    lhu $1, 146($fp)
+; MIPS32R5EB-NEXT:    sh $1, 64($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 64($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    lhu $1, 150($fp)
+; MIPS32R5EB-NEXT:    sh $1, 80($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 80($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    lhu $1, 154($fp)
+; MIPS32R5EB-NEXT:    sh $1, 96($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 96($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS32R5EB-NEXT:    copy_u.h $2, $w0[0]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 136($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 140($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 144
+; MIPS32R5EB-NEXT:    lw $fp, 120($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    lw $ra, 124($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 128
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8x2_7:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sh $5, 24($sp)
-; MIPS64R5-NEXT:    sh $4, 28($sp)
-; MIPS64R5-NEXT:    lb $1, 25($sp)
-; MIPS64R5-NEXT:    lb $2, 24($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lb $1, 29($sp)
-; MIPS64R5-NEXT:    lb $2, 28($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    sh $6, 20($sp)
-; MIPS64R5-NEXT:    lb $1, 21($sp)
-; MIPS64R5-NEXT:    lb $2, 20($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $7, 16($sp)
-; MIPS64R5-NEXT:    lb $1, 17($sp)
-; MIPS64R5-NEXT:    lb $2, 16($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $8, 12($sp)
-; MIPS64R5-NEXT:    lb $1, 13($sp)
-; MIPS64R5-NEXT:    lb $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $9, 8($sp)
-; MIPS64R5-NEXT:    lb $1, 9($sp)
-; MIPS64R5-NEXT:    lb $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    sh $10, 4($sp)
-; MIPS64R5-NEXT:    lb $1, 5($sp)
-; MIPS64R5-NEXT:    lb $2, 4($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sb $2, 1($sp)
-; MIPS64R5-NEXT:    sb $1, 0($sp)
-; MIPS64R5-NEXT:    lh $2, 0($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i8x2_7:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -128
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 128
+; MIPS64R5EB-NEXT:    sh $5, 16($sp)
+; MIPS64R5EB-NEXT:    sh $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    sh $6, 32($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 32($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $7, 48($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 48($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $8, 64($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 64($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $9, 80($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 80($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    sh $10, 96($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 96($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EB-NEXT:    sh $1, 124($sp)
+; MIPS64R5EB-NEXT:    lh $2, 124($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 128
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i8x2_7:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -128
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 128
+; MIPS64R5EL-NEXT:    sh $5, 16($sp)
+; MIPS64R5EL-NEXT:    sh $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    sh $6, 32($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 32($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $7, 48($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 48($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $8, 64($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 64($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $9, 80($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 80($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    sh $10, 96($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 96($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    copy_s.h $1, $w0[0]
+; MIPS64R5EL-NEXT:    sh $1, 124($sp)
+; MIPS64R5EL-NEXT:    lh $2, 124($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 128
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 ;
 ; MIPS32EL-LABEL: i8x2_7:
 ; MIPS32EL:       # %bb.0: # %entry
@@ -387,70 +354,44 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
 ;
 ; MIPS32R5EL-LABEL: i8x2_7:
 ; MIPS32R5EL:       # %bb.0: # %entry
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -144
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 144
-; MIPS32R5EL-NEXT:    sw $ra, 140($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 136($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -128
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 128
+; MIPS32R5EL-NEXT:    sw $ra, 124($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    sw $fp, 120($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EL-NEXT:    move $fp, $sp
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sh $5, 128($sp)
-; MIPS32R5EL-NEXT:    sh $4, 132($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 129($sp)
-; MIPS32R5EL-NEXT:    sw $1, 72($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 128($sp)
+; MIPS32R5EL-NEXT:    sh $5, 16($sp)
+; MIPS32R5EL-NEXT:    sh $4, 0($sp)
+; MIPS32R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EL-NEXT:    sh $6, 32($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 32($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    sh $7, 48($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 48($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    lw $1, 144($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 64($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 133($sp)
-; MIPS32R5EL-NEXT:    sw $1, 56($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 132($sp)
-; MIPS32R5EL-NEXT:    sw $1, 48($sp)
-; MIPS32R5EL-NEXT:    ld.d $w0, 64($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 48($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EL-NEXT:    sh $6, 124($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 125($sp)
-; MIPS32R5EL-NEXT:    sw $1, 88($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 124($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 64($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    lw $1, 148($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 80($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 80($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EL-NEXT:    sh $7, 120($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 121($sp)
-; MIPS32R5EL-NEXT:    sw $1, 104($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 120($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 80($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    lw $1, 152($fp)
 ; MIPS32R5EL-NEXT:    sw $1, 96($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 96($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EL-NEXT:    lbu $1, 161($fp)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 160($fp)
-; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EL-NEXT:    lbu $1, 165($fp)
-; MIPS32R5EL-NEXT:    sw $1, 24($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 164($fp)
-; MIPS32R5EL-NEXT:    sw $1, 16($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 16($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EL-NEXT:    lbu $1, 169($fp)
-; MIPS32R5EL-NEXT:    sw $1, 40($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 168($fp)
-; MIPS32R5EL-NEXT:    sw $1, 32($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 32($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w0, $w1
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT:    sb $2, 117($sp)
-; MIPS32R5EL-NEXT:    sb $1, 116($sp)
-; MIPS32R5EL-NEXT:    lhu $2, 116($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 96($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    copy_u.h $2, $w0[0]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 136($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 140($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 144
+; MIPS32R5EL-NEXT:    lw $fp, 120($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    lw $ra, 124($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 128
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
 entry:
@@ -514,77 +455,64 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
 ; MIPS64-NEXT:    jr $ra
 ; MIPS64-NEXT:    nop
 ;
-; MIPS32R5-LABEL: i8_4:
-; MIPS32R5:       # %bb.0:
-; MIPS32R5-NEXT:    addiu $sp, $sp, -16
-; MIPS32R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS32R5-NEXT:    sw $5, 8($sp)
-; MIPS32R5-NEXT:    sw $4, 12($sp)
-; MIPS32R5-NEXT:    lbu $1, 9($sp)
-; MIPS32R5-NEXT:    lbu $2, 8($sp)
-; MIPS32R5-NEXT:    insert.w $w0[0], $2
-; MIPS32R5-NEXT:    insert.w $w0[1], $1
-; MIPS32R5-NEXT:    lbu $1, 10($sp)
-; MIPS32R5-NEXT:    insert.w $w0[2], $1
-; MIPS32R5-NEXT:    lbu $1, 11($sp)
-; MIPS32R5-NEXT:    insert.w $w0[3], $1
-; MIPS32R5-NEXT:    lbu $1, 13($sp)
-; MIPS32R5-NEXT:    lbu $2, 12($sp)
-; MIPS32R5-NEXT:    insert.w $w1[0], $2
-; MIPS32R5-NEXT:    insert.w $w1[1], $1
-; MIPS32R5-NEXT:    lbu $1, 14($sp)
-; MIPS32R5-NEXT:    insert.w $w1[2], $1
-; MIPS32R5-NEXT:    lbu $1, 15($sp)
-; MIPS32R5-NEXT:    insert.w $w1[3], $1
-; MIPS32R5-NEXT:    addv.w $w0, $w1, $w0
-; MIPS32R5-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5-NEXT:    copy_s.w $4, $w0[3]
-; MIPS32R5-NEXT:    sb $4, 7($sp)
-; MIPS32R5-NEXT:    sb $3, 6($sp)
-; MIPS32R5-NEXT:    sb $2, 5($sp)
-; MIPS32R5-NEXT:    sb $1, 4($sp)
-; MIPS32R5-NEXT:    lw $2, 4($sp)
-; MIPS32R5-NEXT:    addiu $sp, $sp, 16
-; MIPS32R5-NEXT:    jr $ra
-; MIPS32R5-NEXT:    nop
+; MIPS32R5EB-LABEL: i8_4:
+; MIPS32R5EB:       # %bb.0:
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -48
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS32R5EB-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    sw $fp, 40($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
+; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
+; MIPS32R5EB-NEXT:    move $fp, $sp
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
+; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
+; MIPS32R5EB-NEXT:    and $sp, $sp, $1
+; MIPS32R5EB-NEXT:    sw $5, 16($sp)
+; MIPS32R5EB-NEXT:    sw $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    shf.b $w0, $w0, 27
+; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32R5EB-NEXT:    move $sp, $fp
+; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 48
+; MIPS32R5EB-NEXT:    jr $ra
+; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8_4:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sw $5, 8($sp)
-; MIPS64R5-NEXT:    sw $4, 12($sp)
-; MIPS64R5-NEXT:    lbu $1, 9($sp)
-; MIPS64R5-NEXT:    lbu $2, 8($sp)
-; MIPS64R5-NEXT:    insert.w $w0[0], $2
-; MIPS64R5-NEXT:    insert.w $w0[1], $1
-; MIPS64R5-NEXT:    lbu $1, 10($sp)
-; MIPS64R5-NEXT:    insert.w $w0[2], $1
-; MIPS64R5-NEXT:    lbu $1, 11($sp)
-; MIPS64R5-NEXT:    insert.w $w0[3], $1
-; MIPS64R5-NEXT:    lbu $1, 13($sp)
-; MIPS64R5-NEXT:    lbu $2, 12($sp)
-; MIPS64R5-NEXT:    insert.w $w1[0], $2
-; MIPS64R5-NEXT:    insert.w $w1[1], $1
-; MIPS64R5-NEXT:    lbu $1, 14($sp)
-; MIPS64R5-NEXT:    insert.w $w1[2], $1
-; MIPS64R5-NEXT:    lbu $1, 15($sp)
-; MIPS64R5-NEXT:    insert.w $w1[3], $1
-; MIPS64R5-NEXT:    addv.w $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.w $2, $w0[1]
-; MIPS64R5-NEXT:    copy_s.w $3, $w0[2]
-; MIPS64R5-NEXT:    copy_s.w $4, $w0[3]
-; MIPS64R5-NEXT:    sb $4, 7($sp)
-; MIPS64R5-NEXT:    sb $3, 6($sp)
-; MIPS64R5-NEXT:    sb $2, 5($sp)
-; MIPS64R5-NEXT:    sb $1, 4($sp)
-; MIPS64R5-NEXT:    lw $2, 4($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i8_4:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sll $1, $5, 0
+; MIPS64R5EB-NEXT:    sw $1, 16($sp)
+; MIPS64R5EB-NEXT:    sll $1, $4, 0
+; MIPS64R5EB-NEXT:    sw $1, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 27
+; MIPS64R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i8_4:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sll $1, $5, 0
+; MIPS64R5EL-NEXT:    sw $1, 16($sp)
+; MIPS64R5EL-NEXT:    sll $1, $4, 0
+; MIPS64R5EL-NEXT:    sw $1, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
   %1 = add <4 x i8> %a, %b
   ret <4 x i8> %1
 }
@@ -704,66 +632,16 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sw $6, 24($sp)
-; MIPS32R5EB-NEXT:    lbu $1, 25($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 24($sp)
-; MIPS32R5EB-NEXT:    sw $7, 28($sp)
-; MIPS32R5EB-NEXT:    insert.h $w0[0], $2
-; MIPS32R5EB-NEXT:    insert.h $w0[1], $1
-; MIPS32R5EB-NEXT:    lbu $1, 26($sp)
-; MIPS32R5EB-NEXT:    sw $4, 32($sp)
-; MIPS32R5EB-NEXT:    insert.h $w0[2], $1
-; MIPS32R5EB-NEXT:    lbu $1, 27($sp)
-; MIPS32R5EB-NEXT:    insert.h $w0[3], $1
-; MIPS32R5EB-NEXT:    lbu $1, 28($sp)
-; MIPS32R5EB-NEXT:    sw $5, 36($sp)
-; MIPS32R5EB-NEXT:    insert.h $w0[4], $1
-; MIPS32R5EB-NEXT:    lbu $1, 33($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 32($sp)
-; MIPS32R5EB-NEXT:    insert.h $w1[0], $2
-; MIPS32R5EB-NEXT:    insert.h $w1[1], $1
-; MIPS32R5EB-NEXT:    lbu $1, 29($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 34($sp)
-; MIPS32R5EB-NEXT:    insert.h $w1[2], $2
-; MIPS32R5EB-NEXT:    insert.h $w0[5], $1
-; MIPS32R5EB-NEXT:    lbu $1, 35($sp)
-; MIPS32R5EB-NEXT:    lbu $2, 31($sp)
-; MIPS32R5EB-NEXT:    lbu $3, 30($sp)
-; MIPS32R5EB-NEXT:    lbu $4, 39($sp)
-; MIPS32R5EB-NEXT:    insert.h $w0[6], $3
-; MIPS32R5EB-NEXT:    insert.h $w0[7], $2
-; MIPS32R5EB-NEXT:    insert.h $w1[3], $1
-; MIPS32R5EB-NEXT:    lbu $1, 36($sp)
-; MIPS32R5EB-NEXT:    insert.h $w1[4], $1
-; MIPS32R5EB-NEXT:    lbu $1, 37($sp)
-; MIPS32R5EB-NEXT:    insert.h $w1[5], $1
-; MIPS32R5EB-NEXT:    lbu $1, 38($sp)
-; MIPS32R5EB-NEXT:    insert.h $w1[6], $1
-; MIPS32R5EB-NEXT:    insert.h $w1[7], $4
-; MIPS32R5EB-NEXT:    addv.h $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    copy_s.h $1, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.h $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.h $3, $w0[2]
-; MIPS32R5EB-NEXT:    copy_s.h $4, $w0[3]
-; MIPS32R5EB-NEXT:    copy_s.h $5, $w0[4]
-; MIPS32R5EB-NEXT:    copy_s.h $6, $w0[5]
-; MIPS32R5EB-NEXT:    copy_s.h $7, $w0[6]
-; MIPS32R5EB-NEXT:    copy_s.h $8, $w0[7]
-; MIPS32R5EB-NEXT:    sb $8, 23($sp)
-; MIPS32R5EB-NEXT:    sb $7, 22($sp)
-; MIPS32R5EB-NEXT:    sb $6, 21($sp)
-; MIPS32R5EB-NEXT:    sb $5, 20($sp)
-; MIPS32R5EB-NEXT:    sb $4, 19($sp)
-; MIPS32R5EB-NEXT:    sb $3, 18($sp)
-; MIPS32R5EB-NEXT:    sb $2, 17($sp)
-; MIPS32R5EB-NEXT:    sb $1, 16($sp)
-; MIPS32R5EB-NEXT:    lw $1, 20($sp)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    lw $1, 16($sp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
+; MIPS32R5EB-NEXT:    sw $7, 20($sp)
+; MIPS32R5EB-NEXT:    sw $6, 16($sp)
+; MIPS32R5EB-NEXT:    sw $5, 4($sp)
+; MIPS32R5EB-NEXT:    sw $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    shf.b $w0, $w0, 27
+; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
 ; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
@@ -771,65 +649,35 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i8_8:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sd $5, 16($sp)
-; MIPS64R5-NEXT:    lbu $1, 17($sp)
-; MIPS64R5-NEXT:    lbu $2, 16($sp)
-; MIPS64R5-NEXT:    sd $4, 24($sp)
-; MIPS64R5-NEXT:    insert.h $w0[0], $2
-; MIPS64R5-NEXT:    insert.h $w0[1], $1
-; MIPS64R5-NEXT:    lbu $1, 18($sp)
-; MIPS64R5-NEXT:    insert.h $w0[2], $1
-; MIPS64R5-NEXT:    lbu $1, 19($sp)
-; MIPS64R5-NEXT:    insert.h $w0[3], $1
-; MIPS64R5-NEXT:    lbu $1, 20($sp)
-; MIPS64R5-NEXT:    insert.h $w0[4], $1
-; MIPS64R5-NEXT:    lbu $1, 25($sp)
-; MIPS64R5-NEXT:    lbu $2, 24($sp)
-; MIPS64R5-NEXT:    insert.h $w1[0], $2
-; MIPS64R5-NEXT:    insert.h $w1[1], $1
-; MIPS64R5-NEXT:    lbu $1, 21($sp)
-; MIPS64R5-NEXT:    lbu $2, 26($sp)
-; MIPS64R5-NEXT:    insert.h $w1[2], $2
-; MIPS64R5-NEXT:    insert.h $w0[5], $1
-; MIPS64R5-NEXT:    lbu $1, 27($sp)
-; MIPS64R5-NEXT:    lbu $2, 23($sp)
-; MIPS64R5-NEXT:    lbu $3, 22($sp)
-; MIPS64R5-NEXT:    lbu $4, 31($sp)
-; MIPS64R5-NEXT:    insert.h $w0[6], $3
-; MIPS64R5-NEXT:    insert.h $w0[7], $2
-; MIPS64R5-NEXT:    insert.h $w1[3], $1
-; MIPS64R5-NEXT:    lbu $1, 28($sp)
-; MIPS64R5-NEXT:    insert.h $w1[4], $1
-; MIPS64R5-NEXT:    lbu $1, 29($sp)
-; MIPS64R5-NEXT:    insert.h $w1[5], $1
-; MIPS64R5-NEXT:    lbu $1, 30($sp)
-; MIPS64R5-NEXT:    insert.h $w1[6], $1
-; MIPS64R5-NEXT:    insert.h $w1[7], $4
-; MIPS64R5-NEXT:    addv.h $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.h $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.h $2, $w0[1]
-; MIPS64R5-NEXT:    copy_s.h $3, $w0[2]
-; MIPS64R5-NEXT:    copy_s.h $4, $w0[3]
-; MIPS64R5-NEXT:    copy_s.h $5, $w0[4]
-; MIPS64R5-NEXT:    copy_s.h $6, $w0[5]
-; MIPS64R5-NEXT:    copy_s.h $7, $w0[6]
-; MIPS64R5-NEXT:    copy_s.h $8, $w0[7]
-; MIPS64R5-NEXT:    sb $8, 15($sp)
-; MIPS64R5-NEXT:    sb $7, 14($sp)
-; MIPS64R5-NEXT:    sb $6, 13($sp)
-; MIPS64R5-NEXT:    sb $5, 12($sp)
-; MIPS64R5-NEXT:    sb $4, 11($sp)
-; MIPS64R5-NEXT:    sb $3, 10($sp)
-; MIPS64R5-NEXT:    sb $2, 9($sp)
-; MIPS64R5-NEXT:    sb $1, 8($sp)
-; MIPS64R5-NEXT:    ld $2, 8($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i8_8:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.b $w0, $w0, 27
+; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i8_8:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.b $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 ;
 ; MIPS32R5EL-LABEL: i8_8:
 ; MIPS32R5EL:       # %bb.0:
@@ -843,66 +691,15 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sw $6, 24($sp)
-; MIPS32R5EL-NEXT:    lbu $1, 25($sp)
-; MIPS32R5EL-NEXT:    lbu $2, 24($sp)
-; MIPS32R5EL-NEXT:    sw $7, 28($sp)
-; MIPS32R5EL-NEXT:    insert.h $w0[0], $2
-; MIPS32R5EL-NEXT:    insert.h $w0[1], $1
-; MIPS32R5EL-NEXT:    lbu $1, 26($sp)
-; MIPS32R5EL-NEXT:    sw $4, 32($sp)
-; MIPS32R5EL-NEXT:    insert.h $w0[2], $1
-; MIPS32R5EL-NEXT:    lbu $1, 27($sp)
-; MIPS32R5EL-NEXT:    insert.h $w0[3], $1
-; MIPS32R5EL-NEXT:    lbu $1, 28($sp)
-; MIPS32R5EL-NEXT:    sw $5, 36($sp)
-; MIPS32R5EL-NEXT:    insert.h $w0[4], $1
-; MIPS32R5EL-NEXT:    lbu $1, 33($sp)
-; MIPS32R5EL-NEXT:    lbu $2, 32($sp)
-; MIPS32R5EL-NEXT:    insert.h $w1[0], $2
-; MIPS32R5EL-NEXT:    insert.h $w1[1], $1
-; MIPS32R5EL-NEXT:    lbu $1, 29($sp)
-; MIPS32R5EL-NEXT:    lbu $2, 34($sp)
-; MIPS32R5EL-NEXT:    insert.h $w1[2], $2
-; MIPS32R5EL-NEXT:    insert.h $w0[5], $1
-; MIPS32R5EL-NEXT:    lbu $1, 35($sp)
-; MIPS32R5EL-NEXT:    lbu $2, 31($sp)
-; MIPS32R5EL-NEXT:    lbu $3, 30($sp)
-; MIPS32R5EL-NEXT:    lbu $4, 39($sp)
-; MIPS32R5EL-NEXT:    insert.h $w0[6], $3
-; MIPS32R5EL-NEXT:    insert.h $w0[7], $2
-; MIPS32R5EL-NEXT:    insert.h $w1[3], $1
-; MIPS32R5EL-NEXT:    lbu $1, 36($sp)
-; MIPS32R5EL-NEXT:    insert.h $w1[4], $1
-; MIPS32R5EL-NEXT:    lbu $1, 37($sp)
-; MIPS32R5EL-NEXT:    insert.h $w1[5], $1
-; MIPS32R5EL-NEXT:    lbu $1, 38($sp)
-; MIPS32R5EL-NEXT:    insert.h $w1[6], $1
-; MIPS32R5EL-NEXT:    insert.h $w1[7], $4
-; MIPS32R5EL-NEXT:    addv.h $w0, $w1, $w0
-; MIPS32R5EL-NEXT:    copy_s.h $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.h $2, $w0[1]
-; MIPS32R5EL-NEXT:    copy_s.h $3, $w0[2]
-; MIPS32R5EL-NEXT:    copy_s.h $4, $w0[3]
-; MIPS32R5EL-NEXT:    copy_s.h $5, $w0[4]
-; MIPS32R5EL-NEXT:    copy_s.h $6, $w0[5]
-; MIPS32R5EL-NEXT:    copy_s.h $7, $w0[6]
-; MIPS32R5EL-NEXT:    copy_s.h $8, $w0[7]
-; MIPS32R5EL-NEXT:    sb $8, 23($sp)
-; MIPS32R5EL-NEXT:    sb $7, 22($sp)
-; MIPS32R5EL-NEXT:    sb $6, 21($sp)
-; MIPS32R5EL-NEXT:    sb $5, 20($sp)
-; MIPS32R5EL-NEXT:    sb $4, 19($sp)
-; MIPS32R5EL-NEXT:    sb $3, 18($sp)
-; MIPS32R5EL-NEXT:    sb $2, 17($sp)
-; MIPS32R5EL-NEXT:    sb $1, 16($sp)
-; MIPS32R5EL-NEXT:    lw $1, 20($sp)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    lw $1, 16($sp)
-; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($sp)
+; MIPS32R5EL-NEXT:    sw $7, 20($sp)
+; MIPS32R5EL-NEXT:    sw $6, 16($sp)
+; MIPS32R5EL-NEXT:    sw $5, 4($sp)
+; MIPS32R5EL-NEXT:    sw $4, 0($sp)
+; MIPS32R5EL-NEXT:    ld.b $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.b $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.b $w0, $w1, $w0
 ; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
+; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
 ; MIPS32R5EL-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EL-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
@@ -1221,102 +1018,86 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
 ;
 ; MIPS32R5EB-LABEL: i16_2:
 ; MIPS32R5EB:       # %bb.0:
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -64
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 64
-; MIPS32R5EB-NEXT:    sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -48
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 48
+; MIPS32R5EB-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    sw $fp, 40($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EB-NEXT:    move $fp, $sp
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sw $5, 48($sp)
-; MIPS32R5EB-NEXT:    sw $4, 52($sp)
-; MIPS32R5EB-NEXT:    lhu $1, 50($sp)
-; MIPS32R5EB-NEXT:    sw $1, 28($sp)
-; MIPS32R5EB-NEXT:    lhu $1, 48($sp)
-; MIPS32R5EB-NEXT:    sw $1, 20($sp)
-; MIPS32R5EB-NEXT:    lhu $1, 54($sp)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    lhu $1, 52($sp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
-; MIPS32R5EB-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT:    sh $2, 46($sp)
-; MIPS32R5EB-NEXT:    sh $1, 44($sp)
-; MIPS32R5EB-NEXT:    lw $2, 44($sp)
+; MIPS32R5EB-NEXT:    sw $5, 16($sp)
+; MIPS32R5EB-NEXT:    sw $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.h $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.h $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.h $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    shf.h $w0, $w0, 177
+; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 64
+; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i16_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sw $5, 8($sp)
-; MIPS64R5-NEXT:    sw $4, 12($sp)
-; MIPS64R5-NEXT:    lh $1, 10($sp)
-; MIPS64R5-NEXT:    lh $2, 8($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    lh $1, 14($sp)
-; MIPS64R5-NEXT:    lh $2, 12($sp)
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sh $2, 6($sp)
-; MIPS64R5-NEXT:    sh $1, 4($sp)
-; MIPS64R5-NEXT:    lw $2, 4($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
-;
 ; MIPS32R5EL-LABEL: i16_2:
 ; MIPS32R5EL:       # %bb.0:
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -64
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 64
-; MIPS32R5EL-NEXT:    sw $ra, 60($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 56($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -48
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 48
+; MIPS32R5EL-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    sw $fp, 40($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
 ; MIPS32R5EL-NEXT:    move $fp, $sp
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sw $5, 48($sp)
-; MIPS32R5EL-NEXT:    sw $4, 52($sp)
-; MIPS32R5EL-NEXT:    lhu $1, 50($sp)
-; MIPS32R5EL-NEXT:    sw $1, 24($sp)
-; MIPS32R5EL-NEXT:    lhu $1, 48($sp)
-; MIPS32R5EL-NEXT:    sw $1, 16($sp)
-; MIPS32R5EL-NEXT:    lhu $1, 54($sp)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    lhu $1, 52($sp)
-; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT:    sh $2, 46($sp)
-; MIPS32R5EL-NEXT:    sh $1, 44($sp)
-; MIPS32R5EL-NEXT:    lw $2, 44($sp)
+; MIPS32R5EL-NEXT:    sw $5, 16($sp)
+; MIPS32R5EL-NEXT:    sw $4, 0($sp)
+; MIPS32R5EL-NEXT:    ld.h $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.h $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.h $w0, $w1, $w0
+; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 56($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 60($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 64
+; MIPS32R5EL-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EB-LABEL: i16_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sll $1, $5, 0
+; MIPS64R5EB-NEXT:    sw $1, 16($sp)
+; MIPS64R5EB-NEXT:    sll $1, $4, 0
+; MIPS64R5EB-NEXT:    sw $1, 0($sp)
+; MIPS64R5EB-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.h $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i16_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sll $1, $5, 0
+; MIPS64R5EL-NEXT:    sw $1, 16($sp)
+; MIPS64R5EL-NEXT:    sll $1, $4, 0
+; MIPS64R5EL-NEXT:    sw $1, 0($sp)
+; MIPS64R5EL-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
   %1 = add <2 x i16> %a, %b
   ret <2 x i16> %1
 }
@@ -1384,84 +1165,50 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sw $6, 24($sp)
-; MIPS32R5EB-NEXT:    sw $7, 28($sp)
-; MIPS32R5EB-NEXT:    lhu $1, 26($sp)
-; MIPS32R5EB-NEXT:    lhu $2, 24($sp)
-; MIPS32R5EB-NEXT:    sw $4, 32($sp)
-; MIPS32R5EB-NEXT:    insert.w $w0[0], $2
-; MIPS32R5EB-NEXT:    insert.w $w0[1], $1
-; MIPS32R5EB-NEXT:    lhu $1, 28($sp)
-; MIPS32R5EB-NEXT:    sw $5, 36($sp)
-; MIPS32R5EB-NEXT:    insert.w $w0[2], $1
-; MIPS32R5EB-NEXT:    lhu $1, 30($sp)
-; MIPS32R5EB-NEXT:    insert.w $w0[3], $1
-; MIPS32R5EB-NEXT:    lhu $1, 34($sp)
-; MIPS32R5EB-NEXT:    lhu $2, 32($sp)
-; MIPS32R5EB-NEXT:    insert.w $w1[0], $2
-; MIPS32R5EB-NEXT:    insert.w $w1[1], $1
-; MIPS32R5EB-NEXT:    lhu $1, 36($sp)
-; MIPS32R5EB-NEXT:    insert.w $w1[2], $1
-; MIPS32R5EB-NEXT:    lhu $1, 38($sp)
-; MIPS32R5EB-NEXT:    insert.w $w1[3], $1
-; MIPS32R5EB-NEXT:    addv.w $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EB-NEXT:    copy_s.w $4, $w0[3]
-; MIPS32R5EB-NEXT:    sh $4, 22($sp)
-; MIPS32R5EB-NEXT:    sh $3, 20($sp)
-; MIPS32R5EB-NEXT:    sh $2, 18($sp)
-; MIPS32R5EB-NEXT:    sh $1, 16($sp)
-; MIPS32R5EB-NEXT:    lw $1, 20($sp)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    lw $1, 16($sp)
-; MIPS32R5EB-NEXT:    sw $1, 4($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
+; MIPS32R5EB-NEXT:    sw $7, 20($sp)
+; MIPS32R5EB-NEXT:    sw $6, 16($sp)
+; MIPS32R5EB-NEXT:    sw $5, 4($sp)
+; MIPS32R5EB-NEXT:    sw $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.h $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.h $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.h $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    shf.h $w0, $w0, 177
+; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
 ; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EB-NEXT:    jr $ra
-; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i16_4:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sd $5, 16($sp)
-; MIPS64R5-NEXT:    sd $4, 24($sp)
-; MIPS64R5-NEXT:    lhu $1, 18($sp)
-; MIPS64R5-NEXT:    lhu $2, 16($sp)
-; MIPS64R5-NEXT:    insert.w $w0[0], $2
-; MIPS64R5-NEXT:    insert.w $w0[1], $1
-; MIPS64R5-NEXT:    lhu $1, 20($sp)
-; MIPS64R5-NEXT:    insert.w $w0[2], $1
-; MIPS64R5-NEXT:    lhu $1, 22($sp)
-; MIPS64R5-NEXT:    insert.w $w0[3], $1
-; MIPS64R5-NEXT:    lhu $1, 26($sp)
-; MIPS64R5-NEXT:    lhu $2, 24($sp)
-; MIPS64R5-NEXT:    insert.w $w1[0], $2
-; MIPS64R5-NEXT:    insert.w $w1[1], $1
-; MIPS64R5-NEXT:    lhu $1, 28($sp)
-; MIPS64R5-NEXT:    insert.w $w1[2], $1
-; MIPS64R5-NEXT:    lhu $1, 30($sp)
-; MIPS64R5-NEXT:    insert.w $w1[3], $1
-; MIPS64R5-NEXT:    addv.w $w0, $w1, $w0
-; MIPS64R5-NEXT:    copy_s.w $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.w $2, $w0[1]
-; MIPS64R5-NEXT:    copy_s.w $3, $w0[2]
-; MIPS64R5-NEXT:    copy_s.w $4, $w0[3]
-; MIPS64R5-NEXT:    sh $4, 14($sp)
-; MIPS64R5-NEXT:    sh $3, 12($sp)
-; MIPS64R5-NEXT:    sh $2, 10($sp)
-; MIPS64R5-NEXT:    sh $1, 8($sp)
-; MIPS64R5-NEXT:    ld $2, 8($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i16_4:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.h $w0, $w0, 27
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i16_4:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.h $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.h $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.h $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 ;
 ; MIPS32R5EL-LABEL: i16_4:
 ; MIPS32R5EL:       # %bb.0:
@@ -1475,42 +1222,15 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sw $6, 24($sp)
-; MIPS32R5EL-NEXT:    sw $7, 28($sp)
-; MIPS32R5EL-NEXT:    lhu $1, 26($sp)
-; MIPS32R5EL-NEXT:    lhu $2, 24($sp)
-; MIPS32R5EL-NEXT:    sw $4, 32($sp)
-; MIPS32R5EL-NEXT:    insert.w $w0[0], $2
-; MIPS32R5EL-NEXT:    insert.w $w0[1], $1
-; MIPS32R5EL-NEXT:    lhu $1, 28($sp)
-; MIPS32R5EL-NEXT:    sw $5, 36($sp)
-; MIPS32R5EL-NEXT:    insert.w $w0[2], $1
-; MIPS32R5EL-NEXT:    lhu $1, 30($sp)
-; MIPS32R5EL-NEXT:    insert.w $w0[3], $1
-; MIPS32R5EL-NEXT:    lhu $1, 34($sp)
-; MIPS32R5EL-NEXT:    lhu $2, 32($sp)
-; MIPS32R5EL-NEXT:    insert.w $w1[0], $2
-; MIPS32R5EL-NEXT:    insert.w $w1[1], $1
-; MIPS32R5EL-NEXT:    lhu $1, 36($sp)
-; MIPS32R5EL-NEXT:    insert.w $w1[2], $1
-; MIPS32R5EL-NEXT:    lhu $1, 38($sp)
-; MIPS32R5EL-NEXT:    insert.w $w1[3], $1
-; MIPS32R5EL-NEXT:    addv.w $w0, $w1, $w0
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT:    copy_s.w $4, $w0[3]
-; MIPS32R5EL-NEXT:    sh $4, 22($sp)
-; MIPS32R5EL-NEXT:    sh $3, 20($sp)
-; MIPS32R5EL-NEXT:    sh $2, 18($sp)
-; MIPS32R5EL-NEXT:    sh $1, 16($sp)
-; MIPS32R5EL-NEXT:    lw $1, 20($sp)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    lw $1, 16($sp)
-; MIPS32R5EL-NEXT:    sw $1, 0($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($sp)
+; MIPS32R5EL-NEXT:    sw $7, 20($sp)
+; MIPS32R5EL-NEXT:    sw $6, 16($sp)
+; MIPS32R5EL-NEXT:    sw $5, 4($sp)
+; MIPS32R5EL-NEXT:    sw $4, 0($sp)
+; MIPS32R5EL-NEXT:    ld.h $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.h $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.h $w0, $w1, $w0
 ; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
+; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
 ; MIPS32R5EL-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EL-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
@@ -1730,16 +1450,15 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    sw $7, 28($sp)
-; MIPS32R5EB-NEXT:    sw $6, 20($sp)
-; MIPS32R5EB-NEXT:    sw $5, 12($sp)
-; MIPS32R5EB-NEXT:    sw $4, 4($sp)
-; MIPS32R5EB-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EB-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS32R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
+; MIPS32R5EB-NEXT:    sw $7, 20($sp)
+; MIPS32R5EB-NEXT:    sw $6, 16($sp)
+; MIPS32R5EB-NEXT:    sw $5, 4($sp)
+; MIPS32R5EB-NEXT:    sw $4, 0($sp)
+; MIPS32R5EB-NEXT:    ld.w $w0, 16($sp)
+; MIPS32R5EB-NEXT:    ld.w $w1, 0($sp)
+; MIPS32R5EB-NEXT:    addv.w $w0, $w1, $w0
+; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EB-NEXT:    move $sp, $fp
 ; MIPS32R5EB-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
@@ -1751,18 +1470,13 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS64R5EB:       # %bb.0:
 ; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    dsrl $1, $5, 32
-; MIPS64R5EB-NEXT:    insert.d $w0[0], $1
-; MIPS64R5EB-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EB-NEXT:    dsrl $1, $4, 32
-; MIPS64R5EB-NEXT:    insert.d $w1[0], $1
-; MIPS64R5EB-NEXT:    insert.d $w1[1], $4
-; MIPS64R5EB-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5EB-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5EB-NEXT:    sw $2, 12($sp)
-; MIPS64R5EB-NEXT:    sw $1, 8($sp)
-; MIPS64R5EB-NEXT:    ld $2, 8($sp)
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.w $w1, 0($sp)
+; MIPS64R5EB-NEXT:    addv.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
@@ -1779,15 +1493,15 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    sw $7, 24($sp)
+; MIPS32R5EL-NEXT:    sw $7, 20($sp)
 ; MIPS32R5EL-NEXT:    sw $6, 16($sp)
-; MIPS32R5EL-NEXT:    sw $5, 8($sp)
+; MIPS32R5EL-NEXT:    sw $5, 4($sp)
 ; MIPS32R5EL-NEXT:    sw $4, 0($sp)
-; MIPS32R5EL-NEXT:    ld.d $w0, 16($sp)
-; MIPS32R5EL-NEXT:    ld.d $w1, 0($sp)
-; MIPS32R5EL-NEXT:    addv.d $w0, $w1, $w0
+; MIPS32R5EL-NEXT:    ld.w $w0, 16($sp)
+; MIPS32R5EL-NEXT:    ld.w $w1, 0($sp)
+; MIPS32R5EL-NEXT:    addv.w $w0, $w1, $w0
 ; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
+; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[1]
 ; MIPS32R5EL-NEXT:    move $sp, $fp
 ; MIPS32R5EL-NEXT:    lw $fp, 40($sp) # 4-byte Folded Reload
 ; MIPS32R5EL-NEXT:    lw $ra, 44($sp) # 4-byte Folded Reload
@@ -1800,19 +1514,11 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS64R5EL-NEXT:    sd $5, 16($sp)
-; MIPS64R5EL-NEXT:    sd $4, 24($sp)
-; MIPS64R5EL-NEXT:    lw $1, 20($sp)
-; MIPS64R5EL-NEXT:    insert.d $w0[0], $5
-; MIPS64R5EL-NEXT:    insert.d $w0[1], $1
-; MIPS64R5EL-NEXT:    lw $1, 28($sp)
-; MIPS64R5EL-NEXT:    insert.d $w1[0], $4
-; MIPS64R5EL-NEXT:    insert.d $w1[1], $1
-; MIPS64R5EL-NEXT:    addv.d $w0, $w1, $w0
-; MIPS64R5EL-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5EL-NEXT:    sw $2, 12($sp)
-; MIPS64R5EL-NEXT:    sw $1, 8($sp)
-; MIPS64R5EL-NEXT:    ld $2, 8($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.w $w1, 0($sp)
+; MIPS64R5EL-NEXT:    addv.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
@@ -2561,31 +2267,11 @@ define <8 x i8> @ret_8_i8() {
 ;
 ; MIPS32R5EB-LABEL: ret_8_i8:
 ; MIPS32R5EB:       # %bb.0:
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EB-NEXT:    move $fp, $sp
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EB-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EB-NEXT:    lui $1, %hi(gv8i8)
 ; MIPS32R5EB-NEXT:    lw $2, %lo(gv8i8)($1)
-; MIPS32R5EB-NEXT:    sw $2, 4($sp)
 ; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv8i8)
-; MIPS32R5EB-NEXT:    lw $1, 4($1)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT:    lw $3, 4($1)
 ; MIPS32R5EB-NEXT:    jr $ra
-; MIPS32R5EB-NEXT:    nop
 ;
 ; MIPS64R5-LABEL: ret_8_i8:
 ; MIPS64R5:       # %bb.0:
@@ -2599,29 +2285,10 @@ define <8 x i8> @ret_8_i8() {
 ;
 ; MIPS32R5EL-LABEL: ret_8_i8:
 ; MIPS32R5EL:       # %bb.0:
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EL-NEXT:    move $fp, $sp
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EL-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EL-NEXT:    lui $1, %hi(gv8i8)
 ; MIPS32R5EL-NEXT:    lw $2, %lo(gv8i8)($1)
-; MIPS32R5EL-NEXT:    sw $2, 0($sp)
 ; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv8i8)
-; MIPS32R5EL-NEXT:    lw $1, 4($1)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT:    lw $3, 4($1)
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
   %1 = load <8 x i8>, ptr @gv8i8
@@ -2738,29 +2405,10 @@ define <4 x i16> @ret_4_i16() {
 ;
 ; MIPS32R5EB-LABEL: ret_4_i16:
 ; MIPS32R5EB:       # %bb.0:
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EB-NEXT:    move $fp, $sp
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EB-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EB-NEXT:    lui $1, %hi(gv4i16)
 ; MIPS32R5EB-NEXT:    lw $2, %lo(gv4i16)($1)
-; MIPS32R5EB-NEXT:    sw $2, 4($sp)
 ; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv4i16)
-; MIPS32R5EB-NEXT:    lw $1, 4($1)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT:    lw $3, 4($1)
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
@@ -2776,29 +2424,10 @@ define <4 x i16> @ret_4_i16() {
 ;
 ; MIPS32R5EL-LABEL: ret_4_i16:
 ; MIPS32R5EL:       # %bb.0:
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EL-NEXT:    move $fp, $sp
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EL-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EL-NEXT:    lui $1, %hi(gv4i16)
 ; MIPS32R5EL-NEXT:    lw $2, %lo(gv4i16)($1)
-; MIPS32R5EL-NEXT:    sw $2, 0($sp)
 ; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv4i16)
-; MIPS32R5EL-NEXT:    lw $1, 4($1)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT:    lw $3, 4($1)
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
   %1 = load <4 x i16>, ptr @gv4i16
@@ -2877,29 +2506,10 @@ define <2 x i32> @ret_2_i32() {
 ;
 ; MIPS32R5EB-LABEL: ret_2_i32:
 ; MIPS32R5EB:       # %bb.0:
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EB-NEXT:    move $fp, $sp
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EB-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EB-NEXT:    lui $1, %hi(gv2i32)
 ; MIPS32R5EB-NEXT:    lw $2, %lo(gv2i32)($1)
-; MIPS32R5EB-NEXT:    sw $2, 4($sp)
 ; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv2i32)
-; MIPS32R5EB-NEXT:    lw $1, 4($1)
-; MIPS32R5EB-NEXT:    sw $1, 12($sp)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT:    move $sp, $fp
-; MIPS32R5EB-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT:    lw $3, 4($1)
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
@@ -2915,29 +2525,10 @@ define <2 x i32> @ret_2_i32() {
 ;
 ; MIPS32R5EL-LABEL: ret_2_i32:
 ; MIPS32R5EL:       # %bb.0:
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    .cfi_offset 30, -8
-; MIPS32R5EL-NEXT:    move $fp, $sp
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
-; MIPS32R5EL-NEXT:    and $sp, $sp, $1
 ; MIPS32R5EL-NEXT:    lui $1, %hi(gv2i32)
 ; MIPS32R5EL-NEXT:    lw $2, %lo(gv2i32)($1)
-; MIPS32R5EL-NEXT:    sw $2, 0($sp)
 ; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv2i32)
-; MIPS32R5EL-NEXT:    lw $1, 4($1)
-; MIPS32R5EL-NEXT:    sw $1, 8($sp)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT:    move $sp, $fp
-; MIPS32R5EL-NEXT:    lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT:    lw $3, 4($1)
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
   %1 = load <2 x i32>, ptr @gv2i32
@@ -3424,9 +3015,9 @@ define void @call_i8_4() {
 ;
 ; MIPS32R5EB-LABEL: call_i8_4:
 ; MIPS32R5EB:       # %bb.0: # %entry
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -24
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32R5EB-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EB-NEXT:    lui $1, 1543
 ; MIPS32R5EB-NEXT:    ori $4, $1, 2314
@@ -3436,17 +3027,17 @@ define void @call_i8_4() {
 ; MIPS32R5EB-NEXT:    nop
 ; MIPS32R5EB-NEXT:    lui $1, %hi(gv4i8)
 ; MIPS32R5EB-NEXT:    sw $2, %lo(gv4i8)($1)
-; MIPS32R5EB-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
 ; MIPS64R5EB-LABEL: call_i8_4:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(call_i8_4)))
@@ -3461,9 +3052,9 @@ define void @call_i8_4() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv4i8)($gp)
 ; MIPS64R5EB-NEXT:    sw $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -3512,9 +3103,9 @@ define void @call_i8_4() {
 ;
 ; MIPS32R5EL-LABEL: call_i8_4:
 ; MIPS32R5EL:       # %bb.0: # %entry
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -24
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32R5EL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    lui $1, 2569
 ; MIPS32R5EL-NEXT:    ori $4, $1, 1798
@@ -3523,17 +3114,17 @@ define void @call_i8_4() {
 ; MIPS32R5EL-NEXT:    nop
 ; MIPS32R5EL-NEXT:    lui $1, %hi(gv4i8)
 ; MIPS32R5EL-NEXT:    sw $2, %lo(gv4i8)($1)
-; MIPS32R5EL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
 ;
 ; MIPS64R5EL-LABEL: call_i8_4:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(call_i8_4)))
@@ -3547,9 +3138,9 @@ define void @call_i8_4() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv4i8)($gp)
 ; MIPS64R5EL-NEXT:    sw $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
@@ -3641,10 +3232,10 @@ define void @call_i8_8() {
 ;
 ; MIPS64R5EB-LABEL: call_i8_8:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(call_i8_8)))
@@ -3667,9 +3258,9 @@ define void @call_i8_8() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv8i8)($gp)
 ; MIPS64R5EB-NEXT:    sd $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -3748,10 +3339,10 @@ define void @call_i8_8() {
 ;
 ; MIPS64R5EL-LABEL: call_i8_8:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(call_i8_8)))
@@ -3769,9 +3360,9 @@ define void @call_i8_8() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv8i8)($gp)
 ; MIPS64R5EL-NEXT:    sd $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
@@ -4059,9 +3650,9 @@ define void @calli16_2() {
 ;
 ; MIPS32R5EB-LABEL: calli16_2:
 ; MIPS32R5EB:       # %bb.0: # %entry
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -24
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32R5EB-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EB-NEXT:    lui $1, 6
 ; MIPS32R5EB-NEXT:    ori $4, $1, 7
@@ -4071,17 +3662,17 @@ define void @calli16_2() {
 ; MIPS32R5EB-NEXT:    nop
 ; MIPS32R5EB-NEXT:    lui $1, %hi(gv2i16)
 ; MIPS32R5EB-NEXT:    sw $2, %lo(gv2i16)($1)
-; MIPS32R5EB-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EB-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
 ; MIPS64R5EB-LABEL: calli16_2:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4096,9 +3687,9 @@ define void @calli16_2() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv2i16)($gp)
 ; MIPS64R5EB-NEXT:    sw $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -4149,9 +3740,9 @@ define void @calli16_2() {
 ;
 ; MIPS32R5EL-LABEL: calli16_2:
 ; MIPS32R5EL:       # %bb.0: # %entry
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -24
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32R5EL-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    lui $1, 7
 ; MIPS32R5EL-NEXT:    ori $4, $1, 6
@@ -4161,17 +3752,17 @@ define void @calli16_2() {
 ; MIPS32R5EL-NEXT:    nop
 ; MIPS32R5EL-NEXT:    lui $1, %hi(gv2i16)
 ; MIPS32R5EL-NEXT:    sw $2, %lo(gv2i16)($1)
-; MIPS32R5EL-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT:    addiu $sp, $sp, 32
+; MIPS32R5EL-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
 ;
 ; MIPS64R5EL-LABEL: calli16_2:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_2)))
@@ -4186,9 +3777,9 @@ define void @calli16_2() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv2i16)($gp)
 ; MIPS64R5EL-NEXT:    sw $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
@@ -4282,10 +3873,10 @@ define void @calli16_4() {
 ;
 ; MIPS64R5EB-LABEL: calli16_4:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_4)))
@@ -4308,9 +3899,9 @@ define void @calli16_4() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv4i16)($gp)
 ; MIPS64R5EB-NEXT:    sd $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -4398,10 +3989,10 @@ define void @calli16_4() {
 ;
 ; MIPS64R5EL-LABEL: calli16_4:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_4)))
@@ -4424,9 +4015,9 @@ define void @calli16_4() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv4i16)($gp)
 ; MIPS64R5EL-NEXT:    sd $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
@@ -4807,10 +4398,10 @@ define void @calli32_2() {
 ;
 ; MIPS64R5EB-LABEL: calli32_2:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EB-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EB-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4826,9 +4417,9 @@ define void @calli32_2() {
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv2i32)($gp)
 ; MIPS64R5EB-NEXT:    sd $2, 0($1)
-; MIPS64R5EB-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -4862,10 +4453,10 @@ define void @calli32_2() {
 ;
 ; MIPS64R5EL-LABEL: calli32_2:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5EL-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64R5EL-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
 ; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
 ; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_2)))
@@ -4882,9 +4473,9 @@ define void @calli32_2() {
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv2i32)($gp)
 ; MIPS64R5EL-NEXT:    sd $2, 0($1)
-; MIPS64R5EL-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 820259d7c7bc25..4fc3f57aa002df 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -2066,46 +2066,38 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
   ret void
 }
 
-; TODO: What code should be emitted?
-define void @truncstore() nounwind {
-; O32-LABEL: truncstore:
+; After legalizing shorter vectors with legal element sizes, this test is
+; no longer called truncstore.
+define void @store_i8_32bit() nounwind {
+; O32-LABEL: store_i8_32bit:
 ; O32:       # %bb.0:
 ; O32-NEXT:    lui $2, %hi(_gp_disp)
 ; O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
 ; O32-NEXT:    addu $1, $2, $25
 ; O32-NEXT:    lw $1, %got(v4i8)($1)
-; O32-NEXT:    addiu $2, $zero, 255
-; O32-NEXT:    sb $2, 3($1)
-; O32-NEXT:    sb $2, 2($1)
-; O32-NEXT:    sb $2, 1($1)
+; O32-NEXT:    addiu $2, $zero, -1
 ; O32-NEXT:    jr $ra
-; O32-NEXT:    sb $2, 0($1)
+; O32-NEXT:    sw $2, 0($1)
 ;
-; N32-LABEL: truncstore:
+; N32-LABEL: store_i8_32bit:
 ; N32:       # %bb.0:
-; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(truncstore)))
+; N32-NEXT:    lui $1, %hi(%neg(%gp_rel(store_i8_32bit)))
 ; N32-NEXT:    addu $1, $1, $25
-; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(truncstore)))
+; N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(store_i8_32bit)))
 ; N32-NEXT:    lw $1, %got_disp(v4i8)($1)
-; N32-NEXT:    addiu $2, $zero, 255
-; N32-NEXT:    sb $2, 3($1)
-; N32-NEXT:    sb $2, 2($1)
-; N32-NEXT:    sb $2, 1($1)
+; N32-NEXT:    addiu $2, $zero, -1
 ; N32-NEXT:    jr $ra
-; N32-NEXT:    sb $2, 0($1)
+; N32-NEXT:    sw $2, 0($1)
 ;
-; N64-LABEL: truncstore:
+; N64-LABEL: store_i8_32bit:
 ; N64:       # %bb.0:
-; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(truncstore)))
+; N64-NEXT:    lui $1, %hi(%neg(%gp_rel(store_i8_32bit)))
 ; N64-NEXT:    daddu $1, $1, $25
-; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(truncstore)))
+; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(store_i8_32bit)))
 ; N64-NEXT:    ld $1, %got_disp(v4i8)($1)
-; N64-NEXT:    addiu $2, $zero, 255
-; N64-NEXT:    sb $2, 3($1)
-; N64-NEXT:    sb $2, 2($1)
-; N64-NEXT:    sb $2, 1($1)
+; N64-NEXT:    addiu $2, $zero, -1
 ; N64-NEXT:    jr $ra
-; N64-NEXT:    sb $2, 0($1)
+; N64-NEXT:    sw $2, 0($1)
   store volatile <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, ptr @v4i8
   ret void
 }

>From 4bf95bc72d52dc80603ab86ff4774cc3226ede85 Mon Sep 17 00:00:00 2001
From: Xinhui Yang <cyan at cyano.uk>
Date: Wed, 15 Jan 2025 12:00:24 +0800
Subject: [PATCH 2/2] [MIPS][MSA] Handle UNDEFs in shuffle indices for VSHF

Currently the VSHF lowering does not handle UNDEF indices. However,
isSPLATI() accepts masks containing UNDEFs, so indices with UNDEFs may
be passed to the VSHF lowering. Add a check to handle UNDEFs in the
shuffle indices.

Also, shuffle masks widened from v2 vector types are guaranteed to
contain UNDEFs. The other shuffle lowering routines can already handle
UNDEFs, so they are left as is; only VSHF needs the UNDEFs filled in
with a concrete index.
---
 llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 26 +++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index edfa6efd66972f..71a70d9c2dd466 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/TargetParser/Triple.h"
 #include <algorithm>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <iterator>
 #include <utility>
@@ -2968,8 +2969,14 @@ static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
 // if the type is v8i16 and all the indices are less than 8 then the second
 // operand is unused and can be replaced with anything. We choose to replace it
 // with the used operand since this reduces the number of instructions overall.
+//
+// NOTE: SPLATI shuffle masks may contain UNDEFs, since isSPLATI() treats
+//       an UNDEF the same as the SPLATI index.
+//       For other masks, we use the last valid index whenever an UNDEF
+//       is encountered.
 static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
                                         const SmallVector<int, 16> &Indices,
+                                        const bool isSPLATI,
                                         SelectionDAG &DAG) {
   SmallVector<SDValue, 16> Ops;
   SDValue Op0;
@@ -2981,6 +2988,9 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
   SDLoc DL(Op);
   int ResTyNumElts = ResTy.getVectorNumElements();
 
+  assert(Indices[0] >= 0 &&
+         "shuffle mask starts with an UNDEF, which is not expected");
+
   for (int i = 0; i < ResTyNumElts; ++i) {
     // Idx == -1 means UNDEF
     int Idx = Indices[i];
@@ -2990,9 +3000,17 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
     if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
       Using2ndVec = true;
   }
-
-  for (int Idx : Indices)
+  int LastValidIndex = 0;
+  for (size_t i = 0; i < Indices.size(); i++) {
+    int Idx = Indices[i];
+    if (Idx < 0) {
+      // Continue using splati index or use the last valid index.
+      Idx = isSPLATI ? Indices[0] : LastValidIndex;
+    } else {
+      LastValidIndex = Idx;
+    }
     Ops.push_back(DAG.getTargetConstant(Idx, DL, MaskEltTy));
+  }
 
   SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
 
@@ -3035,7 +3053,7 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   // splati.[bhwd] is preferable to the others but is matched from
   // MipsISD::VSHF.
   if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG))
-    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+    return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, true, DAG);
   SDValue Result;
   if ((Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG)))
     return Result;
@@ -3051,7 +3069,7 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
     return Result;
   if ((Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG)))
     return Result;
-  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, false, DAG);
 }
 
 MachineBasicBlock *



More information about the llvm-commits mailing list