[llvm] c305557 - [llvm][sve] Lowering for VLS truncating stores

David Truby via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 12 03:14:30 PDT 2021


Author: David Truby
Date: 2021-07-12T11:14:17+01:00
New Revision: c305557acdaad453e32309d575fe9c6c7090c099

URL: https://github.com/llvm/llvm-project/commit/c305557acdaad453e32309d575fe9c6c7090c099
DIFF: https://github.com/llvm/llvm-project/commit/c305557acdaad453e32309d575fe9c6c7090c099.diff

LOG: [llvm][sve] Lowering for VLS truncating stores

This adds custom lowering for truncating stores when operating on
fixed-length vectors in SVE. It also includes a DAG combine to
fold extends followed by truncating stores into non-truncating
stores, in order to prevent this pattern from appearing once
truncating stores are supported.
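
As a rough illustration (mirroring the new
sve-fixed-length-trunc-stores.ll test rather than reproducing it
exactly), a fixed-length trunc-plus-store such as:

  define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
    %a = load <2 x i64>, <2 x i64>* %ap
    %val = trunc <2 x i64> %a to <2 x i8>
    store <2 x i8> %val, <2 x i8>* %dest
    ret void
  }
  attributes #0 = { "target-features"="+sve" }

can now be selected to a single predicated truncating store (st1b)
when the SVE fixed-length lowering is in use, rather than going
through an explicit truncate in vector registers before a plain
store.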

Truncating stores are currently not used in certain cases where the
vector being stored is wider than the target's SVE vector length;
the VBITS_EQ_256 checks in the new test note where this happens.

Differential Revision: https://reviews.llvm.org/D104471

Added: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AMDGPU/R600ISelLowering.h
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
    llvm/test/CodeGen/Mips/cconv/byval.ll
    llvm/test/CodeGen/Mips/cconv/vector.ll
    llvm/test/CodeGen/Mips/llvm-ir/store.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 1cf6ac0e5f949..2fda9c66ce6c7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1271,6 +1271,14 @@ class TargetLoweringBase {
        getTruncStoreAction(ValVT, MemVT) == Custom);
   }
 
+  virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+                                    bool LegalOnly) const {
+    if (LegalOnly)
+      return isTruncStoreLegal(ValVT, MemVT);
+
+    return isTruncStoreLegalOrCustom(ValVT, MemVT);
+  }
+
   /// Return how the indexed load should be treated: either it is legal, needs
   /// to be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f02d7621959a5..1a47af6fc16fc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18090,10 +18090,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
   // truncating store.  We can do this even if this is already a truncstore.
-  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
-      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
-      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
-                            ST->getMemoryVT())) {
+  if ((Value.getOpcode() == ISD::FP_ROUND ||
+       Value.getOpcode() == ISD::TRUNCATE) &&
+      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+      TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+                               ST->getMemoryVT(), LegalOperations)) {
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                              Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5caab2e85486e..c6cf957df9881 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1240,6 +1240,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
+    // SVE supports truncating stores of 64 and 128-bit vectors
+    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
     for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                     MVT::nxv4f32, MVT::nxv2f64}) {
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1486,6 +1493,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
     setCondCodeAction(ISD::SETUNE, VT, Expand);
   }
 
+  // Mark integer truncating stores as having custom lowering
+  if (VT.isInteger()) {
+    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
+    while (InnerVT != VT) {
+      setTruncStoreAction(VT, InnerVT, Custom);
+      InnerVT = InnerVT.changeVectorElementType(
+          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
+    }
+  }
+
   // Lower fixed length vector operations to scalable equivalents.
   setOperationAction(ISD::ABS, VT, Custom);
   setOperationAction(ISD::ADD, VT, Custom);
@@ -4552,7 +4569,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
   EVT MemVT = StoreNode->getMemoryVT();
 
   if (VT.isVector()) {
-    if (useSVEForFixedLengthVectorVT(VT))
+    if (useSVEForFixedLengthVectorVT(VT, true))
       return LowerFixedLengthVectorStoreToSVE(Op, DAG);
 
     unsigned AS = StoreNode->getAddressSpace();
@@ -4564,7 +4581,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
       return scalarizeVectorStore(StoreNode, DAG);
     }
 
-    if (StoreNode->isTruncatingStore()) {
+    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
+        MemVT == MVT::v4i8) {
       return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
     }
     // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
@@ -15145,6 +15163,30 @@ static bool performTBISimplification(SDValue Addr,
   return false;
 }
 
+static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
+  auto OpCode = N->getOpcode();
+  assert((OpCode == ISD::STORE || OpCode == ISD::MSTORE) &&
+         "Expected STORE dag node in input!");
+
+  if (auto Store = dyn_cast<StoreSDNode>(N)) {
+    if (!Store->isTruncatingStore())
+      return SDValue();
+    SDValue Ext = Store->getValue();
+    auto ExtOpCode = Ext.getOpcode();
+    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
+        ExtOpCode != ISD::ANY_EXTEND)
+      return SDValue();
+    SDValue Orig = Ext->getOperand(0);
+    if (Store->getMemoryVT() != Orig->getValueType(0))
+      return SDValue();
+    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
+                        Store->getBasePtr(), Store->getPointerInfo(),
+                        Store->getAlign());
+  }
+
+  return SDValue();
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -15156,6 +15198,9 @@ static SDValue performSTORECombine(SDNode *N,
       performTBISimplification(N->getOperand(2), DCI, DAG))
     return SDValue(N, 0);
 
+  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
+    return Store;
+
   return SDValue();
 }
 

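The foldTruncStoreOfExt combine added above folds a truncating store
of an extended value back into a plain store of the original narrow
value when the extend's source type matches the memory type. Together
with the new truncating-store lowering, this is what improves the
masked-gather tests further down; a reduced, standalone sketch of
that shape (assuming the same +sve fixed-length configuration as
those tests, not copied verbatim from them) is:

  declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>)

  define void @gather_store_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
    %cval = load <2 x i8>, <2 x i8>* %a
    %ptrs = load <2 x i8*>, <2 x i8*>* %b
    %mask = icmp eq <2 x i8> %cval, zeroinitializer
    %vals = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 8, <2 x i1> %mask, <2 x i8> undef)
    store <2 x i8> %vals, <2 x i8>* %a
    ret void
  }

  attributes #0 = { "target-features"="+sve" }

In that test the gather is emitted as an extending ld1sb into .d
lanes, so the value being stored is an extend of the original i8
data; the final store now becomes a single predicated st1b instead of
the scalar strb sequence the old checks expected.
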
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 9813dd0fb504e..920cf3cd97ef7 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -54,6 +54,15 @@ class R600TargetLowering final : public AMDGPUTargetLowering {
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
       bool *IsFast = nullptr) const override;
 
+  virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+                                    bool LegalOperations) const override {
+    // R600 has "custom" lowering for truncating stores despite not supporting
+    // those instructions. If we allow that custom lowering in the DAG combiner
+    // then all truncates are merged into truncating stores, giving worse code
+    // generation. This hook prevents the DAG combiner from doing that combine.
+    return isTruncStoreLegal(ValVT, MemVT);
+  }
+
 private:
   unsigned Gen;
   /// Each OpenCL kernel has nine implicit parameters that are stored in the

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index d8b08461b6ee1..5ee33342460c9 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -36,10 +36,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
 ; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
 ; CHECK-NEXT: ld1sb { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strb [[RES_LO]], [x0]
-; CHECK-NEXT: strb [[RES_HI]], [x0, #1]
+; CHECK-NEXT: st1b { z[[XTN]].s }, [[PG0]],  [x0]
 ; CHECK-NEXT: ret
   %cval = load <2 x i8>, <2 x i8>* %a
   %ptrs = load <2 x i8*>, <2 x i8*>* %b
@@ -61,8 +58,7 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 ; CHECK-NEXT: ld1sb { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
 ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
-; CHECK-NEXT: uzp1 v[[UZP3:[0-9]+]].8b, v[[UZP2]].8b, v[[UZP2]].8b
-; CHECK-NEXT: str s[[UZP3]], [x0]
+; CHECK-NEXT: st1b { z[[UZP2]].h }, [[PG0]], [x0]
 ; CHECK-NEXT: ret
   %cval = load <4 x i8>, <4 x i8>* %a
   %ptrs = load <4 x i8*>, <4 x i8*>* %b
@@ -178,10 +174,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
 ; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
 ; CHECK-NEXT: ld1sh { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strh [[RES_LO]], [x0]
-; CHECK-NEXT: strh [[RES_HI]], [x0, #2]
+; CHECK-NEXT: st1h { z[[RES]].s }, [[PG0]], [x0]
 ; CHECK-NEXT: ret
   %cval = load <2 x i16>, <2 x i16>* %a
   %ptrs = load <2 x i16*>, <2 x i16*>* %b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
new file mode 100644
index 0000000000000..5db9d4008d56e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -0,0 +1,218 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8
+; CHECK: ldr q[[Q0:[0-9]+]], [x0]
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
+; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+  %a = load <2 x i64>, <2 x i64>* %ap
+  %val = trunc <2 x i64> %a to <2 x i8>
+  store <2 x i8> %val, <2 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; CHECK-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+  %a = load <4 x i64>, <4 x i64>* %ap
+  %val = trunc <4 x i64> %a to <4 x i8>
+  store <4 x i8> %val, <4 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1b { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i8>
+  store <8 x i8> %val, <8 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i64i8:
+; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: ret
+  %a = load <16 x i64>, <16 x i64>* %ap
+  %val = trunc <16 x i64> %a to <16 x i8>
+  store <16 x i8> %val, <16 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i64i8:
+; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: ret
+  %a = load <32 x i64>, <32 x i64>* %ap
+  %val = trunc <32 x i64> %a to <32 x i8>
+  store <32 x i8> %val, <32 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG:	ptrue	[[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d	{ [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d	{ [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1	[[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1	[[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: uzp1	[[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG:	uzp1	[[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG:	mov	v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG:	str	q[[V0]], [x1]
+; VBITS_EQ_256-DAG:	ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i16>
+  store <8 x i16> %val, <8 x i16>* %dest
+  ret void
+}
+
+define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1w { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <8 x i64>, <8 x i64>* %ap
+  %val = trunc <8 x i64> %a to <8 x i32>
+  store <8 x i32> %val, <8 x i32>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG:	ptrue	[[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w	{ [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w	{ [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1	[[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1	[[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: uzp1	[[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG:	uzp1	[[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG:	mov	v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG:	str	q[[V0]], [x1]
+; VBITS_EQ_256-DAG:	ret
+  %a = load <16 x i32>, <16 x i32>* %ap
+  %val = trunc <16 x i32> %a to <16 x i8>
+  store <16 x i8> %val, <16 x i8>* %dest
+  ret void
+}
+
+define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl8
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: splice [[Z1]].h, [[PG]], [[Z1]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl16
+; VBITS_EQ_256-DAG: st1h { [[Z1]].h }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <16 x i32>, <16 x i32>* %ap
+  %val = trunc <16 x i32> %a to <16 x i16>
+  store <16 x i16> %val, <16 x i16>* %dest
+  ret void
+}
+
+define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1h { [[Z1:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl16
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG: splice [[Z1]].b, [[PG]], [[Z1]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl32
+; VBITS_EQ_256-DAG: st1b { [[Z1]].b }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+  %a = load <32 x i16>, <32 x i16>* %ap
+  %val = trunc <32 x i16> %a to <32 x i8>
+  store <32 x i8> %val, <32 x i8>* %dest
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+sve" }

diff --git a/llvm/test/CodeGen/Mips/cconv/byval.ll b/llvm/test/CodeGen/Mips/cconv/byval.ll
index 180563d1ec943..5d77107d5966a 100644
--- a/llvm/test/CodeGen/Mips/cconv/byval.ll
+++ b/llvm/test/CodeGen/Mips/cconv/byval.ll
@@ -255,10 +255,9 @@ define dso_local void @g2(%struct.S1* %a) {
 ; N32-NEXT:    .cfi_offset 31, -8
 ; N32-NEXT:    .cfi_offset 16, -16
 ; N32-NEXT:    move $5, $4
-; N32-NEXT:    sll $1, $5, 0
-; N32-NEXT:    lui $2, 1
-; N32-NEXT:    addu $2, $sp, $2
-; N32-NEXT:    sw $1, -4($2)
+; N32-NEXT:    lui $1, 1
+; N32-NEXT:    addu $1, $sp, $1
+; N32-NEXT:    sw $4, -4($1)
 ; N32-NEXT:    addiu $16, $sp, 8
 ; N32-NEXT:    ori $6, $zero, 65520
 ; N32-NEXT:    jal memcpy
@@ -389,10 +388,8 @@ define dso_local i32 @g3(%struct.S1* %a, %struct.S1* %b) #0 {
 ; N32-NEXT:    .cfi_def_cfa_offset 16
 ; N32-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
 ; N32-NEXT:    .cfi_offset 31, -8
-; N32-NEXT:    sll $1, $5, 0
-; N32-NEXT:    sw $1, 0($sp)
-; N32-NEXT:    sll $1, $4, 0
-; N32-NEXT:    sw $1, 4($sp)
+; N32-NEXT:    sw $5, 0($sp)
+; N32-NEXT:    sw $4, 4($sp)
 ; N32-NEXT:    jal memcpy
 ; N32-NEXT:    ori $6, $zero, 65520
 ; N32-NEXT:    addiu $2, $zero, 4

diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 723a71b52fedf..2187a9b9bf6a0 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -554,10 +554,8 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
 ; MIPS64R5:       # %bb.0:
 ; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sll $1, $5, 0
-; MIPS64R5-NEXT:    sw $1, 8($sp)
-; MIPS64R5-NEXT:    sll $1, $4, 0
-; MIPS64R5-NEXT:    sw $1, 12($sp)
+; MIPS64R5-NEXT:    sw $5, 8($sp)
+; MIPS64R5-NEXT:    sw $4, 12($sp)
 ; MIPS64R5-NEXT:    lbu $1, 9($sp)
 ; MIPS64R5-NEXT:    lbu $2, 8($sp)
 ; MIPS64R5-NEXT:    insert.w $w0[0], $2
@@ -1263,10 +1261,8 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
 ; MIPS64R5:       # %bb.0:
 ; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sll $1, $5, 0
-; MIPS64R5-NEXT:    sw $1, 8($sp)
-; MIPS64R5-NEXT:    sll $1, $4, 0
-; MIPS64R5-NEXT:    sw $1, 12($sp)
+; MIPS64R5-NEXT:    sw $5, 8($sp)
+; MIPS64R5-NEXT:    sw $4, 12($sp)
 ; MIPS64R5-NEXT:    lh $1, 10($sp)
 ; MIPS64R5-NEXT:    lh $2, 8($sp)
 ; MIPS64R5-NEXT:    insert.d $w0[0], $2

diff --git a/llvm/test/CodeGen/Mips/llvm-ir/store.ll b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
index 975eb8b90f042..e96165d369df8 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/store.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
@@ -285,65 +285,57 @@ define void @f3(i32 %a) {
 ;
 ; MIPS4-LABEL: f3:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    sll $1, $4, 0 # <MCInst #{{[0-9]+}} SLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Imm:0>>
-; MIPS4-NEXT:    lui $2, %highest(c) # <MCInst #{{[0-9]+}} LUi64
+; MIPS4-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS4-NEXT:    daddiu $2, $2, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS4-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS4-NEXT:    dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $2, $2, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS4-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS4-NEXT:    dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
 ; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sw $1, %lo(c)($2) # <MCInst #{{[0-9]+}} SW
+; MIPS4-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS4-NEXT:    # <MCOperand Expr:(%lo(c))>>
 ;
 ; MIPS64R6-LABEL: f3:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    sll $1, $4, 0 # <MCInst #{{[0-9]+}} SLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Imm:0>>
-; MIPS64R6-NEXT:    lui $2, %highest(c) # <MCInst #{{[0-9]+}} LUi64
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $2, $2, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $2, $2, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
 ; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sw $1, %lo(c)($2) # <MCInst #{{[0-9]+}} SW
+; MIPS64R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
 ; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>


        

