[llvm] 1528a4d - [llvm][sve] Lowering for VLS truncating stores
David Truby via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 23 06:05:08 PDT 2021
Author: David Truby
Date: 2021-07-23T14:04:55+01:00
New Revision: 1528a4d40022925dcc3e8cb6b8af7dd109ad7075
URL: https://github.com/llvm/llvm-project/commit/1528a4d40022925dcc3e8cb6b8af7dd109ad7075
DIFF: https://github.com/llvm/llvm-project/commit/1528a4d40022925dcc3e8cb6b8af7dd109ad7075.diff
LOG: [llvm][sve] Lowering for VLS truncating stores
This adds custom lowering for truncating stores when operating on
fixed-length vectors in SVE. It also includes a DAG combine to
fold extends followed by truncating stores into non-truncating
stores, in order to prevent this pattern from appearing once
truncating stores are supported.
Currently, truncating stores are not used in certain cases where
the size of the vector is larger than the target vector width.
Differential Revision: https://reviews.llvm.org/D104471
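To illustrate the kind of input this patch targets (a hypothetical sketch
modelled on the new test file, not part of the commit itself), a
fixed-length truncate feeding a store can now be selected as a single SVE
truncating store such as st1b when the SVE registers are wide enough,
rather than being scalarised:

    ; Hypothetical example; mirrors store_trunc_v8i64i8 from the new test.
    define void @trunc_store_example(<8 x i64>* %src, <8 x i8>* %dst) #0 {
      %v = load <8 x i64>, <8 x i64>* %src
      %t = trunc <8 x i64> %v to <8 x i8>   ; narrow each lane to i8
      store <8 x i8> %t, <8 x i8>* %dst     ; now lowered to an SVE st1b
      ret void
    }
    attributes #0 = { "target-features"="+sve" }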
Added:
llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AMDGPU/R600ISelLowering.h
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
llvm/test/CodeGen/Mips/cconv/byval.ll
llvm/test/CodeGen/Mips/cconv/vector.ll
llvm/test/CodeGen/Mips/llvm-ir/store.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9b42cc9dfb3a..e29f53dffb97 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1271,6 +1271,14 @@ class TargetLoweringBase {
getTruncStoreAction(ValVT, MemVT) == Custom);
}
+ virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+ bool LegalOnly) const {
+ if (LegalOnly)
+ return isTruncStoreLegal(ValVT, MemVT);
+
+ return isTruncStoreLegalOrCustom(ValVT, MemVT);
+ }
+
/// Return how the indexed load should be treated: either it is legal, needs
/// to be promoted to a larger size, needs to be expanded to some other code
/// sequence, or the target has a custom expander for it.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4df41ee7d8a6..98b26c5d045b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18089,10 +18089,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
// truncating store. We can do this even if this is already a truncstore.
- if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
- && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- ST->getMemoryVT())) {
+ if ((Value.getOpcode() == ISD::FP_ROUND ||
+ Value.getOpcode() == ISD::TRUNCATE) &&
+ Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+ TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+ ST->getMemoryVT(), LegalOperations)) {
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
Ptr, ST->getMemoryVT(), ST->getMemOperand());
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ca2a70e4676e..e1bc274569ba 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1241,6 +1241,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ // SVE supports truncating stores of 64 and 128-bit vectors
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+
for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1487,6 +1494,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setCondCodeAction(ISD::SETUNE, VT, Expand);
}
+ // Mark integer truncating stores as having custom lowering
+ if (VT.isInteger()) {
+ MVT InnerVT = VT.changeVectorElementType(MVT::i8);
+ while (InnerVT != VT) {
+ setTruncStoreAction(VT, InnerVT, Custom);
+ InnerVT = InnerVT.changeVectorElementType(
+ MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
+ }
+ }
+
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);
@@ -4530,7 +4547,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
- if (useSVEForFixedLengthVectorVT(VT))
+ if (useSVEForFixedLengthVectorVT(VT, true))
return LowerFixedLengthVectorStoreToSVE(Op, DAG);
unsigned AS = StoreNode->getAddressSpace();
@@ -4542,7 +4559,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return scalarizeVectorStore(StoreNode, DAG);
}
- if (StoreNode->isTruncatingStore()) {
+ if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
+ MemVT == MVT::v4i8) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
@@ -15122,6 +15140,30 @@ static bool performTBISimplification(SDValue Addr,
return false;
}
+static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
+ auto OpCode = N->getOpcode();
+ assert(OpCode == ISD::STORE ||
+ OpCode == ISD::MSTORE && "Expected STORE dag node in input!");
+
+ if (auto Store = dyn_cast<StoreSDNode>(N)) {
+ if (!Store->isTruncatingStore() || Store->isIndexed())
+ return SDValue();
+ SDValue Ext = Store->getValue();
+ auto ExtOpCode = Ext.getOpcode();
+ if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
+ ExtOpCode != ISD::ANY_EXTEND)
+ return SDValue();
+ SDValue Orig = Ext->getOperand(0);
+ if (Store->getMemoryVT() != Orig->getValueType(0))
+ return SDValue();
+ return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
+ Store->getBasePtr(), Store->getPointerInfo(),
+ Store->getAlign());
+ }
+
+ return SDValue();
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -15133,6 +15175,9 @@ static SDValue performSTORECombine(SDNode *N,
performTBISimplification(N->getOperand(2), DCI, DAG))
return SDValue(N, 0);
+ if (SDValue Store = foldTruncStoreOfExt(DAG, N))
+ return Store;
+
return SDValue();
}
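The foldTruncStoreOfExt combine added above handles the reverse shape:
when the value feeding a truncating store is itself a zero, sign, or any
extend of a value whose type matches the memory type, the extend and the
truncation cancel and a plain store of the original value is emitted. A
rough IR-level sketch of that shape (hypothetical, not taken from the
patch):

    ; The extend and the implied truncation cancel out; after the combine
    ; this becomes a plain <8 x i8> store of %v.
    define void @ext_trunc_store_example(<8 x i8> %v, <8 x i8>* %dst) {
      %e = zext <8 x i8> %v to <8 x i32>
      %t = trunc <8 x i32> %e to <8 x i8>
      store <8 x i8> %t, <8 x i8>* %dst
      ret void
    }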
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 9813dd0fb504..920cf3cd97ef 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -54,6 +54,15 @@ class R600TargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
+ virtual bool canCombineTruncStore(EVT ValVT, EVT MemVT,
+ bool LegalOperations) const override {
+ // R600 has "custom" lowering for truncating stores despite not supporting
+ // those instructions. If we allow that custom lowering in the DAG combiner
+ // then all truncates are merged into truncating stores, giving worse code
+ // generation. This hook prevents the DAG combiner performing that combine.
+ return isTruncStoreLegal(ValVT, MemVT);
+ }
+
private:
unsigned Gen;
/// Each OpenCL kernel has nine implicit parameters that are stored in the
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index d8b08461b6ee..5ee33342460c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -36,10 +36,7 @@ define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ld1sb { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strb [[RES_LO]], [x0]
-; CHECK-NEXT: strb [[RES_HI]], [x0, #1]
+; CHECK-NEXT: st1b { z[[XTN]].s }, [[PG0]], [x0]
; CHECK-NEXT: ret
%cval = load <2 x i8>, <2 x i8>* %a
%ptrs = load <2 x i8*>, <2 x i8*>* %b
@@ -61,8 +58,7 @@ define void @masked_gather_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
; CHECK-NEXT: ld1sb { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d]
; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
-; CHECK-NEXT: uzp1 v[[UZP3:[0-9]+]].8b, v[[UZP2]].8b, v[[UZP2]].8b
-; CHECK-NEXT: str s[[UZP3]], [x0]
+; CHECK-NEXT: st1b { z[[UZP2]].h }, [[PG0]], [x0]
; CHECK-NEXT: ret
%cval = load <4 x i8>, <4 x i8>* %a
%ptrs = load <4 x i8*>, <4 x i8*>* %b
@@ -178,10 +174,7 @@ define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ld1sh { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d]
; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d
-; CHECK-NEXT: mov [[RES_HI:w[0-9]+]], v[[XTN]].s[1]
-; CHECK-NEXT: fmov [[RES_LO:w[0-9]+]], s[[XTN]]
-; CHECK-NEXT: strh [[RES_LO]], [x0]
-; CHECK-NEXT: strh [[RES_HI]], [x0, #2]
+; CHECK-NEXT: st1h { z[[RES]].s }, [[PG0]], [x0]
; CHECK-NEXT: ret
%cval = load <2 x i16>, <2 x i16>* %a
%ptrs = load <2 x i16*>, <2 x i16*>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
new file mode 100644
index 000000000000..5db9d4008d56
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -0,0 +1,218 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+define void @store_trunc_v2i64i8(<2 x i64>* %ap, <2 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v2i64i8
+; CHECK: ldr q[[Q0:[0-9]+]], [x0]
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl2
+; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+ %a = load <2 x i64>, <2 x i64>* %ap
+ %val = trunc <2 x i64> %a to <2 x i8>
+ store <2 x i8> %val, <2 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v4i64i8(<4 x i64>* %ap, <4 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v4i64i8
+; CHECK: ptrue p[[P0:[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z[[Q0]].d }, p[[P0]], [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+ %a = load <4 x i64>, <4 x i64>* %ap
+ %val = trunc <4 x i64> %a to <4 x i8>
+ store <4 x i8> %val, <4 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1b { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <8 x i64>, <8 x i64>* %ap
+ %val = trunc <8 x i64> %a to <8 x i8>
+ store <8 x i8> %val, <8 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v16i64i8(<16 x i64>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i64i8:
+; VBITS_GE_1024: ptrue p[[P0:[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: ret
+ %a = load <16 x i64>, <16 x i64>* %ap
+ %val = trunc <16 x i64> %a to <16 x i8>
+ store <16 x i8> %val, <16 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v32i64i8(<32 x i64>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i64i8:
+; VBITS_GE_2048: ptrue p[[P0:[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: st1b { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: ret
+ %a = load <32 x i64>, <32 x i64>* %ap
+ %val = trunc <32 x i64> %a to <32 x i8>
+ store <32 x i8> %val, <32 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG: str q[[V0]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <8 x i64>, <8 x i64>* %ap
+ %val = trunc <8 x i64> %a to <8 x i16>
+ store <8 x i16> %val, <8 x i16>* %dest
+ ret void
+}
+
+define void @store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i32>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v8i64i32:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ld1d { [[Z0:z[0-9]+]].d }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1d { [[Z1:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl4
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].s, [[Z0]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].s, [[Z1]].s, [[Z1]].s
+; VBITS_EQ_256-DAG: splice [[Z1]].s, [[PG]], [[Z1]].s, [[Z0]].s
+; VBITS_EQ_256-DAG: ptrue [[PG]].s, vl8
+; VBITS_EQ_256-DAG: st1w { [[Z1]].s }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <8 x i64>, <8 x i64>* %ap
+ %val = trunc <8 x i64> %a to <8 x i32>
+ store <8 x i32> %val, <8 x i32>* %dest
+ ret void
+}
+
+define void @store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; Currently does not use the truncating store
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: mov v[[V0:[0-9]+]].d[1], v{{[0-9]+}}.d[0]
+; VBITS_EQ_256-DAG: str q[[V0]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <16 x i32>, <16 x i32>* %ap
+ %val = trunc <16 x i32> %a to <16 x i8>
+ store <16 x i8> %val, <16 x i8>* %dest
+ ret void
+}
+
+define void @store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i16>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v16i32i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_EQ_256-DAG: ld1w { [[Z0:z[0-9]+]].s }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1w { [[Z1:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl8
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].h, [[Z0]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].h, [[Z1]].h, [[Z1]].h
+; VBITS_EQ_256-DAG: splice [[Z1]].h, [[PG]], [[Z1]].h, [[Z0]].h
+; VBITS_EQ_256-DAG: ptrue [[PG]].h, vl16
+; VBITS_EQ_256-DAG: st1h { [[Z1]].h }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <16 x i32>, <16 x i32>* %ap
+ %val = trunc <16 x i32> %a to <16 x i16>
+ store <16 x i16> %val, <16 x i16>* %dest
+ ret void
+}
+
+define void @store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i8>* %dest) #0 {
+; CHECK-LABEL: store_trunc_v32i16i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P0]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
+; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x8]
+; VBITS_EQ_256-DAG: ld1h { [[Z1:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl16
+; VBITS_EQ_256-DAG: uzp1 [[Z0]].b, [[Z0]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: uzp1 [[Z1]].b, [[Z1]].b, [[Z1]].b
+; VBITS_EQ_256-DAG: splice [[Z1]].b, [[PG]], [[Z1]].b, [[Z0]].b
+; VBITS_EQ_256-DAG: ptrue [[PG]].b, vl32
+; VBITS_EQ_256-DAG: st1b { [[Z1]].b }, [[PG]], [x1]
+; VBITS_EQ_256-DAG: ret
+ %a = load <32 x i16>, <32 x i16>* %ap
+ %val = trunc <32 x i16> %a to <32 x i8>
+ store <32 x i8> %val, <32 x i8>* %dest
+ ret void
+}
+
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/Mips/cconv/byval.ll b/llvm/test/CodeGen/Mips/cconv/byval.ll
index 180563d1ec94..5d77107d5966 100644
--- a/llvm/test/CodeGen/Mips/cconv/byval.ll
+++ b/llvm/test/CodeGen/Mips/cconv/byval.ll
@@ -255,10 +255,9 @@ define dso_local void @g2(%struct.S1* %a) {
; N32-NEXT: .cfi_offset 31, -8
; N32-NEXT: .cfi_offset 16, -16
; N32-NEXT: move $5, $4
-; N32-NEXT: sll $1, $5, 0
-; N32-NEXT: lui $2, 1
-; N32-NEXT: addu $2, $sp, $2
-; N32-NEXT: sw $1, -4($2)
+; N32-NEXT: lui $1, 1
+; N32-NEXT: addu $1, $sp, $1
+; N32-NEXT: sw $4, -4($1)
; N32-NEXT: addiu $16, $sp, 8
; N32-NEXT: ori $6, $zero, 65520
; N32-NEXT: jal memcpy
@@ -389,10 +388,8 @@ define dso_local i32 @g3(%struct.S1* %a, %struct.S1* %b) #0 {
; N32-NEXT: .cfi_def_cfa_offset 16
; N32-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
; N32-NEXT: .cfi_offset 31, -8
-; N32-NEXT: sll $1, $5, 0
-; N32-NEXT: sw $1, 0($sp)
-; N32-NEXT: sll $1, $4, 0
-; N32-NEXT: sw $1, 4($sp)
+; N32-NEXT: sw $5, 0($sp)
+; N32-NEXT: sw $4, 4($sp)
; N32-NEXT: jal memcpy
; N32-NEXT: ori $6, $zero, 65520
; N32-NEXT: addiu $2, $zero, 4
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 723a71b52fed..2187a9b9bf6a 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -554,10 +554,8 @@ define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
; MIPS64R5: # %bb.0:
; MIPS64R5-NEXT: daddiu $sp, $sp, -16
; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sll $1, $5, 0
-; MIPS64R5-NEXT: sw $1, 8($sp)
-; MIPS64R5-NEXT: sll $1, $4, 0
-; MIPS64R5-NEXT: sw $1, 12($sp)
+; MIPS64R5-NEXT: sw $5, 8($sp)
+; MIPS64R5-NEXT: sw $4, 12($sp)
; MIPS64R5-NEXT: lbu $1, 9($sp)
; MIPS64R5-NEXT: lbu $2, 8($sp)
; MIPS64R5-NEXT: insert.w $w0[0], $2
@@ -1263,10 +1261,8 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
; MIPS64R5: # %bb.0:
; MIPS64R5-NEXT: daddiu $sp, $sp, -16
; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sll $1, $5, 0
-; MIPS64R5-NEXT: sw $1, 8($sp)
-; MIPS64R5-NEXT: sll $1, $4, 0
-; MIPS64R5-NEXT: sw $1, 12($sp)
+; MIPS64R5-NEXT: sw $5, 8($sp)
+; MIPS64R5-NEXT: sw $4, 12($sp)
; MIPS64R5-NEXT: lh $1, 10($sp)
; MIPS64R5-NEXT: lh $2, 8($sp)
; MIPS64R5-NEXT: insert.d $w0[0], $2
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/store.ll b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
index 975eb8b90f04..e96165d369df 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/store.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
@@ -285,65 +285,57 @@ define void @f3(i32 %a) {
;
; MIPS4-LABEL: f3:
; MIPS4: # %bb.0:
-; MIPS4-NEXT: sll $1, $4, 0 # <MCInst #{{[0-9]+}} SLL
-; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT: # <MCOperand Imm:0>>
-; MIPS4-NEXT: lui $2, %highest(c) # <MCInst #{{[0-9]+}} LUi64
+; MIPS4-NEXT: lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Expr:(%highest(c))>>
-; MIPS4-NEXT: daddiu $2, $2, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS4-NEXT: daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Expr:(%higher(c))>>
-; MIPS4-NEXT: dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS4-NEXT: dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Imm:16>>
-; MIPS4-NEXT: daddiu $2, $2, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS4-NEXT: daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Expr:(%hi(c))>>
-; MIPS4-NEXT: dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS4-NEXT: dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Imm:16>>
; MIPS4-NEXT: jr $ra # <MCInst #{{[0-9]+}} JR
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT: sw $1, %lo(c)($2) # <MCInst #{{[0-9]+}} SW
+; MIPS4-NEXT: sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS4-NEXT: # <MCOperand Expr:(%lo(c))>>
;
; MIPS64R6-LABEL: f3:
; MIPS64R6: # %bb.0:
-; MIPS64R6-NEXT: sll $1, $4, 0 # <MCInst #{{[0-9]+}} SLL
-; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT: # <MCOperand Imm:0>>
-; MIPS64R6-NEXT: lui $2, %highest(c) # <MCInst #{{[0-9]+}} LUi64
+; MIPS64R6-NEXT: lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT: daddiu $2, $2, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS64R6-NEXT: daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT: dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS64R6-NEXT: dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Imm:16>>
-; MIPS64R6-NEXT: daddiu $2, $2, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
+; MIPS64R6-NEXT: daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT: dsll $2, $2, 16 # <MCInst #{{[0-9]+}} DSLL
+; MIPS64R6-NEXT: dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Imm:16>>
; MIPS64R6-NEXT: jr $ra # <MCInst #{{[0-9]+}} JALR64
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT: sw $1, %lo(c)($2) # <MCInst #{{[0-9]+}} SW
+; MIPS64R6-NEXT: sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Reg:{{[0-9]+}}>
; MIPS64R6-NEXT: # <MCOperand Expr:(%lo(c))>>