[llvm] 1ba8f4f - [AArch64] Move v4i8 concat load lowering to a combine.

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 14 07:19:38 PDT 2022


Author: David Green
Date: 2022-04-14T15:19:33+01:00
New Revision: 1ba8f4f67dcf52cf628caa6e84c3526e936fa6b4

URL: https://github.com/llvm/llvm-project/commit/1ba8f4f67dcf52cf628caa6e84c3526e936fa6b4
DIFF: https://github.com/llvm/llvm-project/commit/1ba8f4f67dcf52cf628caa6e84c3526e936fa6b4.diff

LOG: [AArch64] Move v4i8 concat load lowering to a combine.

The existing code was not updating the uses of the loads that it recreated,
leading to incorrect chains, which could break the ordering between
nodes. This moves the code to a DAG combine instead and makes sure we
update the chain references. This does mean the transform happens earlier -
potentially before the concats are simplified - which can lead to
inefficiencies in the codegen; those will be fixed in followups.
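
For reference, a minimal IR sketch of the pattern the combine targets (the
function name is hypothetical; the shape mirrors the existing
insert-subvector.ll tests): a concat of two v4i8 loads, which is now turned
into f32 loads feeding a build vector during DAG combining, with the old
loads' chain outputs rewired to the new loads:

    define <8 x i8> @concat_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
      ; Two v4i8 loads whose concat previously had to go through v4i8
      ; load legalization, extending each element into a larger type.
      %la = load <4 x i8>, <4 x i8>* %a
      %lb = load <4 x i8>, <4 x i8>* %b
      ; The shuffle concatenating them becomes an ISD::CONCAT_VECTORS node.
      %c = shufflevector <4 x i8> %la, <4 x i8> %lb,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7>
      ret <8 x i8> %c
    }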

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/insert-extend.ll
    llvm/test/CodeGen/AArch64/insert-subvector.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cc6aa4ee5a2b2..32a35124ceb79 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1091,8 +1091,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
-
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8, Custom);
   }
 
   if (Subtarget->hasSVE()) {
@@ -11121,40 +11119,6 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
 
-  if (Op.getOperand(0).getValueType() == MVT::v4i8) {
-    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
-    // loads to prevent having to go through the v4i8 load legalization that
-    // needs to extend each element into a larger type.
-    if (Op.getNumOperands() % 2 == 0 && all_of(Op->op_values(), [](SDValue V) {
-          return V.getValueType() == MVT::v4i8 &&
-                 (V.getOpcode() == ISD::LOAD || V.isUndef());
-        })) {
-      EVT NVT =
-          EVT::getVectorVT(*DAG.getContext(), MVT::f32, Op.getNumOperands());
-      SmallVector<SDValue> Ops;
-      SDLoc DL(Op);
-
-      for (unsigned i = 0; i < Op.getNumOperands(); i++) {
-        SDValue V = Op.getOperand(i);
-        if (V.isUndef())
-          Ops.push_back(DAG.getUNDEF(MVT::f32));
-        else {
-          LoadSDNode *LD = cast<LoadSDNode>(V);
-          if (!LD->isSimple() || LD->isIndexed() ||
-              LD->getExtensionType() != ISD::NON_EXTLOAD)
-            return SDValue();
-          Ops.push_back(DAG.getLoad(MVT::f32, DL, LD->getChain(),
-                                    LD->getBasePtr(), LD->getMemOperand()));
-        }
-      }
-      return DAG.getBitcast(Op.getValueType(),
-                            DAG.getBuildVector(NVT, DL, Ops));
-    }
-
-    // Let the default expansion happen
-    return SDValue();
-  }
-
   assert(Op.getValueType().isScalableVector() &&
          isTypeLegal(Op.getValueType()) &&
          "Expected legal scalable vector type!");
@@ -14734,6 +14698,42 @@ static SDValue performConcatVectorsCombine(SDNode *N,
     }
   }
 
+  if (N->getOperand(0).getValueType() == MVT::v4i8) {
+    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
+    // loads to prevent having to go through the v4i8 load legalization that
+    // needs to extend each element into a larger type.
+    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
+          if (V.getValueType() != MVT::v4i8)
+            return false;
+          if (V.isUndef())
+            return true;
+          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
+          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
+                 LD->getExtensionType() == ISD::NON_EXTLOAD;
+        })) {
+      EVT NVT =
+          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+      SmallVector<SDValue> Ops;
+
+      for (unsigned i = 0; i < N->getNumOperands(); i++) {
+        SDValue V = N->getOperand(i);
+        if (V.isUndef())
+          Ops.push_back(DAG.getUNDEF(MVT::f32));
+        else {
+          LoadSDNode *LD = cast<LoadSDNode>(V);
+          SDValue NewLoad =
+              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
+                          LD->getMemOperand());
+          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+          Ops.push_back(NewLoad);
+        }
+      }
+      return DAG.getBitcast(N->getValueType(0),
+                            DAG.getBuildVector(NVT, dl, Ops));
+    }
+  }
+
+
   // Wait 'til after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())

diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index 49ec069986c3f..8aae004d78a77 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -47,130 +47,136 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
 ; CHECK-LABEL: large:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    sxtw x9, w1
 ; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
-; CHECK-NEXT:    sxtw x11, w3
-; CHECK-NEXT:    add x9, x0, x8
-; CHECK-NEXT:    add x12, x2, x11
-; CHECK-NEXT:    add x10, x9, x8
-; CHECK-NEXT:    add x13, x12, x11
-; CHECK-NEXT:    add x8, x10, x8
-; CHECK-NEXT:    add x11, x13, x11
-; CHECK-NEXT:    ldp s1, s5, [x9]
-; CHECK-NEXT:    ldp s0, s4, [x8]
+; CHECK-NEXT:    sxtw x12, w3
+; CHECK-NEXT:    add x8, x0, x9
+; CHECK-NEXT:    add x10, x8, x9
+; CHECK-NEXT:    add x11, x10, x9
+; CHECK-NEXT:    add x9, x2, x12
+; CHECK-NEXT:    add x13, x9, x12
+; CHECK-NEXT:    add x12, x13, x12
+; CHECK-NEXT:    ldp s0, s2, [x11]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x10], #4
-; CHECK-NEXT:    ld1 { v1.s }[1], [x0], #4
-; CHECK-NEXT:    ldp s2, s6, [x11]
-; CHECK-NEXT:    ldp s3, s7, [x12]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x8], #4
+; CHECK-NEXT:    ld1 { v0.s }[3], [x0], #4
+; CHECK-NEXT:    ldp s1, s3, [x12]
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ld1 { v1.s }[1], [x13], #4
+; CHECK-NEXT:    ld1 { v1.s }[2], [x9], #4
+; CHECK-NEXT:    ld1 { v1.s }[3], [x2], #4
+; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x13]
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ld1 { v2.s }[1], [x13], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x2], #4
-; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x0]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x13]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x2]
+; CHECK-NEXT:    ld1 { v2.s }[2], [x8]
+; CHECK-NEXT:    ld1 { v3.s }[2], [x9]
+; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    ld1 { v2.s }[3], [x0]
+; CHECK-NEXT:    ld1 { v3.s }[3], [x2]
+; CHECK-NEXT:    usubl v6.4s, v0.4h, v1.4h
+; CHECK-NEXT:    usubl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT:    usubl v1.4s, v4.4h, v5.4h
+; CHECK-NEXT:    usubl2 v4.4s, v4.8h, v5.8h
+; CHECK-NEXT:    ushll v5.8h, v2.8b, #0
+; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    usubl2 v16.4s, v5.8h, v7.8h
+; CHECK-NEXT:    usubl v5.4s, v5.4h, v7.4h
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    usubl v16.4s, v0.4h, v2.4h
-; CHECK-NEXT:    usubl2 v0.4s, v0.8h, v2.8h
-; CHECK-NEXT:    usubl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    usubl2 v1.4s, v1.8h, v3.8h
-; CHECK-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-NEXT:    ushll v5.8h, v6.8b, #0
-; CHECK-NEXT:    ushll v6.8h, v7.8b, #0
-; CHECK-NEXT:    usubl2 v7.4s, v3.8h, v5.8h
-; CHECK-NEXT:    usubl v3.4s, v3.4h, v5.4h
-; CHECK-NEXT:    usubl2 v5.4s, v4.8h, v6.8h
-; CHECK-NEXT:    usubl v4.4s, v4.4h, v6.4h
-; CHECK-NEXT:    shl v3.4s, v3.4s, #16
-; CHECK-NEXT:    shl v6.4s, v7.4s, #16
+; CHECK-NEXT:    shl v7.4s, v16.4s, #16
+; CHECK-NEXT:    usubl2 v16.4s, v2.8h, v3.8h
 ; CHECK-NEXT:    shl v5.4s, v5.4s, #16
-; CHECK-NEXT:    shl v4.4s, v4.4s, #16
-; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    add v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
-; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    rev64 v4.4s, v0.4s
-; CHECK-NEXT:    rev64 v5.4s, v3.4s
-; CHECK-NEXT:    rev64 v6.4s, v1.4s
-; CHECK-NEXT:    rev64 v7.4s, v2.4s
-; CHECK-NEXT:    add v16.4s, v0.4s, v4.4s
-; CHECK-NEXT:    add v17.4s, v3.4s, v5.4s
-; CHECK-NEXT:    add v18.4s, v1.4s, v6.4s
-; CHECK-NEXT:    add v19.4s, v2.4s, v7.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v6.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v7.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
-; CHECK-NEXT:    trn2 v4.4s, v16.4s, v17.4s
-; CHECK-NEXT:    trn2 v5.4s, v19.4s, v18.4s
-; CHECK-NEXT:    zip1 v7.4s, v2.4s, v1.4s
-; CHECK-NEXT:    trn2 v20.4s, v17.4s, v16.4s
-; CHECK-NEXT:    zip1 v6.4s, v0.4s, v3.4s
+; CHECK-NEXT:    usubl v2.4s, v2.4h, v3.4h
+; CHECK-NEXT:    add v3.4s, v5.4s, v6.4s
+; CHECK-NEXT:    shl v5.4s, v16.4s, #16
+; CHECK-NEXT:    shl v2.4s, v2.4s, #16
+; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v4.4s
+; CHECK-NEXT:    rev64 v6.4s, v3.4s
+; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    rev64 v4.4s, v1.4s
+; CHECK-NEXT:    rev64 v5.4s, v2.4s
+; CHECK-NEXT:    add v17.4s, v3.4s, v6.4s
+; CHECK-NEXT:    add v16.4s, v0.4s, v7.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    sub v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    add v18.4s, v2.4s, v5.4s
+; CHECK-NEXT:    add v19.4s, v1.4s, v4.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    trn2 v6.4s, v16.4s, v17.4s
+; CHECK-NEXT:    zip2 v7.4s, v0.4s, v3.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    trn2 v3.4s, v17.4s, v16.4s
+; CHECK-NEXT:    trn2 v4.4s, v19.4s, v18.4s
+; CHECK-NEXT:    zip1 v5.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    zip2 v18.4s, v19.4s, v18.4s
-; CHECK-NEXT:    ext v5.16b, v19.16b, v5.16b, #8
-; CHECK-NEXT:    ext v16.16b, v4.16b, v16.16b, #8
-; CHECK-NEXT:    ext v7.16b, v2.16b, v7.16b, #8
-; CHECK-NEXT:    ext v17.16b, v20.16b, v17.16b, #8
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mov v2.s[3], v1.s[2]
-; CHECK-NEXT:    mov v4.d[1], v5.d[1]
-; CHECK-NEXT:    mov v6.d[1], v7.d[1]
+; CHECK-NEXT:    ext v17.16b, v3.16b, v17.16b, #8
+; CHECK-NEXT:    ext v16.16b, v6.16b, v16.16b, #8
+; CHECK-NEXT:    ext v4.16b, v19.16b, v4.16b, #8
+; CHECK-NEXT:    ext v5.16b, v1.16b, v5.16b, #8
+; CHECK-NEXT:    mov v1.s[3], v2.s[2]
 ; CHECK-NEXT:    mov v17.d[1], v18.d[1]
-; CHECK-NEXT:    mov v20.d[1], v5.d[1]
-; CHECK-NEXT:    mov v0.d[1], v2.d[1]
 ; CHECK-NEXT:    mov v16.d[1], v18.d[1]
-; CHECK-NEXT:    add v1.4s, v17.4s, v20.4s
-; CHECK-NEXT:    add v2.4s, v0.4s, v6.4s
-; CHECK-NEXT:    sub v3.4s, v4.4s, v16.4s
-; CHECK-NEXT:    sub v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    rev64 v4.4s, v1.4s
-; CHECK-NEXT:    rev64 v5.4s, v3.4s
-; CHECK-NEXT:    rev64 v6.4s, v0.4s
-; CHECK-NEXT:    rev64 v7.4s, v2.4s
-; CHECK-NEXT:    add v16.4s, v1.4s, v4.4s
-; CHECK-NEXT:    add v17.4s, v3.4s, v5.4s
-; CHECK-NEXT:    add v18.4s, v0.4s, v6.4s
-; CHECK-NEXT:    add v19.4s, v2.4s, v7.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v7.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    ext v4.16b, v2.16b, v19.16b, #12
-; CHECK-NEXT:    ext v5.16b, v0.16b, v18.16b, #12
-; CHECK-NEXT:    ext v7.16b, v3.16b, v17.16b, #12
+; CHECK-NEXT:    mov v6.d[1], v4.d[1]
+; CHECK-NEXT:    mov v3.d[1], v4.d[1]
+; CHECK-NEXT:    mov v0.d[1], v5.d[1]
+; CHECK-NEXT:    mov v7.d[1], v1.d[1]
+; CHECK-NEXT:    sub v1.4s, v6.4s, v16.4s
+; CHECK-NEXT:    add v2.4s, v17.4s, v3.4s
+; CHECK-NEXT:    rev64 v3.4s, v1.4s
+; CHECK-NEXT:    add v5.4s, v7.4s, v0.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    rev64 v4.4s, v2.4s
+; CHECK-NEXT:    rev64 v6.4s, v5.4s
+; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    add v16.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v17.4s, v0.4s, v7.4s
+; CHECK-NEXT:    add v18.4s, v5.4s, v6.4s
+; CHECK-NEXT:    sub v5.4s, v5.4s, v6.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    ext v16.16b, v1.16b, v16.16b, #12
+; CHECK-NEXT:    ext v6.16b, v5.16b, v18.16b, #12
+; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    ext v4.16b, v0.16b, v17.16b, #12
+; CHECK-NEXT:    rev64 v3.4s, v3.4s
+; CHECK-NEXT:    ext v7.16b, v16.16b, v1.16b, #4
+; CHECK-NEXT:    ext v17.16b, v6.16b, v5.16b, #4
+; CHECK-NEXT:    ext v18.16b, v6.16b, v6.16b, #8
+; CHECK-NEXT:    ext v19.16b, v4.16b, v0.16b, #4
+; CHECK-NEXT:    ext v20.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT:    ext v21.16b, v16.16b, v16.16b, #8
 ; CHECK-NEXT:    rev64 v16.4s, v16.4s
-; CHECK-NEXT:    ext v6.16b, v4.16b, v2.16b, #4
-; CHECK-NEXT:    ext v17.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT:    ext v18.16b, v5.16b, v0.16b, #4
-; CHECK-NEXT:    ext v19.16b, v5.16b, v5.16b, #8
-; CHECK-NEXT:    ext v20.16b, v7.16b, v3.16b, #4
-; CHECK-NEXT:    ext v21.16b, v7.16b, v7.16b, #8
-; CHECK-NEXT:    rev64 v7.4s, v7.4s
-; CHECK-NEXT:    trn2 v1.4s, v16.4s, v1.4s
-; CHECK-NEXT:    rev64 v5.4s, v5.4s
-; CHECK-NEXT:    rev64 v4.4s, v4.4s
-; CHECK-NEXT:    ext v6.16b, v6.16b, v17.16b, #12
-; CHECK-NEXT:    ext v17.16b, v18.16b, v19.16b, #12
-; CHECK-NEXT:    ext v18.16b, v20.16b, v21.16b, #12
-; CHECK-NEXT:    ext v3.16b, v7.16b, v3.16b, #4
-; CHECK-NEXT:    ext v7.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v0.16b, v5.16b, v0.16b, #4
-; CHECK-NEXT:    ext v2.16b, v4.16b, v2.16b, #4
-; CHECK-NEXT:    add v4.4s, v18.4s, v3.4s
-; CHECK-NEXT:    add v5.4s, v1.4s, v7.4s
-; CHECK-NEXT:    add v16.4s, v17.4s, v0.4s
-; CHECK-NEXT:    add v19.4s, v6.4s, v2.4s
-; CHECK-NEXT:    sub v3.4s, v18.4s, v3.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v7.4s
-; CHECK-NEXT:    sub v2.4s, v6.4s, v2.4s
-; CHECK-NEXT:    sub v0.4s, v17.4s, v0.4s
-; CHECK-NEXT:    mov v19.d[1], v2.d[1]
+; CHECK-NEXT:    trn2 v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    rev64 v3.4s, v4.4s
+; CHECK-NEXT:    rev64 v4.4s, v6.4s
+; CHECK-NEXT:    ext v17.16b, v17.16b, v18.16b, #12
+; CHECK-NEXT:    ext v18.16b, v19.16b, v20.16b, #12
+; CHECK-NEXT:    ext v7.16b, v7.16b, v21.16b, #12
+; CHECK-NEXT:    ext v1.16b, v16.16b, v1.16b, #4
+; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v0.16b, v3.16b, v0.16b, #4
+; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #4
+; CHECK-NEXT:    add v4.4s, v7.4s, v1.4s
+; CHECK-NEXT:    add v5.4s, v2.4s, v6.4s
+; CHECK-NEXT:    add v16.4s, v18.4s, v0.4s
+; CHECK-NEXT:    add v19.4s, v17.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v7.4s, v1.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sub v3.4s, v17.4s, v3.4s
+; CHECK-NEXT:    sub v0.4s, v18.4s, v0.4s
+; CHECK-NEXT:    mov v19.d[1], v3.d[1]
 ; CHECK-NEXT:    mov v16.d[1], v0.d[1]
-; CHECK-NEXT:    mov v4.d[1], v3.d[1]
-; CHECK-NEXT:    mov v5.d[1], v1.d[1]
+; CHECK-NEXT:    mov v4.d[1], v1.d[1]
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
 ; CHECK-NEXT:    movi v0.8h, #1
 ; CHECK-NEXT:    movi v7.2d, #0x00ffff0000ffff
 ; CHECK-NEXT:    ushr v1.4s, v4.4s, #15

diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index 3e2aeb0944786..ed4c0c5958861 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -621,9 +621,13 @@ define <16 x i8> @load2multi1_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
 define <16 x i8> @load2multi2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
 ; CHECK-LABEL: load2multi2_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1r { v0.2s }, [x0]
-; CHECK-NEXT:    ld1r { v1.2s }, [x1]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr s0, [x1]
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    mov v1.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
   %la = load <4 x i8>, <4 x i8> *%a
   %lb = load <4 x i8>, <4 x i8> *%b
@@ -632,3 +636,44 @@ define <16 x i8> @load2multi2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
   %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %s3
 }
+
+define void @loads_before_stores(i8* %i44) {
+; CHECK-LABEL: loads_before_stores:
+; CHECK:       // %bb.0: // %bb
+; CHECK-NEXT:    add x8, x0, #20
+; CHECK-NEXT:    ldr s0, [x0, #28]
+; CHECK-NEXT:    ldrh w9, [x0, #26]
+; CHECK-NEXT:    ldrh w10, [x0, #24]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
+; CHECK-NEXT:    strh w9, [x0, #20]
+; CHECK-NEXT:    strh w10, [x0, #30]
+; CHECK-NEXT:    stur d0, [x0, #22]
+; CHECK-NEXT:    ret
+bb:
+  %i45 = getelementptr inbounds i8, i8* %i44, i64 20
+  %i46 = getelementptr inbounds i8, i8* %i44, i64 26
+  %i48 = load i8, i8* %i46, align 1
+  %i49 = getelementptr inbounds i8, i8* %i44, i64 21
+  %i50 = getelementptr inbounds i8, i8* %i44, i64 27
+  %i52 = load i8, i8* %i50, align 1
+  %i53 = getelementptr inbounds i8, i8* %i44, i64 22
+  %i54 = getelementptr inbounds i8, i8* %i44, i64 28
+  %i61 = getelementptr inbounds i8, i8* %i44, i64 24
+  %i62 = getelementptr inbounds i8, i8* %i44, i64 30
+  %i63 = load i8, i8* %i61, align 1
+  %i65 = getelementptr inbounds i8, i8* %i44, i64 25
+  %i66 = getelementptr inbounds i8, i8* %i44, i64 31
+  %i67 = load i8, i8* %i65, align 1
+  %0 = bitcast i8* %i45 to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  store i8 %i48, i8* %i45, align 1
+  store i8 %i52, i8* %i49, align 1
+  %2 = bitcast i8* %i54 to <4 x i8>*
+  %3 = load <4 x i8>, <4 x i8>* %2, align 1
+  store i8 %i63, i8* %i62, align 1
+  %4 = shufflevector <4 x i8> %3, <4 x i8> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %5 = bitcast i8* %i53 to <8 x i8>*
+  store <8 x i8> %4, <8 x i8>* %5, align 1
+  store i8 %i67, i8* %i66, align 1
+  ret void
+}


        

