[llvm] 67d4dd5 - [AArch64][GlobalISel] Select @llvm.aarch64.neon.ld4.*

Jessica Paquette via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 24 09:04:01 PDT 2021


Author: Jessica Paquette
Date: 2021-08-24T09:03:49-07:00
New Revision: 67d4dd5c0713681cec2e913d388d44d09928cc33

URL: https://github.com/llvm/llvm-project/commit/67d4dd5c0713681cec2e913d388d44d09928cc33
DIFF: https://github.com/llvm/llvm-project/commit/67d4dd5c0713681cec2e913d388d44d09928cc33.diff

LOG: [AArch64][GlobalISel] Select @llvm.aarch64.neon.ld4.*

Reuse the selection code from the ld2 case. This is similar to how SDAG handles
things in AArch64ISelDAGToDAG (see SelectLoad).

This intrinsic fell back to SelectionDAG ~100 times while building clang with GISel enabled for AArch64.

Factoring out the gross subreg copy part ought to make selecting the rest of
this family fairly easy.

Differential Revision: https://reviews.llvm.org/D108600
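
For anyone picking up the rest of the family: with the helper in place, an ld3
case would presumably be just another type-to-opcode switch followed by a call
to selectVectorLoadIntrinsic, mirroring the ld2/ld4 cases in the diff below.
A minimal sketch only (not part of this commit); the LD3Threev*/LD1Threev1d
opcode names mirror the existing AArch64 MC instructions, and S8/S16/S32/S64/P0
are the LLT constants already defined in selectIntrinsicWithSideEffects:

  case Intrinsic::aarch64_neon_ld3: {
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD3Threev8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD3Threev16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD3Threev4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD3Threev8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD3Threev2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD3Threev4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD3Threev2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Threev1d;
    else
      llvm_unreachable("Unexpected type for ld3!");
    // All of the subreg-copy boilerplate lives in the shared helper.
    selectVectorLoadIntrinsic(Opc, 3, I);
    break;
  }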

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir

Modified: 
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 59ff8659148f..8cba57e6f83f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -172,6 +172,14 @@ class AArch64InstructionSelector : public InstructionSelector {
   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
+
+  /// Helper function to select vector load intrinsics like
+  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
+  /// \p Opc is the opcode that the selected instruction should use.
+  /// \p NumVecs is the number of vector destinations for the instruction.
+  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
+  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
+                                 MachineInstr &I);
   bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                       MachineRegisterInfo &MRI);
   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -5050,6 +5058,35 @@ static unsigned findIntrinsicID(MachineInstr &I) {
   return IntrinOp->getIntrinsicID();
 }
 
+bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
+                                                           unsigned NumVecs,
+                                                           MachineInstr &I) {
+  assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+  assert(Opc && "Expected an opcode?");
+  assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
+  auto &MRI = *MIB.getMRI();
+  LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  unsigned Size = Ty.getSizeInBits();
+  assert((Size == 64 || Size == 128) &&
+         "Destination must be 64 bits or 128 bits?");
+  unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
+  auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
+  assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
+  auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
+  Load.cloneMemRefs(I);
+  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
+  Register SelectedLoadDst = Load->getOperand(0).getReg();
+  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
+    auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
+                   .addReg(SelectedLoadDst, 0, SubReg + Idx);
+    // Emit the subreg copies and immediately select them.
+    // FIXME: We should refactor our copy code into an emitCopy helper and
+    // clean up uses of this pattern elsewhere in the selector.
+    selectCopy(*Vec, TII, MRI, TRI, RBI);
+  }
+  return true;
+}
+
 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
     MachineInstr &I, MachineRegisterInfo &MRI) {
   // Find the intrinsic ID.
@@ -5087,10 +5124,7 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
         .addImm(I.getOperand(1).getImm() | ('U' << 8));
     break;
   case Intrinsic::aarch64_neon_ld2: {
-    Register Dst1 = I.getOperand(0).getReg();
-    Register Dst2 = I.getOperand(1).getReg();
-    Register Ptr = I.getOperand(3).getReg();
-    LLT Ty = MRI.getType(Dst1);
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
     unsigned Opc = 0;
     if (Ty == LLT::fixed_vector(8, S8))
       Opc = AArch64::LD2Twov8b;
@@ -5110,23 +5144,31 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
       Opc = AArch64::LD1Twov1d;
     else
       llvm_unreachable("Unexpected type for ld2!");
-    unsigned SubReg =
-        Ty.getSizeInBits() == 64 ? AArch64::dsub0 : AArch64::qsub0;
-    // This will be selected as a load into a wide register, which is broken
-    // into two vectors subregister copies.
-    auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
-    Load.cloneMemRefs(I);
-    constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
-    Register SelectedLoadDst = Load->getOperand(0).getReg();
-    // Emit the subreg copies and immediately select them.
-    // FIXME: We should refactor our copy code into an emitCopy helper and
-    // clean up uses of this pattern elsewhere in the selector.
-    auto Vec1 = MIB.buildInstr(TargetOpcode::COPY, {Dst1}, {})
-                    .addReg(SelectedLoadDst, 0, SubReg);
-    auto Vec2 = MIB.buildInstr(AArch64::COPY, {Dst2}, {})
-                    .addReg(SelectedLoadDst, 0, SubReg + 1);
-    selectCopy(*Vec1, TII, MRI, TRI, RBI);
-    selectCopy(*Vec2, TII, MRI, TRI, RBI);
+    selectVectorLoadIntrinsic(Opc, 2, I);
+    break;
+  }
+  case Intrinsic::aarch64_neon_ld4: {
+    LLT Ty = MRI.getType(I.getOperand(0).getReg());
+    unsigned Opc = 0;
+    if (Ty == LLT::fixed_vector(8, S8))
+      Opc = AArch64::LD4Fourv8b;
+    else if (Ty == LLT::fixed_vector(16, S8))
+      Opc = AArch64::LD4Fourv16b;
+    else if (Ty == LLT::fixed_vector(4, S16))
+      Opc = AArch64::LD4Fourv4h;
+    else if (Ty == LLT::fixed_vector(8, S16))
+      Opc = AArch64::LD4Fourv8h;
+    else if (Ty == LLT::fixed_vector(2, S32))
+      Opc = AArch64::LD4Fourv2s;
+    else if (Ty == LLT::fixed_vector(4, S32))
+      Opc = AArch64::LD4Fourv4s;
+    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
+      Opc = AArch64::LD4Fourv2d;
+    else if (Ty == S64 || Ty == P0)
+      Opc = AArch64::LD1Fourv1d;
+    else
+      llvm_unreachable("Unexpected type for ld4!");
+    selectVectorLoadIntrinsic(Opc, 4, I);
     break;
   }
   case Intrinsic::aarch64_neon_st2: {

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir
new file mode 100644
index 000000000000..ec956ded6eff
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-ld4.mir
@@ -0,0 +1,292 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+...
+---
+name:            LD4Fourv8b
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv8b
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv8b:%[0-9]+]]:dddd = LD4Fourv8b %ptr :: (load (<8 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv8b]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv8b]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv8b]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv8b]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<8 x s8>), %dst2:fpr(<8 x s8>), %dst3:fpr(<8 x s8>), %dst4:fpr(<8 x s8>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<8 x s64>))
+    $d0 = COPY %dst1(<8 x s8>)
+    $d1 = COPY %dst2(<8 x s8>)
+    $d2 = COPY %dst3(<8 x s8>)
+    $d3 = COPY %dst4(<8 x s8>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name:            LD4Fourv16b
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv16b
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv16b:%[0-9]+]]:qqqq = LD4Fourv16b %ptr :: (load (<16 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv16b]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv16b]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv16b]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv16b]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<16 x s8>), %dst2:fpr(<16 x s8>), %dst3:fpr(<16 x s8>), %dst4:fpr(<16 x s8>)  = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<16 x s64>))
+    $q0 = COPY %dst1(<16 x s8>)
+    $q1 = COPY %dst2(<16 x s8>)
+    $q2 = COPY %dst3(<16 x s8>)
+    $q3 = COPY %dst4(<16 x s8>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name:            LD4Fourv4h
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: LD4Fourv4h
+    ; CHECK: liveins: $x0
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv4h:%[0-9]+]]:dddd = LD4Fourv4h %ptr :: (load (<4 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv4h]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv4h]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv4h]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv4h]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<4 x s16>), %dst2:fpr(<4 x s16>), %dst3:fpr(<4 x s16>), %dst4:fpr(<4 x s16>)  = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<4 x s64>))
+    $d0 = COPY %dst1(<4 x s16>)
+    $d1 = COPY %dst2(<4 x s16>)
+    $d2 = COPY %dst3(<4 x s16>)
+    $d3 = COPY %dst4(<4 x s16>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name:            LD4Fourv8h
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv8h
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv8h:%[0-9]+]]:qqqq = LD4Fourv8h %ptr :: (load (<8 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv8h]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv8h]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv8h]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv8h]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<8 x s16>), %dst2:fpr(<8 x s16>), %dst3:fpr(<8 x s16>), %dst4:fpr(<8 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<8 x s64>))
+    $q0 = COPY %dst1(<8 x s16>)
+    $q1 = COPY %dst2(<8 x s16>)
+    $q2 = COPY %dst3(<8 x s16>)
+    $q3 = COPY %dst4(<8 x s16>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name:            LD4Fourv2s
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv2s
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2s:%[0-9]+]]:dddd = LD4Fourv2s %ptr :: (load (<2 x s64>))
+    ; CHECK: %dst1:fpr64 = COPY [[LD4Fourv2s]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD4Fourv2s]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD4Fourv2s]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD4Fourv2s]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x s32>), %dst2:fpr(<2 x s32>), %dst3:fpr(<2 x s32>), %dst4:fpr(<2 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x s64>))
+    $d0 = COPY %dst1(<2 x s32>)
+    $d1 = COPY %dst2(<2 x s32>)
+    $d2 = COPY %dst3(<2 x s32>)
+    $d3 = COPY %dst4(<2 x s32>)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name:            LD4Fourv4s
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv4s
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv4s:%[0-9]+]]:qqqq = LD4Fourv4s %ptr :: (load (<4 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv4s]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv4s]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv4s]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv4s]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<4 x s32>), %dst2:fpr(<4 x s32>), %dst3:fpr(<4 x s32>), %dst4:fpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<4 x s64>))
+    $q0 = COPY %dst1(<4 x s32>)
+    $q1 = COPY %dst2(<4 x s32>)
+    $q2 = COPY %dst3(<4 x s32>)
+    $q3 = COPY %dst4(<4 x s32>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name:            LD4Fourv2d_v2s64
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv2d_v2s64
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2d:%[0-9]+]]:qqqq = LD4Fourv2d %ptr :: (load (<2 x s64>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv2d]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv2d]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv2d]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv2d]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x s64>), %dst2:fpr(<2 x s64>), %dst3:fpr(<2 x s64>), %dst4:fpr(<2 x s64>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x s64>))
+    $q0 = COPY %dst1(<2 x s64>)
+    $q1 = COPY %dst2(<2 x s64>)
+    $q2 = COPY %dst3(<2 x s64>)
+    $q3 = COPY %dst4(<2 x s64>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name:            LD4Fourv2d_v2p0
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD4Fourv2d_v2p0
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD4Fourv2d:%[0-9]+]]:qqqq = LD4Fourv2d %ptr :: (load (<2 x p0>))
+    ; CHECK: %dst1:fpr128 = COPY [[LD4Fourv2d]].qsub0
+    ; CHECK: %dst2:fpr128 = COPY [[LD4Fourv2d]].qsub1
+    ; CHECK: %dst3:fpr128 = COPY [[LD4Fourv2d]].qsub2
+    ; CHECK: %dst4:fpr128 = COPY [[LD4Fourv2d]].qsub3
+    ; CHECK: $q0 = COPY %dst1
+    ; CHECK: $q1 = COPY %dst2
+    ; CHECK: $q2 = COPY %dst3
+    ; CHECK: $q3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(<2 x p0>), %dst2:fpr(<2 x p0>), %dst3:fpr(<2 x p0>), %dst4:fpr(<2 x p0>)  = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (<2 x p0>))
+    $q0 = COPY %dst1(<2 x p0>)
+    $q1 = COPY %dst2(<2 x p0>)
+    $q2 = COPY %dst3(<2 x p0>)
+    $q3 = COPY %dst4(<2 x p0>)
+    RET_ReallyLR implicit $q0, implicit $q1, implicit $q2, implicit $q3
+...
+---
+name:            LD1Fourv1d_s64
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD1Fourv1d_s64
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD1Fourv1d:%[0-9]+]]:dddd = LD1Fourv1d %ptr :: (load (s64))
+    ; CHECK: %dst1:fpr64 = COPY [[LD1Fourv1d]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD1Fourv1d]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD1Fourv1d]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD1Fourv1d]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(s64), %dst2:fpr(s64), %dst3:fpr(s64), %dst4:fpr(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (s64))
+    $d0 = COPY %dst1(s64)
+    $d1 = COPY %dst2(s64)
+    $d2 = COPY %dst3(s64)
+    $d3 = COPY %dst4(s64)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+...
+---
+name:            LD1Fourv1d_p0
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2
+    ; CHECK-LABEL: name: LD1Fourv1d_p0
+    ; CHECK: liveins: $x0, $x1, $x2
+    ; CHECK: %ptr:gpr64sp = COPY $x0
+    ; CHECK: [[LD1Fourv1d:%[0-9]+]]:dddd = LD1Fourv1d %ptr :: (load (p0))
+    ; CHECK: %dst1:fpr64 = COPY [[LD1Fourv1d]].dsub0
+    ; CHECK: %dst2:fpr64 = COPY [[LD1Fourv1d]].dsub1
+    ; CHECK: %dst3:fpr64 = COPY [[LD1Fourv1d]].dsub2
+    ; CHECK: %dst4:fpr64 = COPY [[LD1Fourv1d]].dsub3
+    ; CHECK: $d0 = COPY %dst1
+    ; CHECK: $d1 = COPY %dst2
+    ; CHECK: $d2 = COPY %dst3
+    ; CHECK: $d3 = COPY %dst4
+    ; CHECK: RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3
+    %ptr:gpr(p0) = COPY $x0
+    %dst1:fpr(p0), %dst2:fpr(p0), %dst3:fpr(p0), %dst4:fpr(p0) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.ld4), %ptr(p0) :: (load (p0))
+    $d0 = COPY %dst1(p0)
+    $d1 = COPY %dst2(p0)
+    $d2 = COPY %dst3(p0)
+    $d3 = COPY %dst4(p0)
+    RET_ReallyLR implicit $d0, implicit $d1, implicit $d2, implicit $d3


        

