[llvm] de75e50 - [ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. (#79287)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 31 06:08:08 PST 2024
Author: Alfie Richards
Date: 2024-01-31T14:08:02Z
New Revision: de75e5079ae1d4894c918fd452e468fb6a888be1
URL: https://github.com/llvm/llvm-project/commit/de75e5079ae1d4894c918fd452e468fb6a888be1
DIFF: https://github.com/llvm/llvm-project/commit/de75e5079ae1d4894c918fd452e468fb6a888be1.diff
LOG: [ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. (#79287)
This ensures that the odd and even pseudo instructions are allocated to the
same register range.
This fixes #71763.
Added:
Modified:
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
llvm/lib/Target/ARM/ARMInstrNEON.td
llvm/test/CodeGen/ARM/arm-vlddup-update.ll
llvm/test/CodeGen/ARM/arm-vlddup.ll
llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2f9236bb977fc..f0b69b0b09809 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -640,12 +640,9 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if (!IsVLD2DUP) {
- if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
- RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
- RegSpc == SingleHighTSpc)
- SrcOpIdx = OpIdx++;
- }
+ if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || RegSpc == SingleLowSpc ||
+ RegSpc == SingleHighQSpc || RegSpc == SingleHighTSpc)
+ SrcOpIdx = OpIdx++;
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e99ee299412a5..20dd3e7baf849 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3032,11 +3032,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
}
if (is64BitVector || NumVecs == 1) {
// Double registers and VLD1 quad registers are directly supported.
- } else if (NumVecs == 2) {
- const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain};
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
- MVT::Other, OpsA);
- Chain = SDValue(VLdA, 1);
} else {
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index f31e1e9f97892..f160c031d843a 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -1491,12 +1491,26 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
-def VLD2DUPq8EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+// Duplicate of VLDQQPseudo but with a constraint variable
+// to ensure the odd and even lanes use the same register range
+class VLDQQPseudoInputDST<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr, QQPR: $src), itin,
+ "$src = $dst">;
+class VLDQQWBPseudoInputDST<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR: $src), itin,
+ "$addr.addr = $wb, $src = $dst">;
+class VLDQQWBfixedPseudoInputDST<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, QQPR: $src), itin,
+ "$addr.addr = $wb, $src = $dst">;
+
+def VLD2DUPq8EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// ...with address register writeback:
multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1534,12 +1548,12 @@ defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
-def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
class VLD3DUP<bits<4> op7_4, string Dt>
diff --git a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
index d4b2f0203bde7..875c4e0d3777f 100644
--- a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
+++ b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi --float-abi=hard -verify-machineinstrs \
; RUN: -asm-verbose=false | FileCheck %s
%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
@@ -59,6 +60,10 @@ declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr, i32)
define ptr @test_vld2_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u16_update:
; CHECK: vld2.16 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x2_t %tmp, ptr %dest, align 8
@@ -69,6 +74,10 @@ entry:
define ptr @test_vld2_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u16_update_reg:
; CHECK: vld2.16 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x2_t %tmp, ptr %dest, align 8
@@ -79,6 +88,10 @@ entry:
define ptr @test_vld2_dup_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_update:
; CHECK: vld2.32 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x2_t %tmp, ptr %dest, align 8
@@ -89,6 +102,10 @@ entry:
define ptr @test_vld2_dup_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_update_reg:
; CHECK: vld2.32 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x2_t %tmp, ptr %dest, align 8
@@ -99,6 +116,10 @@ entry:
define ptr @test_vld2_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u64_update:
; CHECK: vld1.64 {d16, d17}, [r1:64]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x2_t %tmp, ptr %dest, align 8
@@ -109,6 +130,10 @@ entry:
define ptr @test_vld2_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17}, [r1:64], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x2_t %tmp, ptr %dest, align 8
@@ -119,6 +144,10 @@ entry:
define ptr @test_vld2_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u8_update:
; CHECK: vld2.8 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x2_t %tmp, ptr %dest, align 8
@@ -129,6 +158,10 @@ entry:
define ptr @test_vld2_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u8_update_reg:
; CHECK: vld2.8 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x2_t %tmp, ptr %dest, align 8
@@ -139,6 +172,11 @@ entry:
define ptr @test_vld3_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u16_update:
; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x3_t %tmp, ptr %dest, align 8
@@ -149,6 +187,11 @@ entry:
define ptr @test_vld3_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u16_update_reg:
; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x3_t %tmp, ptr %dest, align 8
@@ -159,6 +202,11 @@ entry:
define ptr @test_vld3_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u32_update:
; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x3_t %tmp, ptr %dest, align 8
@@ -169,6 +217,11 @@ entry:
define ptr @test_vld3_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u32_update_reg:
; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x3_t %tmp, ptr %dest, align 8
@@ -179,6 +232,11 @@ entry:
define ptr @test_vld3_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u64_update:
; CHECK: vld1.64 {d16, d17, d18}, [r1]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x3_t %tmp, ptr %dest, align 8
@@ -189,6 +247,11 @@ entry:
define ptr @test_vld3_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17, d18}, [r1], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x3_t %tmp, ptr %dest, align 8
@@ -199,6 +262,11 @@ entry:
define ptr @test_vld3_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u8_update:
; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x3_t %tmp, ptr %dest, align 8
@@ -209,6 +277,11 @@ entry:
define ptr @test_vld3_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u8_update_reg:
; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x3_t %tmp, ptr %dest, align 8
@@ -219,6 +292,12 @@ entry:
define ptr @test_vld4_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u16_update:
; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x4_t %tmp, ptr %dest, align 8
@@ -229,6 +308,12 @@ entry:
define ptr @test_vld4_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u16_update_reg:
; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x4_t %tmp, ptr %dest, align 8
@@ -239,6 +324,12 @@ entry:
define ptr @test_vld4_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u32_update:
; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x4_t %tmp, ptr %dest, align 8
@@ -249,6 +340,12 @@ entry:
define ptr @test_vld4_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u32_update_reg:
; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x4_t %tmp, ptr %dest, align 8
@@ -259,6 +356,12 @@ entry:
define ptr @test_vld4_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u64_update:
; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x4_t %tmp, ptr %dest, align 8
@@ -269,6 +372,12 @@ entry:
define ptr @test_vld4_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x4_t %tmp, ptr %dest, align 8
@@ -279,6 +388,12 @@ entry:
define ptr @test_vld4_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u8_update:
; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x4_t %tmp, ptr %dest, align 8
@@ -289,6 +404,12 @@ entry:
define ptr @test_vld4_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u8_update_reg:
; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x4_t %tmp, ptr %dest, align 8
@@ -300,6 +421,10 @@ define ptr @test_vld2q_dup_u16_update(ptr %dest, ptr %src, ptr %dest0) {
; CHECK-LABEL: test_vld2q_dup_u16_update:
; CHECK: vld2.16 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x2_t %tmp, ptr %dest, align 8
@@ -311,6 +436,10 @@ define ptr @test_vld2q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u16_update_reg:
; CHECK: vld2.16 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x2_t %tmp, ptr %dest, align 8
@@ -322,6 +451,10 @@ define ptr @test_vld2q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2q_dup_u32_update:
; CHECK: vld2.32 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x2_t %tmp, ptr %dest, align 8
@@ -333,6 +466,10 @@ define ptr @test_vld2q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u32_update_reg:
; CHECK: vld2.32 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x2_t %tmp, ptr %dest, align 8
@@ -344,6 +481,10 @@ define ptr @test_vld2q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2q_dup_u8_update:
; CHECK: vld2.8 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x2_t %tmp, ptr %dest, align 8
@@ -355,6 +496,10 @@ define ptr @test_vld2q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u8_update_reg:
; CHECK: vld2.8 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x2_t %tmp, ptr %dest, align 8
@@ -365,7 +510,12 @@ entry:
define ptr @test_vld3q_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u16_update:
; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x3_t %tmp, ptr %dest, align 8
@@ -377,6 +527,11 @@ define ptr @test_vld3q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u16_update_reg:
; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x3_t %tmp, ptr %dest, align 8
@@ -387,7 +542,12 @@ entry:
define ptr @test_vld3q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u32_update:
; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x3_t %tmp, ptr %dest, align 8
@@ -399,6 +559,11 @@ define ptr @test_vld3q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u32_update_reg:
; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x3_t %tmp, ptr %dest, align 8
@@ -409,7 +574,12 @@ entry:
define ptr @test_vld3q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u8_update:
; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x3_t %tmp, ptr %dest, align 8
@@ -421,6 +591,11 @@ define ptr @test_vld3q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u8_update_reg:
; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x3_t %tmp, ptr %dest, align 8
@@ -431,7 +606,13 @@ entry:
define ptr @test_vld4q_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u16_update:
; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.16 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x4_t %tmp, ptr %dest, align 8
@@ -443,6 +624,12 @@ define ptr @test_vld4q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u16_update_reg:
; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.16 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x4_t %tmp, ptr %dest, align 8
@@ -453,7 +640,13 @@ entry:
define ptr @test_vld4q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u32_update:
; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x4_t %tmp, ptr %dest, align 8
@@ -465,6 +658,12 @@ define ptr @test_vld4q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u32_update_reg:
; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x4_t %tmp, ptr %dest, align 8
@@ -475,7 +674,13 @@ entry:
define ptr @test_vld4q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u8_update:
; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.8 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x4_t %tmp, ptr %dest, align 8
@@ -487,6 +692,12 @@ define ptr @test_vld4q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u8_update_reg:
; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.8 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x4_t %tmp, ptr %dest, align 8
diff --git a/llvm/test/CodeGen/ARM/arm-vlddup.ll b/llvm/test/CodeGen/ARM/arm-vlddup.ll
index c22d1374761f3..3fd35276f6dc9 100644
--- a/llvm/test/CodeGen/ARM/arm-vlddup.ll
+++ b/llvm/test/CodeGen/ARM/arm-vlddup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi --float-abi=hard -verify-machineinstrs \
; RUN: -asm-verbose=false | FileCheck %s
%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
@@ -56,178 +56,199 @@ declare %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr, i32)
declare %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr, i32)
declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr, i32)
-; CHECK-LABEL: test_vld2_dup_u16
-; CHECK: vld2.16 {d16[], d17[]}, [r0]
define %struct.uint16x4x2_t @test_vld2_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u16:
+; CHECK: vld2.16 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u32
-; CHECK: vld2.32 {d16[], d17[]}, [r0]
define %struct.uint32x2x2_t @test_vld2_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u32:
+; CHECK: vld2.32 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u64
-; CHECK: vld1.64 {d16, d17}, [r0:64]
define %struct.uint64x1x2_t @test_vld2_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u64:
+; CHECK: vld1.64 {d0, d1}, [r0:64]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u8
-; CHECK: vld2.8 {d16[], d17[]}, [r0]
define %struct.uint8x8x2_t @test_vld2_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u8:
+; CHECK: vld2.8 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x2_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u16
-; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]
define %struct.uint16x4x3_t @test_vld3_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u16:
+; CHECK: vld3.16 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u32
-; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]
define %struct.uint32x2x3_t @test_vld3_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u32:
+; CHECK: vld3.32 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u64
-; CHECK: vld1.64 {d16, d17, d18}, [r1]
define %struct.uint64x1x3_t @test_vld3_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u64:
+; CHECK: vld1.64 {d0, d1, d2}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u8
-; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]
define %struct.uint8x8x3_t @test_vld3_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u8:
+; CHECK: vld3.8 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x3_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u16
-; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint16x4x4_t @test_vld4_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u16:
+; CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u32
-; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint32x2x4_t @test_vld4_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u32:
+; CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u64
-; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]
define %struct.uint64x1x4_t @test_vld4_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u64:
+; CHECK: vld1.64 {d0, d1, d2, d3}, [r0:64]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u8
-; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint8x8x4_t @test_vld4_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u8:
+; CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x4_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u16
-; CHECK: vld2.16 {d16[], d18[]}, [r1]
-; CHECK: vld2.16 {d17[], d19[]}, [r1]
define %struct.uint16x8x2_t @test_vld2q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u16:
+; CHECK: vld2.16 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x2_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u32
-; CHECK: vld2.32 {d16[], d18[]}, [r1]
-; CHECK: vld2.32 {d17[], d19[]}, [r1]
define %struct.uint32x4x2_t @test_vld2q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u32:
+; CHECK: vld2.32 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.32 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x2_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u8
-; CHECK: vld2.8 {d16[], d18[]}, [r1]
-; CHECK: vld2.8 {d17[], d19[]}, [r1]
define %struct.uint8x16x2_t @test_vld2q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u8:
+; CHECK: vld2.8 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.8 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x2_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u16
-; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]
define %struct.uint16x8x3_t @test_vld3q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u16:
+; CHECK: vld3.16 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.16 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x3_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u32
-; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]
define %struct.uint32x4x3_t @test_vld3q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u32:
+; CHECK: vld3.32 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.32 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x3_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u8
-; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]
define %struct.uint8x16x3_t @test_vld3q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u8:
+; CHECK: vld3.8 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.8 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x3_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u16
-; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint16x8x4_t @test_vld4q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u16:
+; CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x4_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u32
-; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint32x4x4_t @test_vld4q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u32:
+; CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.32 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x4_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u8
-; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint8x16x4_t @test_vld4q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u8:
+; CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.8 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x4_t %tmp
diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
index cccbdd0435765..e49128f53b115 100644
--- a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
+++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
@@ -488,7 +488,7 @@ entry:
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vld2.16 {d16[], d18[]}, [r0]
+; CHECK-NEXT: vld2.16 {d0[], d2[]}, [r0]
; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
; CHECK-NEXT: bx lr
entry:
More information about the llvm-commits mailing list