[llvm] [ARM][NEON] Add constraint to vld2 Odd/Even Pseudo instructions. (PR #79287)
Alfie Richards via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 01:22:11 PST 2024
https://github.com/AlfieRichardsArm updated https://github.com/llvm/llvm-project/pull/79287
>From 9e23dfe3ba4f427b6138b17be17b1897c0a49c18 Mon Sep 17 00:00:00 2001
From: Alfie Richards <alfie.richards at arm.com>
Date: Mon, 22 Jan 2024 16:06:14 +0000
Subject: [PATCH 1/3] [ARM][NEON] Add constraint to vld2 Odd/Even Pseudo
instructions. This ensures the odd/even pseudo instructions are allocated to
the same register range.
This fixes #71763.
(https://github.com/llvm/llvm-project/issues/71763)
---
llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 9 ++---
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 5 ---
llvm/lib/Target/ARM/ARMInstrNEON.td | 38 +++++++++++++------
.../test/CodeGen/ARM/bf16-intrinsics-ld-st.ll | 2 +-
4 files changed, 30 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2f9236bb977fc92..f0b69b0b09809f9 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -640,12 +640,9 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
// has an extra operand that is a use of the super-register. Record the
// operand index and skip over it.
unsigned SrcOpIdx = 0;
- if (!IsVLD2DUP) {
- if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
- RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
- RegSpc == SingleHighTSpc)
- SrcOpIdx = OpIdx++;
- }
+ if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc || RegSpc == SingleLowSpc ||
+ RegSpc == SingleHighQSpc || RegSpc == SingleHighTSpc)
+ SrcOpIdx = OpIdx++;
// Copy the predicate operands.
MIB.add(MI.getOperand(OpIdx++));
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e99ee299412a5f3..20dd3e7baf8498f 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3032,11 +3032,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
}
if (is64BitVector || NumVecs == 1) {
// Double registers and VLD1 quad registers are directly supported.
- } else if (NumVecs == 2) {
- const SDValue OpsA[] = {MemAddr, Align, Pred, Reg0, Chain};
- SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl, ResTy,
- MVT::Other, OpsA);
- Chain = SDValue(VLdA, 1);
} else {
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index f31e1e9f97892fa..e7cf8b4657b2c8a 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -1491,12 +1491,26 @@ def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
-def VLD2DUPq8EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32EvenPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudo : VLDQQPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+// Duplicates of the VLDQQ*Pseudo classes with an extra $src operand tied to
+// $dst so that the Even and Odd pseudos are allocated the same register range.
+class VLDQQPseudoConstrained<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr, QQPR: $src), itin,
+ "$src = $dst">;
+class VLDQQWBPseudoConstrained<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR: $src), itin,
+ "$addr.addr = $wb, $src = $dst">;
+class VLDQQWBfixedPseudoConstrained<InstrItinClass itin>
+ : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+ (ins addrmode6:$addr, QQPR: $src), itin,
+ "$addr.addr = $wb, $src = $dst">;
+
+def VLD2DUPq8EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// ...with address register writeback:
multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1534,12 +1548,12 @@ defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
-def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudo<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
class VLD3DUP<bits<4> op7_4, string Dt>
diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
index cccbdd043576508..e49128f53b11575 100644
--- a/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
+++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-ld-st.ll
@@ -488,7 +488,7 @@ entry:
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vld2.16 {d16[], d18[]}, [r0]
+; CHECK-NEXT: vld2.16 {d0[], d2[]}, [r0]
; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
; CHECK-NEXT: bx lr
entry:
>From 4670f146515edc2eadc474957f81e4c1247261d6 Mon Sep 17 00:00:00 2001
From: Alfie Richards <alfie.richards at arm.com>
Date: Thu, 25 Jan 2024 16:43:13 +0000
Subject: [PATCH 2/3] Update tests to use hard-float calling convention
---
llvm/test/CodeGen/ARM/arm-vlddup-update.ll | 225 ++++++++++++++++++++-
llvm/test/CodeGen/ARM/arm-vlddup.ll | 125 +++++++-----
2 files changed, 291 insertions(+), 59 deletions(-)
diff --git a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
index d4b2f0203bde785..875c4e0d3777f3b 100644
--- a/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
+++ b/llvm/test/CodeGen/ARM/arm-vlddup-update.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi --float-abi=hard -verify-machineinstrs \
; RUN: -asm-verbose=false | FileCheck %s
%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
@@ -59,6 +60,10 @@ declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr, i32)
define ptr @test_vld2_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u16_update:
; CHECK: vld2.16 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x2_t %tmp, ptr %dest, align 8
@@ -69,6 +74,10 @@ entry:
define ptr @test_vld2_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u16_update_reg:
; CHECK: vld2.16 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x2_t %tmp, ptr %dest, align 8
@@ -79,6 +88,10 @@ entry:
define ptr @test_vld2_dup_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_update:
; CHECK: vld2.32 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x2_t %tmp, ptr %dest, align 8
@@ -89,6 +102,10 @@ entry:
define ptr @test_vld2_dup_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_update_reg:
; CHECK: vld2.32 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x2_t %tmp, ptr %dest, align 8
@@ -99,6 +116,10 @@ entry:
define ptr @test_vld2_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u64_update:
; CHECK: vld1.64 {d16, d17}, [r1:64]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x2_t %tmp, ptr %dest, align 8
@@ -109,6 +130,10 @@ entry:
define ptr @test_vld2_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17}, [r1:64], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x2_t %tmp, ptr %dest, align 8
@@ -119,6 +144,10 @@ entry:
define ptr @test_vld2_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2_dup_u8_update:
; CHECK: vld2.8 {d16[], d17[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x2_t %tmp, ptr %dest, align 8
@@ -129,6 +158,10 @@ entry:
define ptr @test_vld2_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2_dup_u8_update_reg:
; CHECK: vld2.8 {d16[], d17[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vstr d17, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x2_t %tmp, ptr %dest, align 8
@@ -139,6 +172,11 @@ entry:
define ptr @test_vld3_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u16_update:
; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x3_t %tmp, ptr %dest, align 8
@@ -149,6 +187,11 @@ entry:
define ptr @test_vld3_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u16_update_reg:
; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x3_t %tmp, ptr %dest, align 8
@@ -159,6 +202,11 @@ entry:
define ptr @test_vld3_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u32_update:
; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x3_t %tmp, ptr %dest, align 8
@@ -169,6 +217,11 @@ entry:
define ptr @test_vld3_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u32_update_reg:
; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x3_t %tmp, ptr %dest, align 8
@@ -179,6 +232,11 @@ entry:
define ptr @test_vld3_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u64_update:
; CHECK: vld1.64 {d16, d17, d18}, [r1]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x3_t %tmp, ptr %dest, align 8
@@ -189,6 +247,11 @@ entry:
define ptr @test_vld3_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17, d18}, [r1], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x3_t %tmp, ptr %dest, align 8
@@ -199,6 +262,11 @@ entry:
define ptr @test_vld3_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3_dup_u8_update:
; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x3_t %tmp, ptr %dest, align 8
@@ -209,6 +277,11 @@ entry:
define ptr @test_vld3_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3_dup_u8_update_reg:
; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vstr d18, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x3_t %tmp, ptr %dest, align 8
@@ -219,6 +292,12 @@ entry:
define ptr @test_vld4_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u16_update:
; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x4_t %tmp, ptr %dest, align 8
@@ -229,6 +308,12 @@ entry:
define ptr @test_vld4_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u16_update_reg:
; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.16 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
store %struct.uint16x4x4_t %tmp, ptr %dest, align 8
@@ -239,6 +324,12 @@ entry:
define ptr @test_vld4_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u32_update:
; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x4_t %tmp, ptr %dest, align 8
@@ -249,6 +340,12 @@ entry:
define ptr @test_vld4_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u32_update_reg:
; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.32 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
store %struct.uint32x2x4_t %tmp, ptr %dest, align 8
@@ -259,6 +356,12 @@ entry:
define ptr @test_vld4_dup_u64_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u64_update:
; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]!
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x4_t %tmp, ptr %dest, align 8
@@ -269,6 +372,12 @@ entry:
define ptr @test_vld4_dup_u64_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u64_update_reg:
; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64], r2
+; CHECK-NEXT: vst1.64 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.64 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
store %struct.uint64x1x4_t %tmp, ptr %dest, align 8
@@ -279,6 +388,12 @@ entry:
define ptr @test_vld4_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4_dup_u8_update:
; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x4_t %tmp, ptr %dest, align 8
@@ -289,6 +404,12 @@ entry:
define ptr @test_vld4_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4_dup_u8_update_reg:
; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d17}, [r0:64]!
+; CHECK-NEXT: vst1.8 {d18}, [r0:64]!
+; CHECK-NEXT: vstr d19, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
store %struct.uint8x8x4_t %tmp, ptr %dest, align 8
@@ -300,6 +421,10 @@ define ptr @test_vld2q_dup_u16_update(ptr %dest, ptr %src, ptr %dest0) {
; CHECK-LABEL: test_vld2q_dup_u16_update:
; CHECK: vld2.16 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x2_t %tmp, ptr %dest, align 8
@@ -311,6 +436,10 @@ define ptr @test_vld2q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u16_update_reg:
; CHECK: vld2.16 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.16 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x2_t %tmp, ptr %dest, align 8
@@ -322,6 +451,10 @@ define ptr @test_vld2q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2q_dup_u32_update:
; CHECK: vld2.32 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x2_t %tmp, ptr %dest, align 8
@@ -333,6 +466,10 @@ define ptr @test_vld2q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u32_update_reg:
; CHECK: vld2.32 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.32 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x2_t %tmp, ptr %dest, align 8
@@ -344,6 +481,10 @@ define ptr @test_vld2q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld2q_dup_u8_update:
; CHECK: vld2.8 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x2_t %tmp, ptr %dest, align 8
@@ -355,6 +496,10 @@ define ptr @test_vld2q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld2q_dup_u8_update_reg:
; CHECK: vld2.8 {d16[], d18[]}, [r1]
; CHECK-NEXT: vld2.8 {d17[], d19[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.64 {d18, d19}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x2_t %tmp, ptr %dest, align 8
@@ -365,7 +510,12 @@ entry:
define ptr @test_vld3q_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u16_update:
; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x3_t %tmp, ptr %dest, align 8
@@ -377,6 +527,11 @@ define ptr @test_vld3q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u16_update_reg:
; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.16 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x3_t %tmp, ptr %dest, align 8
@@ -387,7 +542,12 @@ entry:
define ptr @test_vld3q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u32_update:
; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x3_t %tmp, ptr %dest, align 8
@@ -399,6 +559,11 @@ define ptr @test_vld3q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u32_update_reg:
; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.32 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x3_t %tmp, ptr %dest, align 8
@@ -409,7 +574,12 @@ entry:
define ptr @test_vld3q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld3q_dup_u8_update:
; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x3_t %tmp, ptr %dest, align 8
@@ -421,6 +591,11 @@ define ptr @test_vld3q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld3q_dup_u8_update_reg:
; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
; CHECK-NEXT: vld3.8 {d17[], d19[], d21[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.64 {d20, d21}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x3_t %tmp, ptr %dest, align 8
@@ -431,7 +606,13 @@ entry:
define ptr @test_vld4q_dup_u16_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u16_update:
; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.16 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x4_t %tmp, ptr %dest, align 8
@@ -443,6 +624,12 @@ define ptr @test_vld4q_dup_u16_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u16_update_reg:
; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.16 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.16 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.16 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.16 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
store %struct.uint16x8x4_t %tmp, ptr %dest, align 8
@@ -453,7 +640,13 @@ entry:
define ptr @test_vld4q_dup_u32_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u32_update:
; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x4_t %tmp, ptr %dest, align 8
@@ -465,6 +658,12 @@ define ptr @test_vld4q_dup_u32_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u32_update_reg:
; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.32 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
store %struct.uint32x4x4_t %tmp, ptr %dest, align 8
@@ -475,7 +674,13 @@ entry:
define ptr @test_vld4q_dup_u8_update(ptr %dest, ptr %src) {
; CHECK-LABEL: test_vld4q_dup_u8_update:
; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]!
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.8 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x4_t %tmp, ptr %dest, align 8
@@ -487,6 +692,12 @@ define ptr @test_vld4q_dup_u8_update_reg(ptr %dest, ptr %src, i32 %inc) {
; CHECK-LABEL: test_vld4q_dup_u8_update_reg:
; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
; CHECK-NEXT: vld4.8 {d17[], d19[], d21[], d23[]}, [r1], r2
+; CHECK-NEXT: vst1.8 {d16, d17}, [r0]!
+; CHECK-NEXT: vst1.8 {d18, d19}, [r0]!
+; CHECK-NEXT: vst1.8 {d20, d21}, [r0]!
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0]
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
store %struct.uint8x16x4_t %tmp, ptr %dest, align 8
diff --git a/llvm/test/CodeGen/ARM/arm-vlddup.ll b/llvm/test/CodeGen/ARM/arm-vlddup.ll
index c22d1374761f30d..3fd35276f6dc916 100644
--- a/llvm/test/CodeGen/ARM/arm-vlddup.ll
+++ b/llvm/test/CodeGen/ARM/arm-vlddup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -verify-machineinstrs \
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi --float-abi=hard -verify-machineinstrs \
; RUN: -asm-verbose=false | FileCheck %s
%struct.uint16x4x2_t = type { <4 x i16>, <4 x i16> }
@@ -56,178 +56,199 @@ declare %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr, i32)
declare %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr, i32)
declare %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr, i32)
-; CHECK-LABEL: test_vld2_dup_u16
-; CHECK: vld2.16 {d16[], d17[]}, [r0]
define %struct.uint16x4x2_t @test_vld2_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u16:
+; CHECK: vld2.16 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x2_t @llvm.arm.neon.vld2dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u32
-; CHECK: vld2.32 {d16[], d17[]}, [r0]
define %struct.uint32x2x2_t @test_vld2_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u32:
+; CHECK: vld2.32 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x2_t @llvm.arm.neon.vld2dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u64
-; CHECK: vld1.64 {d16, d17}, [r0:64]
define %struct.uint64x1x2_t @test_vld2_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u64:
+; CHECK: vld1.64 {d0, d1}, [r0:64]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x2_t @llvm.arm.neon.vld2dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x2_t %tmp
}
-; CHECK-LABEL: test_vld2_dup_u8
-; CHECK: vld2.8 {d16[], d17[]}, [r0]
define %struct.uint8x8x2_t @test_vld2_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld2_dup_u8:
+; CHECK: vld2.8 {d0[], d1[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x2_t @llvm.arm.neon.vld2dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x2_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u16
-; CHECK: vld3.16 {d16[], d17[], d18[]}, [r1]
define %struct.uint16x4x3_t @test_vld3_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u16:
+; CHECK: vld3.16 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x3_t @llvm.arm.neon.vld3dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u32
-; CHECK: vld3.32 {d16[], d17[], d18[]}, [r1]
define %struct.uint32x2x3_t @test_vld3_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u32:
+; CHECK: vld3.32 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x3_t @llvm.arm.neon.vld3dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u64
-; CHECK: vld1.64 {d16, d17, d18}, [r1]
define %struct.uint64x1x3_t @test_vld3_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u64:
+; CHECK: vld1.64 {d0, d1, d2}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x3_t @llvm.arm.neon.vld3dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x3_t %tmp
}
-; CHECK-LABEL: test_vld3_dup_u8
-; CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]
define %struct.uint8x8x3_t @test_vld3_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld3_dup_u8:
+; CHECK: vld3.8 {d0[], d1[], d2[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x3_t @llvm.arm.neon.vld3dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x3_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u16
-; CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint16x4x4_t @test_vld4_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u16:
+; CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x4x4_t @llvm.arm.neon.vld4dup.v4i16.p0(ptr %src, i32 2)
ret %struct.uint16x4x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u32
-; CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint32x2x4_t @test_vld4_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u32:
+; CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x2x4_t @llvm.arm.neon.vld4dup.v2i32.p0(ptr %src, i32 4)
ret %struct.uint32x2x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u64
-; CHECK: vld1.64 {d16, d17, d18, d19}, [r1:64]
define %struct.uint64x1x4_t @test_vld4_dup_u64(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u64:
+; CHECK: vld1.64 {d0, d1, d2, d3}, [r0:64]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint64x1x4_t @llvm.arm.neon.vld4dup.v1i64.p0(ptr %src, i32 8)
ret %struct.uint64x1x4_t %tmp
}
-; CHECK-LABEL: test_vld4_dup_u8
-; CHECK: vld4.8 {d16[], d17[], d18[], d19[]}, [r1]
define %struct.uint8x8x4_t @test_vld4_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld4_dup_u8:
+; CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x8x4_t @llvm.arm.neon.vld4dup.v8i8.p0(ptr %src, i32 1)
ret %struct.uint8x8x4_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u16
-; CHECK: vld2.16 {d16[], d18[]}, [r1]
-; CHECK: vld2.16 {d17[], d19[]}, [r1]
define %struct.uint16x8x2_t @test_vld2q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u16:
+; CHECK: vld2.16 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.16 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x2_t @llvm.arm.neon.vld2dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x2_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u32
-; CHECK: vld2.32 {d16[], d18[]}, [r1]
-; CHECK: vld2.32 {d17[], d19[]}, [r1]
define %struct.uint32x4x2_t @test_vld2q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u32:
+; CHECK: vld2.32 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.32 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x2_t @llvm.arm.neon.vld2dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x2_t %tmp
}
-; CHECK-LABEL: test_vld2q_dup_u8
-; CHECK: vld2.8 {d16[], d18[]}, [r1]
-; CHECK: vld2.8 {d17[], d19[]}, [r1]
define %struct.uint8x16x2_t @test_vld2q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld2q_dup_u8:
+; CHECK: vld2.8 {d0[], d2[]}, [r0]
+; CHECK-NEXT: vld2.8 {d1[], d3[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x2_t @llvm.arm.neon.vld2dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x2_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u16
-; CHECK: vld3.16 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.16 {d17[], d19[], d21[]}, [r1]
define %struct.uint16x8x3_t @test_vld3q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u16:
+; CHECK: vld3.16 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.16 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x3_t @llvm.arm.neon.vld3dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x3_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u32
-; CHECK: vld3.32 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.32 {d17[], d19[], d21[]}, [r1]
define %struct.uint32x4x3_t @test_vld3q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u32:
+; CHECK: vld3.32 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.32 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x3_t @llvm.arm.neon.vld3dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x3_t %tmp
}
-; CHECK-LABEL: test_vld3q_dup_u8
-; CHECK: vld3.8 {d16[], d18[], d20[]}, [r1]
-; CHECK: vld3.8 {d17[], d19[], d21[]}, [r1]
define %struct.uint8x16x3_t @test_vld3q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld3q_dup_u8:
+; CHECK: vld3.8 {d0[], d2[], d4[]}, [r0]
+; CHECK-NEXT: vld3.8 {d1[], d3[], d5[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x3_t @llvm.arm.neon.vld3dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x3_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u16
-; CHECK: vld4.16 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.16 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint16x8x4_t @test_vld4q_dup_u16(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u16:
+; CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint16x8x4_t @llvm.arm.neon.vld4dup.v8i16.p0(ptr %src, i32 2)
ret %struct.uint16x8x4_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u32
-; CHECK: vld4.32 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.32 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint32x4x4_t @test_vld4q_dup_u32(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u32:
+; CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.32 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint32x4x4_t @llvm.arm.neon.vld4dup.v4i32.p0(ptr %src, i32 4)
ret %struct.uint32x4x4_t %tmp
}
-; CHECK-LABEL: test_vld4q_dup_u8
-; CHECK: vld4.8 {d16[], d18[], d20[], d22[]}, [r1]
-; CHECK: vld4.8 {d17[], d19[], d21[], d23[]}, [r1]
define %struct.uint8x16x4_t @test_vld4q_dup_u8(ptr %src) {
+; CHECK-LABEL: test_vld4q_dup_u8:
+; CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r0]
+; CHECK-NEXT: vld4.8 {d1[], d3[], d5[], d7[]}, [r0]
+; CHECK-NEXT: bx lr
entry:
%tmp = tail call %struct.uint8x16x4_t @llvm.arm.neon.vld4dup.v16i8.p0(ptr %src, i32 1)
ret %struct.uint8x16x4_t %tmp
>From ff0891af2ddd524ea5af24e87d5023376313a361 Mon Sep 17 00:00:00 2001
From: Alfie Richards <alfie.richards at arm.com>
Date: Fri, 26 Jan 2024 09:08:40 +0000
Subject: [PATCH 3/3] Changed names
---
llvm/lib/Target/ARM/ARMInstrNEON.td | 30 ++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index e7cf8b4657b2c8a..f160c031d843a90 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -1493,24 +1493,24 @@ def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
// Duplicate of VLDQQPseudo but with a constraint variable
// to ensure the odd and even lanes use the same register range
-class VLDQQPseudoConstrained<InstrItinClass itin>
+class VLDQQPseudoInputDST<InstrItinClass itin>
: PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr, QQPR: $src), itin,
"$src = $dst">;
-class VLDQQWBPseudoConstrained<InstrItinClass itin>
+class VLDQQWBPseudoInputDST<InstrItinClass itin>
: PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
(ins addrmode6:$addr, am6offset:$offset, QQPR: $src), itin,
"$addr.addr = $wb, $src = $dst">;
-class VLDQQWBfixedPseudoConstrained<InstrItinClass itin>
+class VLDQQWBfixedPseudoInputDST<InstrItinClass itin>
: PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
(ins addrmode6:$addr, QQPR: $src), itin,
"$addr.addr = $wb, $src = $dst">;
-def VLD2DUPq8EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32EvenPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudo : VLDQQPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32EvenPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudo : VLDQQPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// ...with address register writeback:
multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1548,12 +1548,12 @@ defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
addrmode6dupalign64>;
-def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
-def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudoConstrained<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_fixed : VLDQQWBfixedPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq8OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq16OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
+def VLD2DUPq32OddPseudoWB_register : VLDQQWBPseudoInputDST<IIC_VLD2dup>, Sched<[WriteVLD2]>;
// VLD3DUP : Vector Load (single 3-element structure to all lanes)
class VLD3DUP<bits<4> op7_4, string Dt>
More information about the llvm-commits mailing list