[llvm] fece148 - [ARM] Additional tests for qr intrinsics in loops. NFC
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 17 04:39:34 PDT 2020
Author: David Green
Date: 2020-09-17T12:39:21+01:00
New Revision: fece1489d10bb189fe46bd08385ff6b8954dc39c
URL: https://github.com/llvm/llvm-project/commit/fece1489d10bb189fe46bd08385ff6b8954dc39c
DIFF: https://github.com/llvm/llvm-project/commit/fece1489d10bb189fe46bd08385ff6b8954dc39c.diff
LOG: [ARM] Additional tests for qr intrinsics in loops. NFC
Added:
llvm/test/CodeGen/Thumb2/mve-qrintr.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintr.ll b/llvm/test/CodeGen/Thumb2/mve-qrintr.ll
new file mode 100644
index 000000000000..4fcfe37b89e5
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintr.ll
@@ -0,0 +1,709 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+
+define void @vadd(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated add of splat(%c0), masked store
+; CHECK-LABEL: vadd:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB0_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB0_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vadd.i32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB0_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the add and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vsub(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated subtract of splat(%c0), masked store
+; CHECK-LABEL: vsub:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB1_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB1_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vsub.i32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB1_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the subtract and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) ; loaded vector minus splat; trailing %2: inactive-lane value (TODO confirm operand order)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vmul(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated multiply by splat(%c0), masked store
+; CHECK-LABEL: vmul:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB2_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB2_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmul.i32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB2_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the multiply and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vqadd(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated saturating add of splat(%c0), masked store
+; CHECK-LABEL: vqadd:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB3_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB3_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vqadd.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB3_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the add and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) ; i32 0 pairs with the signed form (vqadd.s32) in the expected assembly; trailing %2: inactive-lane value (TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vqsub(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated saturating subtract of splat(%c0), masked store
+; CHECK-LABEL: vqsub:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB4_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB4_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vqsub.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB4_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the subtract and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) ; i32 0 pairs with the signed form (vqsub.s32) in the expected assembly; trailing %2: inactive-lane value (TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vhadd(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated halving add of splat(%c0), masked store
+; CHECK-LABEL: vhadd:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB5_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB5_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vhadd.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB5_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the add and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) ; i32 0 pairs with the signed form (vhadd.s32) in the expected assembly; trailing %2: inactive-lane value (TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vhsub(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated halving subtract of splat(%c0), masked store
+; CHECK-LABEL: vhsub:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB6_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB6_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vhsub.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB6_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the subtract and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2) ; i32 0 pairs with the signed form (vhsub.s32) in the expected assembly; trailing %2: inactive-lane value (TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vqdmull(i32* %s1, i32 %c0, i32 %N) { ; loop over %s1: masked load of 4 x i16, sign-extend, predicated vqdmull against a 16-bit splat of %c0, masked store of 4 x i32 to the same address
+; CHECK-LABEL: vqdmull:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB7_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.16 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB7_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.s32 q1, [r0]
+; CHECK-NEXT: vqdmullb.s16 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB7_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %conv = trunc i32 %c0 to i16 ; only the low 16 bits of %c0 are used
+ %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer ; loop-invariant broadcast to all eight i16 lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the multiply and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i16>* ; the load views the buffer as i16 elements
+ %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
+ %3 = sext <4 x i16> %2 to <4 x i32> ; widen to i32 lanes (load + sext appear as vldrh.s32 in the expected assembly)
+ %4 = bitcast <4 x i32> %3 to <8 x i16> ; reinterpret as eight i16 lanes to match the intrinsic's operand type
+ %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3) ; i32 0 pairs with the bottom form (vqdmullb.s16) in the expected assembly; trailing %3: inactive-lane value (TODO confirm)
+ %6 = bitcast i32* %s1.addr.013 to <4 x i32>* ; the store writes full i32 lanes through the same pointer
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated vqdmulh with splat(%c0), masked store
+; CHECK-LABEL: vqdmulh:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB8_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB8_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vqdmulh.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB8_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the multiply and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) { ; in-place loop over %s1, 4 x i32 per iteration: masked load, predicated vqrdmulh with splat(%c0), masked store
+; CHECK-LABEL: vqrdmulh:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB9_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB9_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vqrdmulh.s32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB9_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the multiply and the store
+ %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
+ %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
+ %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4 ; advance by four i32 elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vaddf(float* %s1, float %c0, i32 %N) { ; in-place loop over %s1, 4 x float per iteration: masked load, predicated add of splat(%c0), masked store
+; CHECK-LABEL: vaddf:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB10_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB10_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vadd.f32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB10_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the add and the store
+ %1 = bitcast float* %s1.addr.013 to <4 x float>*
+ %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
+ %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 ; advance by four float elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vsubf(float* %s1, float %c0, i32 %N) { ; in-place loop over %s1, 4 x float per iteration: masked load, predicated subtract of splat(%c0), masked store
+; CHECK-LABEL: vsubf:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB11_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB11_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vsub.f32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB11_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the subtract and the store
+ %1 = bitcast float* %s1.addr.013 to <4 x float>*
+ %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
+ %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2) ; loaded vector minus splat; trailing %2: inactive-lane value (TODO confirm operand order)
+ tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 ; advance by four float elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vmulf(float* %s1, float %c0, i32 %N) { ; in-place loop over %s1, 4 x float per iteration: masked load, predicated multiply by splat(%c0), masked store
+; CHECK-LABEL: vmulf:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB12_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r1
+; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: .LBB12_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmul.f32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q1, [r0], #16
+; CHECK-NEXT: letp lr, .LBB12_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp11 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp11, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012) ; lane predicate from the remaining count; reused as mask for the load, the multiply and the store
+ %1 = bitcast float* %s1.addr.013 to <4 x float>*
+ %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
+ %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2) ; trailing %2: inactive-lane value (operand order per MVE predicated intrinsics; TODO confirm)
+ tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
+ %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4 ; advance by four float elements
+ %sub = add nsw i32 %N.addr.012, -4
+ %cmp = icmp sgt i32 %N.addr.012, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) { ; loop storing fma(load(%s2), splat(%c0), load(%s1)) back to %s1 under a vctp32 predicate; the splat is a multiplicand and %s2 is never advanced
+; CHECK-LABEL: vfma:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r3, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB13_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r2
+; CHECK-NEXT: dlstp.32 lr, r3
+; CHECK-NEXT: .LBB13_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r0], #16
+; CHECK-NEXT: letp lr, .LBB13_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp12 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp12, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %0 = bitcast float* %s2 to <4 x float>* ; computed once outside the loop, so every iteration reads the same %s2 address
+ %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) ; lane predicate from the remaining count; reused as mask for both loads, the fma and the store
+ %2 = bitcast float* %s1.addr.014 to <4 x float>*
+ %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
+ %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
+ %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1) ; %4 * splat accumulated onto %3, matching vfma.f32 q2, q1, q0 in the expected assembly
+ tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
+ %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 ; advance %s1 by four float elements
+ %sub = add nsw i32 %N.addr.013, -4
+ %cmp = icmp sgt i32 %N.addr.013, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) { ; loop storing fma(load(%s1), load(%s2), splat(%c0)) back to %s1 under a vctp32 predicate; the splat is the addend and %s2 is never advanced
+; CHECK-LABEL: vfmas:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r3, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: poplt {r7, pc}
+; CHECK-NEXT: .LBB14_1: @ %while.body.lr.ph
+; CHECK-NEXT: vdup.32 q0, r2
+; CHECK-NEXT: dlstp.32 lr, r3
+; CHECK-NEXT: .LBB14_2: @ %while.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vfma.f32 q3, q2, q1
+; CHECK-NEXT: vstrw.32 q3, [r0], #16
+; CHECK-NEXT: letp lr, .LBB14_2
+; CHECK-NEXT: @ %bb.3: @ %while.end
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp12 = icmp sgt i32 %N, 0 ; guard: skip the loop entirely when %N <= 0
+ br i1 %cmp12, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph: ; preds = %entry
+ %0 = bitcast float* %s2 to <4 x float>* ; computed once outside the loop, so every iteration reads the same %s2 address
+ %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
+ %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer ; loop-invariant broadcast of %c0 to all four lanes
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
+ %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ] ; remaining element count
+ %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013) ; lane predicate from the remaining count; reused as mask for both loads, the fma and the store
+ %2 = bitcast float* %s1.addr.014 to <4 x float>*
+ %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
+ %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
+ %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1) ; %3 * %4 accumulated onto the splat; the expected assembly copies the splat (vmov q3, q0) each iteration before vfma.f32 q3, q2, q1
+ tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
+ %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4 ; advance %s1 by four float elements
+ %sub = add nsw i32 %N.addr.013, -4
+ %cmp = icmp sgt i32 %N.addr.013, 4 ; loop again only if more than 4 elements remained before this iteration
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) ; intrinsic declarations for the calls made by the test functions in this file; vctp32 produces the loop predicate
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
+declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) ; the q*/h* forms carry an extra i32 operand; every call in this file passes 0
+declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) ; narrow load used only by @vqdmull
+declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>) ; i32 operand: @vqdmull passes 0
+declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) ; three vector operands then the predicate; no separate inactive-lane operand in this signature
More information about the llvm-commits
mailing list