[llvm] [AArch64][SVE2] Lower read-after-write mask to whilerw (PR #114028)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 5 08:50:17 PST 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/114028
>From fea72a0fff1bfc0f0a950facf0e7e5404fedc0c0 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Tue, 29 Oct 2024 10:05:07 +0000
Subject: [PATCH 1/4] [AArch64][SVE2] Lower read-after-write mask to whilerw
This patch extends the whilewr matching to also match a read-after-write
mask and lower it to a whilerw.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 34 ++-
llvm/test/CodeGen/AArch64/whilewr.ll | 261 +++++++++++++-----
2 files changed, 223 insertions(+), 72 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf2f0674b5b65e..a2517761afc0c9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14189,7 +14189,16 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
return SDValue();
SDValue Diff = Cmp.getOperand(0);
- if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+ SDValue NonAbsDiff = Diff;
+ bool WriteAfterRead = true;
+ // A read-after-write will have an abs call on the diff
+ if (Diff.getOpcode() == ISD::ABS) {
+ NonAbsDiff = Diff.getOperand(0);
+ WriteAfterRead = false;
+ }
+
+ if (NonAbsDiff.getOpcode() != ISD::SUB ||
+ NonAbsDiff.getValueType() != MVT::i64)
return SDValue();
if (!isNullConstant(LaneMask.getOperand(1)) ||
@@ -14210,8 +14219,13 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
// it's positive, otherwise the difference plus the element size if it's
// negative: pos_diff = diff < 0 ? (diff + 7) : diff
SDValue Select = DiffDiv.getOperand(0);
+ SDValue SelectOp3 = Select.getOperand(3);
+ // Check for an abs in the case of a read-after-write
+ if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS)
+ SelectOp3 = SelectOp3.getOperand(0);
+
// Make sure the difference is being compared by the select
- if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
+ if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff)
return SDValue();
// Make sure it's checking if the difference is less than 0
if (!isNullConstant(Select.getOperand(1)) ||
@@ -14243,22 +14257,26 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
} else if (LaneMask.getOperand(2) != Diff)
return SDValue();
- SDValue StorePtr = Diff.getOperand(0);
- SDValue ReadPtr = Diff.getOperand(1);
+ SDValue StorePtr = NonAbsDiff.getOperand(0);
+ SDValue ReadPtr = NonAbsDiff.getOperand(1);
unsigned IntrinsicID = 0;
switch (EltSize) {
case 1:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
+ IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
+ : Intrinsic::aarch64_sve_whilerw_b;
break;
case 2:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
+ IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
+ : Intrinsic::aarch64_sve_whilerw_h;
break;
case 4:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
+ IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
+ : Intrinsic::aarch64_sve_whilerw_s;
break;
case 8:
- IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
+ IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
+ : Intrinsic::aarch64_sve_whilerw_d;
break;
default:
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 9f1ea850792384..ec59e42feb6a4f 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -30,6 +30,36 @@ entry:
ret <vscale x 16 x i1> %active.lane.mask.alias
}
+define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilerw p0.b, x2, x1
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b24 = ptrtoint ptr %b to i64
+ %c25 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %c25, %b24
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %neg.compare = icmp slt i64 %0, 0
+ %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %0)
+ %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_commutative:
; CHECK: // %bb.0: // %entry
@@ -89,6 +119,39 @@ entry:
ret <vscale x 8 x i1> %active.lane.mask.alias
}
+define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilerw p0.h, x2, x1
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT: cmn x8, #1
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b24 = ptrtoint ptr %b to i64
+ %c25 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %c25, %b24
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 2
+ %neg.compare = icmp slt i64 %0, -1
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_32:
; CHECK: // %bb.0: // %entry
@@ -122,6 +185,41 @@ entry:
ret <vscale x 4 x i1> %active.lane.mask.alias
}
+define <vscale x 4 x i1> @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilerw p0.s, x2, x1
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT: add x9, x8, #3
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #3
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #2
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b24 = ptrtoint ptr %b to i64
+ %c25 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %c25, %b24
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 4
+ %neg.compare = icmp slt i64 %0, -3
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_64:
; CHECK: // %bb.0: // %entry
@@ -155,6 +253,41 @@ entry:
ret <vscale x 2 x i1> %active.lane.mask.alias
}
+define <vscale x 2 x i1> @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilerw_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilerw p0.d, x2, x1
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilerw_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT: add x9, x8, #7
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #7
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #3
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b24 = ptrtoint ptr %b to i64
+ %c25 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %c25, %b24
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 8
+ %neg.compare = icmp slt i64 %0, -7
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 2 x i1> %active.lane.mask.alias
+}
+
define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: no_whilewr_128:
; CHECK: // %bb.0: // %entry
@@ -212,7 +345,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB6_3
+; CHECK-NEXT: b.lt .LBB10_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: whilewr p0.b, x1, x2
; CHECK-NEXT: mov w9, w3
@@ -220,7 +353,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.b, xzr, x9
; CHECK-NEXT: cntp x10, p0, p0.b
; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB6_2: // %vector.body
+; CHECK-NEXT: .LBB10_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
@@ -229,14 +362,14 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB6_2
-; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB10_2
+; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB10_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x1, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
@@ -250,7 +383,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
@@ -259,8 +392,8 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB10_2
+; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -306,14 +439,14 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB7_3
+; CHECK-NEXT: b.lt .LBB11_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: whilewr p1.h, x1, x2
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB7_2: // %vector.body
+; CHECK-NEXT: .LBB11_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
@@ -321,14 +454,14 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
; CHECK-NEXT: inch x9
; CHECK-NEXT: whilelo p0.h, x9, x8
-; CHECK-NEXT: b.mi .LBB7_2
-; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB11_2
+; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB7_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB11_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: mov w9, w3
; CHECK-NOSVE2-NEXT: sub x10, x1, x2
@@ -344,7 +477,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: cnth x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
@@ -352,8 +485,8 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB7_2
-; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB11_2
+; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -399,14 +532,14 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB8_3
+; CHECK-NEXT: b.lt .LBB12_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: whilewr p1.s, x1, x2
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB8_2: // %vector.body
+; CHECK-NEXT: .LBB12_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
@@ -414,14 +547,14 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
; CHECK-NEXT: incw x9
; CHECK-NEXT: whilelo p0.s, x9, x8
-; CHECK-NEXT: b.mi .LBB8_2
-; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB12_2
+; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB8_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB12_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: mov w9, w3
; CHECK-NOSVE2-NEXT: sub x10, x1, x2
@@ -439,7 +572,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: cntw x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB8_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
@@ -447,8 +580,8 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB8_2
-; CHECK-NOSVE2-NEXT: .LBB8_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB12_2
+; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp9 = icmp sgt i32 %n, 0
@@ -494,14 +627,14 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB9_3
+; CHECK-NEXT: b.lt .LBB13_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: whilewr p1.d, x1, x2
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB9_2: // %vector.body
+; CHECK-NEXT: .LBB13_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
@@ -509,14 +642,14 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
; CHECK-NEXT: incd x9
; CHECK-NEXT: whilelo p0.d, x9, x8
-; CHECK-NEXT: b.mi .LBB9_2
-; CHECK-NEXT: .LBB9_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB13_2
+; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB9_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB13_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: mov w9, w3
; CHECK-NOSVE2-NEXT: sub x10, x1, x2
@@ -534,7 +667,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: cntd x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB9_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
@@ -542,8 +675,8 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB9_2
-; CHECK-NOSVE2-NEXT: .LBB9_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB13_2
+; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp9 = icmp sgt i32 %n, 0
@@ -589,7 +722,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB10_3
+; CHECK-NEXT: b.lt .LBB14_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: whilewr p0.b, x0, x2
; CHECK-NEXT: mov w9, w3
@@ -599,7 +732,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.b, xzr, x9
; CHECK-NEXT: cntp x10, p0, p0.b
; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB10_2: // %vector.body
+; CHECK-NEXT: .LBB14_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
@@ -608,14 +741,14 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB10_2
-; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB14_2
+; CHECK-NEXT: .LBB14_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB10_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB14_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
@@ -637,7 +770,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB14_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
@@ -646,8 +779,8 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB10_2
-; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB14_2
+; CHECK-NOSVE2-NEXT: .LBB14_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -701,7 +834,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB11_3
+; CHECK-NEXT: b.lt .LBB15_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: whilewr p0.h, x0, x2
; CHECK-NEXT: mov w9, w3
@@ -711,7 +844,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.h, xzr, x9
; CHECK-NEXT: cntp x10, p0, p0.h
; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB11_2: // %vector.body
+; CHECK-NEXT: .LBB15_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -720,14 +853,14 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.h, x8, x9
-; CHECK-NEXT: b.mi .LBB11_2
-; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB15_2
+; CHECK-NEXT: .LBB15_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB11_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB15_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
@@ -753,7 +886,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h
; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB15_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
@@ -762,8 +895,8 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB11_2
-; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB15_2
+; CHECK-NOSVE2-NEXT: .LBB15_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp11 = icmp sgt i32 %n, 0
@@ -819,7 +952,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB12_3
+; CHECK-NEXT: b.lt .LBB16_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: whilewr p0.s, x0, x2
; CHECK-NEXT: mov w9, w3
@@ -829,7 +962,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.s, xzr, x9
; CHECK-NEXT: cntp x10, p0, p0.s
; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB12_2: // %vector.body
+; CHECK-NEXT: .LBB16_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -838,14 +971,14 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.s, x8, x9
-; CHECK-NEXT: b.mi .LBB12_2
-; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB16_2
+; CHECK-NEXT: .LBB16_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB12_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB16_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
@@ -875,7 +1008,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s
; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB16_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
@@ -884,8 +1017,8 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB12_2
-; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB16_2
+; CHECK-NOSVE2-NEXT: .LBB16_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp9 = icmp sgt i32 %n, 0
@@ -941,7 +1074,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_loop_multiple_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB13_3
+; CHECK-NEXT: b.lt .LBB17_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: whilewr p0.d, x0, x2
; CHECK-NEXT: mov w9, w3
@@ -951,7 +1084,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.d, xzr, x9
; CHECK-NEXT: cntp x10, p0, p0.d
; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB13_2: // %vector.body
+; CHECK-NEXT: .LBB17_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -960,14 +1093,14 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
; CHECK-NEXT: add x8, x8, x10
; CHECK-NEXT: whilelo p1.d, x8, x9
-; CHECK-NEXT: b.mi .LBB13_2
-; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
+; CHECK-NEXT: b.mi .LBB17_2
+; CHECK-NEXT: .LBB17_3: // %for.cond.cleanup
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB13_3
+; CHECK-NOSVE2-NEXT: b.lt .LBB17_3
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
@@ -997,7 +1130,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d
; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body
+; CHECK-NOSVE2-NEXT: .LBB17_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
@@ -1006,8 +1139,8 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB13_2
-; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: b.mi .LBB17_2
+; CHECK-NOSVE2-NEXT: .LBB17_3: // %for.cond.cleanup
; CHECK-NOSVE2-NEXT: ret
entry:
%cmp9 = icmp sgt i32 %n, 0
>From c131430b06e289a57098cd01884c3b72b7ebb1f3 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Nov 2024 15:16:56 +0000
Subject: [PATCH 2/4] Fix comparisons in tests
---
.../Target/AArch64/AArch64ISelLowering.cpp | 155 ++++++------
llvm/test/CodeGen/AArch64/whilewr.ll | 226 +++++++++---------
2 files changed, 199 insertions(+), 182 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a2517761afc0c9..edf86fd7f806f3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14153,11 +14153,13 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
return ResultSLI;
}
-/// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+/// Try to lower the construction of a pointer alias mask to a WHILEWR or
+/// WHILERW. The mask's enabled lanes represent the elements that will not
+/// overlap across one loop iteration. This tries to match:
+/// or (splat (setcc_lt/lte/eq (sub ptrA, ptrB), 0)),
/// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
+/// A call to abs on the subtraction signifies that it's a read-after-write and
+/// hence a WHILERW.
SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget &Subtarget) {
if (!Subtarget.hasSVE2())
@@ -14170,6 +14172,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
+ !isNullConstant(LaneMask.getOperand(1)) ||
Splat.getOpcode() != ISD::SPLAT_VECTOR)
return SDValue();
@@ -14177,16 +14180,17 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
if (Cmp.getOpcode() != ISD::SETCC)
return SDValue();
- CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
-
- auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
- if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
- Cond->get() != ISD::CondCode::SETLT)
- return SDValue();
- unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
- unsigned EltSize = CompValue + 1;
- if (!isPowerOf2_64(EltSize) || EltSize > 8)
- return SDValue();
+ // The number of elements that alias is calculated by dividing the positive
+ // difference between the pointers by the element size. An alias mask for i8
+ // elements omits the division because it would just divide by 1
+ SDValue DiffDiv = LaneMask.getOperand(2);
+ unsigned EltSize = 1;
+ if (DiffDiv.getOpcode() == ISD::SRA) {
+ auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
+ if (!DiffDivConst)
+ return SDValue();
+ EltSize = 1 << DiffDivConst->getZExtValue();
+ }
SDValue Diff = Cmp.getOperand(0);
SDValue NonAbsDiff = Diff;
@@ -14197,66 +14201,81 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
WriteAfterRead = false;
}
- if (NonAbsDiff.getOpcode() != ISD::SUB ||
- NonAbsDiff.getValueType() != MVT::i64)
+ ISD::CondCode Cond = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
+ auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
+ if (!ComparatorConst)
return SDValue();
- if (!isNullConstant(LaneMask.getOperand(1)) ||
- (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
+ // The diff should be compared to 0: a write-after-read uses less-than-or-
+ // equal (or the equivalent less-than 1) and a read-after-write uses equal.
+ int CompValue = ComparatorConst->getSExtValue();
+ switch (CompValue) {
+ case 0:
+ if (WriteAfterRead && Cond != ISD::CondCode::SETLE)
+ return SDValue();
+ else if (!WriteAfterRead && Cond != ISD::CondCode::SETEQ)
+ return SDValue();
+ break;
+ case 1:
+ if (!WriteAfterRead)
+ return SDValue();
+ if (Cond != ISD::CondCode::SETLT)
+ return SDValue();
+ break;
+ default:
return SDValue();
+ }
- // The number of elements that alias is calculated by dividing the positive
- // difference between the pointers by the element size. An alias mask for i8
- // elements omits the division because it would just divide by 1
- if (EltSize > 1) {
- SDValue DiffDiv = LaneMask.getOperand(2);
- auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
- if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
- return SDValue();
- if (EltSize > 2) {
- // When masking i32 or i64 elements, the positive value of the
- // possibly-negative difference comes from a select of the difference if
- // it's positive, otherwise the difference plus the element size if it's
- // negative: pos_diff = diff < 0 ? (diff + 7) : diff
- SDValue Select = DiffDiv.getOperand(0);
- SDValue SelectOp3 = Select.getOperand(3);
- // Check for an abs in the case of a read-after-write
- if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS)
- SelectOp3 = SelectOp3.getOperand(0);
-
- // Make sure the difference is being compared by the select
- if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff)
- return SDValue();
- // Make sure it's checking if the difference is less than 0
- if (!isNullConstant(Select.getOperand(1)) ||
- cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
- ISD::CondCode::SETLT)
- return SDValue();
- // An add creates a positive value from the negative difference
- SDValue Add = Select.getOperand(2);
- if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
- return SDValue();
- if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
- !AddConst || AddConst->getZExtValue() != EltSize - 1)
- return SDValue();
- } else {
- // When masking i16 elements, this positive value comes from adding the
- // difference's sign bit to the difference itself. This is equivalent to
- // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff)
- SDValue Add = DiffDiv.getOperand(0);
- if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
- return SDValue();
- // A logical right shift by 63 extracts the sign bit from the difference
- SDValue Shift = Add.getOperand(1);
- if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
- return SDValue();
- if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
- !ShiftConst || ShiftConst->getZExtValue() != 63)
- return SDValue();
- }
- } else if (LaneMask.getOperand(2) != Diff)
+ if (NonAbsDiff.getOpcode() != ISD::SUB ||
+ NonAbsDiff.getValueType() != MVT::i64)
return SDValue();
+ if (EltSize == 1) {
+ // When the element size is 1, the division is omitted, so the lane mask
+ // just uses the raw difference between the pointers.
+ if (LaneMask.getOperand(2) != Diff)
+ return SDValue();
+ } else if (EltSize == 2) {
+ // When masking i16 elements, this positive value comes from adding the
+ // difference's sign bit to the difference itself. This is equivalent to
+ // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff)
+ SDValue Add = DiffDiv.getOperand(0);
+ if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
+ return SDValue();
+ // A logical right shift by 63 extracts the sign bit from the difference
+ SDValue Shift = Add.getOperand(1);
+ if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
+ return SDValue();
+ if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ !ShiftConst || ShiftConst->getZExtValue() != 63)
+ return SDValue();
+ } else if (EltSize > 2) {
+ // When masking i32 or i64 elements, the positive value of the
+ // possibly-negative difference comes from a select of the difference if
+ // it's positive, otherwise the difference plus the element size minus one:
+ // pos_diff = diff < 0 ? (diff + (element_size - 1)) : diff
+ SDValue Select = DiffDiv.getOperand(0);
+ SDValue SelectOp3 = Select.getOperand(3);
+ // Check for an abs in the case of a read-after-write
+ if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS)
+ SelectOp3 = SelectOp3.getOperand(0);
+
+ // Make sure the difference is being compared by the select
+ if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff)
+ return SDValue();
+ // Make sure it's checking if the difference is less than 0
+ if (!isNullConstant(Select.getOperand(1)) ||
+ cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
+ ISD::CondCode::SETLT)
+ return SDValue();
+ // An add creates a positive value from the negative difference
+ SDValue Add = Select.getOperand(2);
+ if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
+ return SDValue();
+ if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ !AddConst || AddConst->getZExtValue() != EltSize - 1)
+ return SDValue();
+ }
SDValue StorePtr = NonAbsDiff.getOperand(0);
SDValue ReadPtr = NonAbsDiff.getOperand(1);
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index ec59e42feb6a4f..0bdb4b726731ff 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -11,7 +11,7 @@ define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-LABEL: whilewr_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: cset w9, lt
; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
@@ -22,7 +22,7 @@ entry:
%c14 = ptrtoint ptr %c to i64
%b15 = ptrtoint ptr %b to i64
%sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -33,15 +33,15 @@ entry:
define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilerw_8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.b, x2, x1
+; CHECK-NEXT: whilerw p0.b, x1, x2
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilerw_8:
; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: cset w9, eq
; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
@@ -50,9 +50,9 @@ define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
entry:
%b24 = ptrtoint ptr %b to i64
%c25 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %c25, %b24
+ %sub.diff = sub i64 %b24, %c25
%0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
- %neg.compare = icmp slt i64 %0, 0
+ %neg.compare = icmp eq i64 %0, 0
%.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %0)
@@ -69,7 +69,7 @@ define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i
; CHECK-NOSVE2-LABEL: whilewr_commutative:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: cset w9, lt
; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
@@ -80,7 +80,7 @@ entry:
%c14 = ptrtoint ptr %c to i64
%b15 = ptrtoint ptr %b to i64
%sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -97,7 +97,7 @@ define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-LABEL: whilewr_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmn x8, #1
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
; CHECK-NOSVE2-NEXT: cset w9, lt
; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
@@ -111,7 +111,7 @@ entry:
%c15 = ptrtoint ptr %c to i64
%sub.diff = sub i64 %b14, %c15
%diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
@@ -122,16 +122,16 @@ entry:
define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilerw_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.h, x2, x1
+; CHECK-NEXT: whilerw p0.h, x1, x2
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilerw_16:
; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
-; CHECK-NOSVE2-NEXT: cmn x8, #1
+; CHECK-NOSVE2-NEXT: cmp x8, #0
; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: cset w9, eq
; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
; CHECK-NOSVE2-NEXT: asr x8, x8, #1
; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
@@ -141,10 +141,10 @@ define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
entry:
%b24 = ptrtoint ptr %b to i64
%c25 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %c25, %b24
+ %sub.diff = sub i64 %b24, %c25
%0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
%diff = sdiv i64 %0, 2
- %neg.compare = icmp slt i64 %0, -1
+ %neg.compare = icmp eq i64 %0, 0
%.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
@@ -164,7 +164,7 @@ define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x9, x8, #3
; CHECK-NOSVE2-NEXT: cmp x8, #0
; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #3
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: cset w8, lt
; CHECK-NOSVE2-NEXT: asr x9, x9, #2
; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
@@ -177,7 +177,7 @@ entry:
%c13 = ptrtoint ptr %c to i64
%sub.diff = sub i64 %b12, %c13
%diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
@@ -188,31 +188,30 @@ entry:
define <vscale x 4 x i1> @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilerw_32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.s, x2, x1
+; CHECK-NEXT: whilerw p0.s, x1, x2
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilerw_32:
; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
-; CHECK-NOSVE2-NEXT: add x9, x8, #3
; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #3
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #2
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NOSVE2-NEXT: add x9, x8, #3
+; CHECK-NOSVE2-NEXT: cset w10, eq
+; CHECK-NOSVE2-NEXT: csel x8, x9, x8, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #2
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8
; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT: ret
entry:
%b24 = ptrtoint ptr %b to i64
%c25 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %c25, %b24
+ %sub.diff = sub i64 %b24, %c25
%0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
%diff = sdiv i64 %0, 4
- %neg.compare = icmp slt i64 %0, -3
+ %neg.compare = icmp eq i64 %0, 0
%.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
@@ -232,7 +231,7 @@ define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x9, x8, #7
; CHECK-NOSVE2-NEXT: cmp x8, #0
; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #7
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: cset w8, lt
; CHECK-NOSVE2-NEXT: asr x9, x9, #3
; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
@@ -245,7 +244,7 @@ entry:
%c13 = ptrtoint ptr %c to i64
%sub.diff = sub i64 %b12, %c13
%diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
@@ -256,31 +255,30 @@ entry:
define <vscale x 2 x i1> @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilerw_64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.d, x2, x1
+; CHECK-NEXT: whilerw p0.d, x1, x2
; CHECK-NEXT: ret
;
; CHECK-NOSVE2-LABEL: whilerw_64:
; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x2, x1
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
-; CHECK-NOSVE2-NEXT: add x9, x8, #7
; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #7
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #3
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NOSVE2-NEXT: add x9, x8, #7
+; CHECK-NOSVE2-NEXT: cset w10, eq
+; CHECK-NOSVE2-NEXT: csel x8, x9, x8, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #3
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NOSVE2-NEXT: ret
entry:
%b24 = ptrtoint ptr %b to i64
%c25 = ptrtoint ptr %c to i64
- %sub.diff = sub i64 %c25, %b24
+ %sub.diff = sub i64 %b24, %c25
%0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
%diff = sdiv i64 %0, 8
- %neg.compare = icmp slt i64 %0, -7
+ %neg.compare = icmp eq i64 %0, 0
%.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
@@ -297,7 +295,7 @@ define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: add x9, x8, #15
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: csel x9, x9, x8, lt
-; CHECK-NEXT: cmn x8, #15
+; CHECK-NEXT: cmp x8, #1
; CHECK-NEXT: asr x9, x9, #4
; CHECK-NEXT: cset w8, lt
; CHECK-NEXT: sbfx x8, x8, #0, #1
@@ -317,7 +315,7 @@ define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NOSVE2-NEXT: add x9, x8, #15
; CHECK-NOSVE2-NEXT: cmp x8, #0
; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #15
+; CHECK-NOSVE2-NEXT: cmp x8, #1
; CHECK-NOSVE2-NEXT: asr x9, x9, #4
; CHECK-NOSVE2-NEXT: cset w8, lt
; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
@@ -333,7 +331,7 @@ entry:
%c13 = ptrtoint ptr %c to i64
%sub.diff = sub i64 %b12, %c13
%diff = sdiv i64 %sub.diff, 16
- %neg.compare = icmp slt i64 %sub.diff, -15
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
@@ -373,7 +371,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x1, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
@@ -404,7 +402,7 @@ for.body.preheader:
%b15 = ptrtoint ptr %b to i64
%wide.trip.count = zext nneg i32 %n to i64
%sub.diff = sub i64 %b15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
@@ -442,18 +440,18 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: b.lt .LBB11_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.h, x1, x2
+; CHECK-NEXT: whilewr p0.h, x1, x2
; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: whilelo p1.h, xzr, x8
; CHECK-NEXT: .LBB11_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
+; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1]
; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
+; CHECK-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1]
; CHECK-NEXT: inch x9
-; CHECK-NEXT: whilelo p0.h, x9, x8
+; CHECK-NEXT: whilelo p1.h, x9, x8
; CHECK-NEXT: b.mi .LBB11_2
; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -467,7 +465,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: sub x10, x1, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: cmp x10, #1
; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
; CHECK-NOSVE2-NEXT: cset w11, lt
; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
@@ -476,11 +474,11 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
; CHECK-NOSVE2-NEXT: cnth x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p2/z, [x0, x8, lsl #1]
+; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p2/z, [x1, x8, lsl #1]
; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
@@ -501,21 +499,21 @@ for.body.preheader:
%active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
%sub.diff = sub i64 %b14, %c15
%diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
br label %vector.body
vector.body:
%index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask
%3 = getelementptr inbounds i16, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %2, <vscale x 8 x i16> poison)
%4 = getelementptr inbounds i16, ptr %b, i64 %index
- %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+ %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %2, <vscale x 8 x i16> poison)
%5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
%6 = getelementptr inbounds i16, ptr %c, i64 %index
tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
@@ -535,18 +533,18 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: b.lt .LBB12_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.s, x1, x2
+; CHECK-NEXT: whilewr p0.s, x1, x2
; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: whilelo p1.s, xzr, x8
; CHECK-NEXT: .LBB12_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
+; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2]
; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
+; CHECK-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2]
; CHECK-NEXT: incw x9
-; CHECK-NEXT: whilelo p0.s, x9, x8
+; CHECK-NEXT: whilelo p1.s, x9, x8
; CHECK-NEXT: b.mi .LBB12_2
; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -563,7 +561,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x11, x10, #3
; CHECK-NOSVE2-NEXT: cmp x10, #0
; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #3
+; CHECK-NOSVE2-NEXT: cmp x10, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: asr x11, x11, #2
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
@@ -571,11 +569,11 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
; CHECK-NOSVE2-NEXT: cntw x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p2/z, [x0, x8, lsl #2]
+; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p2/z, [x1, x8, lsl #2]
; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
@@ -596,21 +594,21 @@ for.body.preheader:
%active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
%sub.diff = sub i64 %b12, %c13
%diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
br label %vector.body
vector.body:
%index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask
%3 = getelementptr inbounds i32, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %2, <vscale x 4 x i32> poison)
%4 = getelementptr inbounds i32, ptr %b, i64 %index
- %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+ %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %2, <vscale x 4 x i32> poison)
%5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
%6 = getelementptr inbounds i32, ptr %c, i64 %index
tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
@@ -630,18 +628,18 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: b.lt .LBB13_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.d, x1, x2
+; CHECK-NEXT: whilewr p0.d, x1, x2
; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: .LBB13_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
+; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3]
+; CHECK-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3]
; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3]
; CHECK-NEXT: incd x9
-; CHECK-NEXT: whilelo p0.d, x9, x8
+; CHECK-NEXT: whilelo p1.d, x9, x8
; CHECK-NEXT: b.mi .LBB13_2
; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -658,7 +656,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x11, x10, #7
; CHECK-NOSVE2-NEXT: cmp x10, #0
; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #7
+; CHECK-NOSVE2-NEXT: cmp x10, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: asr x11, x11, #3
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
@@ -666,11 +664,11 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
; CHECK-NOSVE2-NEXT: cntd x10
; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body
; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b
+; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p2/z, [x0, x8, lsl #3]
+; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p2/z, [x1, x8, lsl #3]
; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
; CHECK-NOSVE2-NEXT: add x8, x8, x10
@@ -691,21 +689,21 @@ for.body.preheader:
%active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
%sub.diff = sub i64 %b12, %c13
%diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
- %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
br label %vector.body
vector.body:
%index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
- %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+ %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask
%3 = getelementptr inbounds i64, ptr %a, i64 %index
- %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %2, <vscale x 2 x i64> poison)
%4 = getelementptr inbounds i64, ptr %b, i64 %index
- %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+ %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %2, <vscale x 2 x i64> poison)
%5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
%6 = getelementptr inbounds i64, ptr %c, i64 %index
tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
@@ -752,13 +750,13 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
; CHECK-NOSVE2-NEXT: sub x9, x1, x2
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10
-; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9
; CHECK-NOSVE2-NEXT: mov w9, w3
@@ -792,13 +790,13 @@ for.body.preheader:
%b16 = ptrtoint ptr %b to i64
%wide.trip.count = zext nneg i32 %n to i64
%sub.diff = sub i64 %a15, %c14
- %neg.compare = icmp slt i64 %sub.diff, 0
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
%active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
%sub.diff18 = sub i64 %b16, %c14
- %neg.compare20 = icmp slt i64 %sub.diff18, 0
+ %neg.compare20 = icmp sle i64 %sub.diff18, 0
%.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
%.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
%ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
@@ -864,7 +862,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NOSVE2-NEXT: sub x9, x0, x2
; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmn x9, #1
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
@@ -873,7 +871,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: sub x10, x1, x2
; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT: cmn x10, #1
+; CHECK-NOSVE2-NEXT: cmp x10, #1
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: asr x9, x9, #1
; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
@@ -909,14 +907,14 @@ for.body.preheader:
%wide.trip.count = zext nneg i32 %n to i64
%sub.diff = sub i64 %a15, %c14
%diff = sdiv i64 %sub.diff, 2
- %neg.compare = icmp slt i64 %sub.diff, -1
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
%sub.diff18 = sub i64 %b16, %c14
%diff19 = sdiv i64 %sub.diff18, 2
- %neg.compare20 = icmp slt i64 %sub.diff18, -1
+ %neg.compare20 = icmp sle i64 %sub.diff18, 0
%.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
%.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
%ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
@@ -985,7 +983,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x10, x9, #3
; CHECK-NOSVE2-NEXT: cmp x9, #0
; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: asr x9, x10, #2
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
@@ -995,7 +993,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x10, x9, #3
; CHECK-NOSVE2-NEXT: cmp x9, #0
; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT: cset w9, lt
; CHECK-NOSVE2-NEXT: asr x10, x10, #2
@@ -1031,14 +1029,14 @@ for.body.preheader:
%wide.trip.count = zext nneg i32 %n to i64
%sub.diff = sub i64 %a13, %c12
%diff = sdiv i64 %sub.diff, 4
- %neg.compare = icmp slt i64 %sub.diff, -3
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
%sub.diff16 = sub i64 %b14, %c12
%diff17 = sdiv i64 %sub.diff16, 4
- %neg.compare18 = icmp slt i64 %sub.diff16, -3
+ %neg.compare18 = icmp sle i64 %sub.diff16, 0
%.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
%.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
%ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
@@ -1107,7 +1105,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x10, x9, #7
; CHECK-NOSVE2-NEXT: cmp x9, #0
; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: asr x9, x10, #3
; CHECK-NOSVE2-NEXT: cset w10, lt
; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
@@ -1117,7 +1115,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NOSVE2-NEXT: add x10, x9, #7
; CHECK-NOSVE2-NEXT: cmp x9, #0
; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
+; CHECK-NOSVE2-NEXT: cmp x9, #1
; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NOSVE2-NEXT: cset w9, lt
; CHECK-NOSVE2-NEXT: asr x10, x10, #3
@@ -1153,14 +1151,14 @@ for.body.preheader:
%wide.trip.count = zext nneg i32 %n to i64
%sub.diff = sub i64 %a13, %c12
%diff = sdiv i64 %sub.diff, 8
- %neg.compare = icmp slt i64 %sub.diff, -7
+ %neg.compare = icmp sle i64 %sub.diff, 0
%.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
%.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
%ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
%active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
%sub.diff16 = sub i64 %b14, %c12
%diff17 = sdiv i64 %sub.diff16, 8
- %neg.compare18 = icmp slt i64 %sub.diff16, -7
+ %neg.compare18 = icmp sle i64 %sub.diff16, 0
%.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
%.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
%ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
>From 7844e574d2338f3c55460e9d171b48c934556cdc Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Nov 2024 15:49:55 +0000
Subject: [PATCH 3/4] Check mask size
---
.../Target/AArch64/AArch64ISelLowering.cpp | 22 ++
llvm/test/CodeGen/AArch64/whilewr.ll | 352 +++++++++++++++++-
2 files changed, 373 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index edf86fd7f806f3..707c29cea59f88 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14164,6 +14164,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget &Subtarget) {
if (!Subtarget.hasSVE2())
return SDValue();
+ unsigned MaskNumElements = Op.getValueType().getVectorMinNumElements();
SDValue LaneMask = Op.getOperand(0);
SDValue Splat = Op.getOperand(1);
@@ -14192,6 +14193,27 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
EltSize = 1 << DiffDivConst->getZExtValue();
}
+ switch (EltSize) {
+ case 1:
+ if (MaskNumElements != 16)
+ return SDValue();
+ break;
+ case 2:
+ if (MaskNumElements != 8)
+ return SDValue();
+ break;
+ case 4:
+ if (MaskNumElements != 4)
+ return SDValue();
+ break;
+ case 8:
+ if (MaskNumElements != 2)
+ return SDValue();
+ break;
+ default:
+ return SDValue();
+ }
+
SDValue Diff = Cmp.getOperand(0);
SDValue NonAbsDiff = Diff;
bool WriteAfterRead = true;
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 0bdb4b726731ff..a67d5920092f6c 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK,CHECK-NOSVE2
define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-LABEL: whilewr_8:
@@ -1190,6 +1190,356 @@ for.cond.cleanup:
ret void
}
+define <vscale x 8 x i1> @no_whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilewr_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub x8, x1, x2
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilewr_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %sub.diff = sub i64 %b15, %c14
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %sub.diff)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 4 x i1> @no_whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilewr_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub x8, x1, x2
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: asr x8, x8, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NEXT: whilelo p1.s, xzr, x8
+; CHECK-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilewr_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %diff = sdiv i64 %sub.diff, 2
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 2 x i1> @no_whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilewr_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT: sub x9, x1, x2
+; CHECK-NEXT: movk x8, #21846
+; CHECK-NEXT: cmp x9, #1
+; CHECK-NEXT: smulh x8, x9, x8
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilewr_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NOSVE2-NEXT: sub x9, x1, x2
+; CHECK-NOSVE2-NEXT: movk x8, #21846
+; CHECK-NOSVE2-NEXT: cmp x9, #1
+; CHECK-NOSVE2-NEXT: smulh x8, x9, x8
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %diff = sdiv i64 %sub.diff, 3
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 2 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 1 x i1> @no_whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilewr_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub x8, x1, x2
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x9, x8, #3
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel x9, x9, x8, lt
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: asr x9, x9, #2
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilewr_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
+; CHECK-NOSVE2-NEXT: ptrue p0.d
+; CHECK-NOSVE2-NEXT: add x9, x8, #3
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: asr x9, x9, #2
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: mov z1.d, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
+; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %diff = sdiv i64 %sub.diff, 4
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 1 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 8 x i1> @no_whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilerw_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: subs x8, x1, x2
+; CHECK-NEXT: cneg x8, x8, mi
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilerw_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
+; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %c14 = ptrtoint ptr %c to i64
+ %b15 = ptrtoint ptr %b to i64
+ %sub.diff = sub i64 %b15, %c14
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %neg.compare = icmp sle i64 %0, 0
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %0)
+ %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 4 x i1> @no_whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilerw_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: subs x8, x1, x2
+; CHECK-NEXT: cneg x9, x8, mi
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: add x8, x9, x9, lsr #63
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: asr x8, x8, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NEXT: whilelo p1.s, xzr, x8
+; CHECK-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilerw_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
+; CHECK-NOSVE2-NEXT: cneg x9, x8, mi
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: add x8, x9, x9, lsr #63
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 2
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 2 x i1> @no_whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilerw_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT: subs x9, x1, x2
+; CHECK-NEXT: movk x8, #21846
+; CHECK-NEXT: cneg x10, x9, mi
+; CHECK-NEXT: cmp x9, #1
+; CHECK-NEXT: smulh x8, x10, x8
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilerw_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NOSVE2-NEXT: subs x9, x1, x2
+; CHECK-NOSVE2-NEXT: movk x8, #21846
+; CHECK-NOSVE2-NEXT: cneg x10, x9, mi
+; CHECK-NOSVE2-NEXT: cmp x9, #1
+; CHECK-NOSVE2-NEXT: smulh x8, x10, x8
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 3
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 2 x i1> %active.lane.mask.alias
+}
+
+define <vscale x 1 x i1> @no_whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: no_whilerw_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: subs x8, x1, x2
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cneg x9, x8, mi
+; CHECK-NEXT: add x10, x9, #3
+; CHECK-NEXT: cmp x9, #0
+; CHECK-NEXT: csel x9, x10, x9, lt
+; CHECK-NEXT: cmp x8, #1
+; CHECK-NEXT: asr x9, x9, #2
+; CHECK-NEXT: cset w8, lt
+; CHECK-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NEXT: mov z1.d, x9
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: no_whilerw_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: subs x8, x1, x2
+; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
+; CHECK-NOSVE2-NEXT: ptrue p0.d
+; CHECK-NOSVE2-NEXT: cneg x9, x8, mi
+; CHECK-NOSVE2-NEXT: add x10, x9, #3
+; CHECK-NOSVE2-NEXT: cmp x9, #0
+; CHECK-NOSVE2-NEXT: csel x9, x10, x9, lt
+; CHECK-NOSVE2-NEXT: cmp x8, #1
+; CHECK-NOSVE2-NEXT: asr x9, x9, #2
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: mov z1.d, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
+; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
+; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+ %b14 = ptrtoint ptr %b to i64
+ %c15 = ptrtoint ptr %c to i64
+ %sub.diff = sub i64 %b14, %c15
+ %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false)
+ %diff = sdiv i64 %0, 4
+ %neg.compare = icmp sle i64 %sub.diff, 0
+ %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
+ %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
+ %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
+ ret <vscale x 1 x i1> %active.lane.mask.alias
+}
+
declare i64 @llvm.vscale.i64()
declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
>From 41c2bcd48b15ed196bb482a48824a8b427074b79 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 5 Nov 2024 15:50:39 +0000
Subject: [PATCH 4/4] Combine identical CHECK statements
---
llvm/test/CodeGen/AArch64/whilewr.ll | 624 +++++++++++----------------
1 file changed, 241 insertions(+), 383 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index a67d5920092f6c..006c1c180ac45a 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK,CHECK-NOSVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2
+; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOSVE2
define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
@@ -31,11 +31,11 @@ entry:
}
define <vscale x 16 x i1> @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.b, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilerw_8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilerw p0.b, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilerw_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: subs x8, x1, x2
@@ -61,11 +61,11 @@ entry:
}
define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_commutative:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_commutative:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_commutative:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
@@ -89,11 +89,11 @@ entry:
}
define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.h, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilewr p0.h, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
@@ -120,11 +120,11 @@ entry:
}
define <vscale x 8 x i1> @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.h, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilerw_16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilerw p0.h, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilerw_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: subs x8, x1, x2
@@ -153,11 +153,11 @@ entry:
}
define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.s, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilewr p0.s, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
@@ -186,11 +186,11 @@ entry:
}
define <vscale x 4 x i1> @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.s, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilerw_32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilerw p0.s, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilerw_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: subs x8, x1, x2
@@ -220,11 +220,11 @@ entry:
}
define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.d, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_64:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilewr p0.d, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: sub x8, x1, x2
@@ -253,11 +253,11 @@ entry:
}
define <vscale x 2 x i1> @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilerw_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilerw p0.d, x1, x2
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilerw_64:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: whilerw p0.d, x1, x2
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilerw_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: subs x8, x1, x2
@@ -306,26 +306,6 @@ define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_128:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT: ptrue p0.d
-; CHECK-NOSVE2-NEXT: add x9, x8, #15
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: asr x9, x9, #4
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: mov z1.d, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b12 = ptrtoint ptr %b to i64
%c13 = ptrtoint ptr %c to i64
@@ -340,30 +320,30 @@ entry:
}
define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB10_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB10_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB10_2
-; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB10_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2
+; CHECK-SVE2-NEXT: mov w9, w3
+; CHECK-SVE2-NEXT: mov x8, xzr
+; CHECK-SVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-SVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-SVE2-NEXT: and x10, x10, #0xff
+; CHECK-SVE2-NEXT: .LBB10_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-SVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-SVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-SVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-SVE2-NEXT: add x8, x8, x10
+; CHECK-SVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-SVE2-NEXT: b.mi .LBB10_2
+; CHECK-SVE2-NEXT: .LBB10_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -434,28 +414,28 @@ for.cond.cleanup:
}
define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB11_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p0.h, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NEXT: .LBB11_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1]
-; CHECK-NEXT: inch x9
-; CHECK-NEXT: whilelo p1.h, x9, x8
-; CHECK-NEXT: b.mi .LBB11_2
-; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB11_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: mov w8, w3
+; CHECK-SVE2-NEXT: whilewr p0.h, x1, x2
+; CHECK-SVE2-NEXT: mov x9, xzr
+; CHECK-SVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-SVE2-NEXT: .LBB11_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1]
+; CHECK-SVE2-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1]
+; CHECK-SVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE2-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1]
+; CHECK-SVE2-NEXT: inch x9
+; CHECK-SVE2-NEXT: whilelo p1.h, x9, x8
+; CHECK-SVE2-NEXT: b.mi .LBB11_2
+; CHECK-SVE2-NEXT: .LBB11_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -527,28 +507,28 @@ for.cond.cleanup:
}
define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB12_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p0.s, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p1.s, xzr, x8
-; CHECK-NEXT: .LBB12_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2]
-; CHECK-NEXT: incw x9
-; CHECK-NEXT: whilelo p1.s, x9, x8
-; CHECK-NEXT: b.mi .LBB12_2
-; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB12_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: mov w8, w3
+; CHECK-SVE2-NEXT: whilewr p0.s, x1, x2
+; CHECK-SVE2-NEXT: mov x9, xzr
+; CHECK-SVE2-NEXT: whilelo p1.s, xzr, x8
+; CHECK-SVE2-NEXT: .LBB12_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2]
+; CHECK-SVE2-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2]
+; CHECK-SVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE2-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2]
+; CHECK-SVE2-NEXT: incw x9
+; CHECK-SVE2-NEXT: whilelo p1.s, x9, x8
+; CHECK-SVE2-NEXT: b.mi .LBB12_2
+; CHECK-SVE2-NEXT: .LBB12_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -622,28 +602,28 @@ for.cond.cleanup:
}
define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB13_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p0.d, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: .LBB13_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3]
-; CHECK-NEXT: incd x9
-; CHECK-NEXT: whilelo p1.d, x9, x8
-; CHECK-NEXT: b.mi .LBB13_2
-; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_64:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB13_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: mov w8, w3
+; CHECK-SVE2-NEXT: whilewr p0.d, x1, x2
+; CHECK-SVE2-NEXT: mov x9, xzr
+; CHECK-SVE2-NEXT: whilelo p1.d, xzr, x8
+; CHECK-SVE2-NEXT: .LBB13_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3]
+; CHECK-SVE2-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3]
+; CHECK-SVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE2-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3]
+; CHECK-SVE2-NEXT: incd x9
+; CHECK-SVE2-NEXT: whilelo p1.d, x9, x8
+; CHECK-SVE2-NEXT: b.mi .LBB13_2
+; CHECK-SVE2-NEXT: .LBB13_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -717,32 +697,32 @@ for.cond.cleanup:
}
define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB14_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.b, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB14_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB14_2
-; CHECK-NEXT: .LBB14_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB14_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: whilewr p0.b, x0, x2
+; CHECK-SVE2-NEXT: mov w9, w3
+; CHECK-SVE2-NEXT: mov x8, xzr
+; CHECK-SVE2-NEXT: whilewr p1.b, x1, x2
+; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: whilelo p1.b, xzr, x9
+; CHECK-SVE2-NEXT: cntp x10, p0, p0.b
+; CHECK-SVE2-NEXT: and x10, x10, #0xff
+; CHECK-SVE2-NEXT: .LBB14_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
+; CHECK-SVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
+; CHECK-SVE2-NEXT: add z0.b, z1.b, z0.b
+; CHECK-SVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
+; CHECK-SVE2-NEXT: add x8, x8, x10
+; CHECK-SVE2-NEXT: whilelo p1.b, x8, x9
+; CHECK-SVE2-NEXT: b.mi .LBB14_2
+; CHECK-SVE2-NEXT: .LBB14_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -829,32 +809,32 @@ for.cond.cleanup:
}
define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB15_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.h, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.h
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB15_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.h, x8, x9
-; CHECK-NEXT: b.mi .LBB15_2
-; CHECK-NEXT: .LBB15_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB15_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: whilewr p0.h, x0, x2
+; CHECK-SVE2-NEXT: mov w9, w3
+; CHECK-SVE2-NEXT: mov x8, xzr
+; CHECK-SVE2-NEXT: whilewr p1.h, x1, x2
+; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: whilelo p1.h, xzr, x9
+; CHECK-SVE2-NEXT: cntp x10, p0, p0.h
+; CHECK-SVE2-NEXT: and x10, x10, #0xff
+; CHECK-SVE2-NEXT: .LBB15_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
+; CHECK-SVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
+; CHECK-SVE2-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
+; CHECK-SVE2-NEXT: add x8, x8, x10
+; CHECK-SVE2-NEXT: whilelo p1.h, x8, x9
+; CHECK-SVE2-NEXT: b.mi .LBB15_2
+; CHECK-SVE2-NEXT: .LBB15_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -947,32 +927,32 @@ for.cond.cleanup:
}
define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB16_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.s, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.s, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.s
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB16_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.s, x8, x9
-; CHECK-NEXT: b.mi .LBB16_2
-; CHECK-NEXT: .LBB16_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB16_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: whilewr p0.s, x0, x2
+; CHECK-SVE2-NEXT: mov w9, w3
+; CHECK-SVE2-NEXT: mov x8, xzr
+; CHECK-SVE2-NEXT: whilewr p1.s, x1, x2
+; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-SVE2-NEXT: cntp x10, p0, p0.s
+; CHECK-SVE2-NEXT: and x10, x10, #0xff
+; CHECK-SVE2-NEXT: .LBB16_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
+; CHECK-SVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-SVE2-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
+; CHECK-SVE2-NEXT: add x8, x8, x10
+; CHECK-SVE2-NEXT: whilelo p1.s, x8, x9
+; CHECK-SVE2-NEXT: b.mi .LBB16_2
+; CHECK-SVE2-NEXT: .LBB16_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -1069,32 +1049,32 @@ for.cond.cleanup:
}
define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB17_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.d, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.d, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.d
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB17_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.d, x8, x9
-; CHECK-NEXT: b.mi .LBB17_2
-; CHECK-NEXT: .LBB17_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
+; CHECK-SVE2-LABEL: whilewr_loop_multiple_64:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: cmp w3, #1
+; CHECK-SVE2-NEXT: b.lt .LBB17_3
+; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-SVE2-NEXT: whilewr p0.d, x0, x2
+; CHECK-SVE2-NEXT: mov w9, w3
+; CHECK-SVE2-NEXT: mov x8, xzr
+; CHECK-SVE2-NEXT: whilewr p1.d, x1, x2
+; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
+; CHECK-SVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-SVE2-NEXT: cntp x10, p0, p0.d
+; CHECK-SVE2-NEXT: and x10, x10, #0xff
+; CHECK-SVE2-NEXT: .LBB17_2: // %vector.body
+; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
+; CHECK-SVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
+; CHECK-SVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
+; CHECK-SVE2-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
+; CHECK-SVE2-NEXT: add x8, x8, x10
+; CHECK-SVE2-NEXT: whilelo p1.d, x8, x9
+; CHECK-SVE2-NEXT: b.mi .LBB17_2
+; CHECK-SVE2-NEXT: .LBB17_3: // %for.cond.cleanup
+; CHECK-SVE2-NEXT: ret
+
; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
; CHECK-NOSVE2: // %bb.0: // %entry
; CHECK-NOSVE2-NEXT: cmp w3, #1
@@ -1201,17 +1181,6 @@ define <vscale x 8 x i1> @no_whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.h, xzr, x8
; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%c14 = ptrtoint ptr %c to i64
%b15 = ptrtoint ptr %b to i64
@@ -1237,19 +1206,6 @@ define <vscale x 4 x i1> @no_whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: whilelo p1.s, xzr, x8
; CHECK-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: asr x8, x8, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
@@ -1278,21 +1234,6 @@ define <vscale x 2 x i1> @no_whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: movk x8, #21846
-; CHECK-NOSVE2-NEXT: cmp x9, #1
-; CHECK-NOSVE2-NEXT: smulh x8, x9, x8
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
@@ -1326,26 +1267,6 @@ define <vscale x 1 x i1> @no_whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT: ptrue p0.d
-; CHECK-NOSVE2-NEXT: add x9, x8, #3
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: asr x9, x9, #2
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: mov z1.d, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
@@ -1371,18 +1292,6 @@ define <vscale x 8 x i1> @no_whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
; CHECK-NEXT: whilelo p1.h, xzr, x8
; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilerw_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x1, x2
-; CHECK-NOSVE2-NEXT: cneg x8, x8, mi
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%c14 = ptrtoint ptr %c to i64
%b15 = ptrtoint ptr %b to i64
@@ -1410,20 +1319,6 @@ define <vscale x 4 x i1> @no_whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: whilelo p1.s, xzr, x8
; CHECK-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilerw_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x1, x2
-; CHECK-NOSVE2-NEXT: cneg x9, x8, mi
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: add x8, x9, x9, lsr #63
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: asr x8, x8, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
@@ -1454,22 +1349,6 @@ define <vscale x 2 x i1> @no_whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: whilelo p1.d, xzr, x8
; CHECK-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilerw_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
-; CHECK-NOSVE2-NEXT: subs x9, x1, x2
-; CHECK-NOSVE2-NEXT: movk x8, #21846
-; CHECK-NOSVE2-NEXT: cneg x10, x9, mi
-; CHECK-NOSVE2-NEXT: cmp x9, #1
-; CHECK-NOSVE2-NEXT: smulh x8, x10, x8
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
@@ -1505,27 +1384,6 @@ define <vscale x 1 x i1> @no_whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n)
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilerw_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: subs x8, x1, x2
-; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT: ptrue p0.d
-; CHECK-NOSVE2-NEXT: cneg x9, x8, mi
-; CHECK-NOSVE2-NEXT: add x10, x9, #3
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x9, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmp x8, #1
-; CHECK-NOSVE2-NEXT: asr x9, x9, #2
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: mov z1.d, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
entry:
%b14 = ptrtoint ptr %b to i64
%c15 = ptrtoint ptr %c to i64
More information about the llvm-commits mailing list