[llvm] [SelectionDAG] Fix unsafe cases for loop.dependence.{war/raw}.mask (PR #168565)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 08:47:22 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sam Tebbs (SamTebbs33)
<details>
<summary>Changes</summary>
There is an unsafe case with the loop dependence mask intrinsics where the difference between the two pointers is less than or equal to half the vector length, e.g. ptrA = 0 and ptrB = 3 when the vector length is 32. Currently that produces a correct low mask with 3 active lanes but an incorrect high mask with all lanes active. This PR adds a select on the high mask that guards against this case.
---
Patch is 42.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168565.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+19)
- (modified) llvm/test/CodeGen/AArch64/alias_mask.ll (+236-204)
- (modified) llvm/test/CodeGen/AArch64/alias_mask_scalable.ll (+170-118)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 71eeee78bd868..c8d66cc21244f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1692,6 +1692,18 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
}
+/// Split a loop dependence mask.
+/// This is done by creating a high and low mask, each of half the vector
+/// length. A select of the high mask and a predicate of all zeroes is needed to
+/// guarantee that the high mask is safe. A case where simply producing a high
+/// mask without the select is unsafe, is when the difference between the two
+/// pointers is less than half the vector length, e.g. ptrA = 0 and ptrB 3 when
+/// the vector length is 32.
+/// The full 32xi1 mask should be three active lanes and the rest inactive,
+/// however when half the vector length is added to ptrA to produce the high
+/// mask, the difference between ptrA and ptrB is now -13, which will result
+/// in a mask with all lanes active. The select will guard against this case
+/// by choosing a mask of all inactive lanes when ptrA + VL/2 >= ptrB.
void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc DL(N);
@@ -1708,7 +1720,14 @@ void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo,
: DAG.getConstant(Offset, DL, MVT::i64);
PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend);
+ EVT CmpVT = MVT::i1;
+ SDValue Cmp = DAG.getSetCC(DL, CmpVT, PtrA, PtrB, ISD::CondCode::SETUGE);
+ Cmp = DAG.getSplat(EVT::getVectorVT(*DAG.getContext(), CmpVT,
+ HiVT.getVectorMinNumElements(),
+ HiVT.isScalableVT()),
+ DL, Cmp);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2));
+ Hi = DAG.getSelect(DL, HiVT, Cmp, DAG.getConstant(0, DL, HiVT), Hi);
}
void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index fdd0a6a4709da..da14e17bf2463 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -101,26 +101,30 @@ define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add x9, x0, #16
-; CHECK-NEXT: whilewr p0.b, x0, x1
-; CHECK-NEXT: whilewr p1.b, x9, x1
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w10, hs
+; CHECK-NEXT: whilewr p0.b, x9, x1
; CHECK-NEXT: adrp x9, .LCPI8_0
-; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: dup v0.16b, w10
+; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: whilewr p0.b, x0, x1
+; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: shl v1.16b, v2.16b, #7
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0]
-; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: str h0, [x8]
-; CHECK-NEXT: str h1, [x8, #2]
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: str h1, [x8]
+; CHECK-NEXT: str h0, [x8, #2]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1)
@@ -131,46 +135,61 @@ define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_split2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add x9, x0, #48
-; CHECK-NEXT: whilewr p0.b, x0, x1
; CHECK-NEXT: add x10, x0, #16
-; CHECK-NEXT: whilewr p1.b, x9, x1
-; CHECK-NEXT: add x9, x0, #32
-; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: whilewr p0.b, x9, x1
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w9, hs
+; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: dup v0.16b, w9
+; CHECK-NEXT: add x9, x0, #32
+; CHECK-NEXT: cmp x9, x1
+; CHECK-NEXT: cset w11, hs
+; CHECK-NEXT: cmp x10, x1
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cset w12, hs
+; CHECK-NEXT: whilewr p1.b, x9, x1
+; CHECK-NEXT: whilewr p0.b, x10, x1
+; CHECK-NEXT: dup v2.16b, w11
+; CHECK-NEXT: dup v5.16b, w12
+; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: whilewr p1.b, x0, x1
; CHECK-NEXT: adrp x9, .LCPI9_0
+; CHECK-NEXT: cmge v0.16b, v0.16b, #0
+; CHECK-NEXT: mov z4.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: bic v3.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: whilewr p1.b, x10, x1
+; CHECK-NEXT: bic v4.16b, v4.16b, v5.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: shl v1.16b, v1.16b, #7
+; CHECK-NEXT: shl v2.16b, v3.16b, #7
+; CHECK-NEXT: shl v3.16b, v4.16b, #7
; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI9_0]
-; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
-; CHECK-NEXT: shl v2.16b, v2.16b, #7
-; CHECK-NEXT: shl v3.16b, v3.16b, #7
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmlt v2.16b, v2.16b, #0
; CHECK-NEXT: cmlt v3.16b, v3.16b, #0
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
; CHECK-NEXT: and v2.16b, v2.16b, v4.16b
; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT: ext v7.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: ext v7.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: zip1 v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: zip1 v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: zip1 v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: zip1 v3.16b, v3.16b, v6.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v7.16b
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: addv h2, v2.8h
; CHECK-NEXT: addv h3, v3.8h
-; CHECK-NEXT: str h0, [x8]
-; CHECK-NEXT: str h1, [x8, #6]
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: str h1, [x8]
; CHECK-NEXT: str h2, [x8, #4]
; CHECK-NEXT: str h3, [x8, #2]
+; CHECK-NEXT: str h0, [x8, #6]
; CHECK-NEXT: ret
entry:
%0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1)
@@ -227,69 +246,74 @@ entry:
define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_16_expand2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x9, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x10, x9, #32
-; CHECK-NEXT: add x9, x9, x9, lsr #63
+; CHECK-NEXT: add x9, x0, #32
+; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: subs x9, x1, x9
; CHECK-NEXT: add x10, x10, x10, lsr #63
-; CHECK-NEXT: asr x9, x9, #1
-; CHECK-NEXT: asr x10, x10, #1
+; CHECK-NEXT: add x11, x9, x9, lsr #63
+; CHECK-NEXT: asr x9, x10, #1
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x9
-; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z18.d, z0.d
+; CHECK-NEXT: asr x10, x11, #1
+; CHECK-NEXT: dup v3.2d, x9
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: dup v6.2d, x10
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT: add z18.d, z18.d, #14 // =0xe
+; CHECK-NEXT: add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT: cmhi v17.2d, v3.2d, v0.2d
+; CHECK-NEXT: cmhi v19.2d, v3.2d, v1.2d
+; CHECK-NEXT: cmhi v20.2d, v3.2d, v2.2d
+; CHECK-NEXT: cset w11, ls
+; CHECK-NEXT: cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT: cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT: cmhi v21.2d, v6.2d, v4.2d
+; CHECK-NEXT: cmhi v22.2d, v6.2d, v18.2d
+; CHECK-NEXT: cmhi v23.2d, v6.2d, v5.2d
+; CHECK-NEXT: cmhi v24.2d, v6.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d
+; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d
+; CHECK-NEXT: cmhi v18.2d, v3.2d, v18.2d
; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d
+; CHECK-NEXT: cmhi v7.2d, v3.2d, v7.2d
+; CHECK-NEXT: cmhi v3.2d, v3.2d, v16.2d
; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v22.4s
+; CHECK-NEXT: uzp1 v2.4s, v21.4s, v2.4s
; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s
+; CHECK-NEXT: uzp1 v16.4s, v24.4s, v23.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v6.4s
; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h
-; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h
-; CHECK-NEXT: dup v2.16b, w9
+; CHECK-NEXT: uzp1 v6.4s, v19.4s, v18.4s
+; CHECK-NEXT: uzp1 v4.4s, v4.4s, v20.4s
+; CHECK-NEXT: uzp1 v5.4s, v7.4s, v5.4s
+; CHECK-NEXT: uzp1 v3.4s, v17.4s, v3.4s
+; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v16.8h
+; CHECK-NEXT: uzp1 v2.8h, v4.8h, v6.8h
+; CHECK-NEXT: uzp1 v3.8h, v3.8h, v5.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w10
+; CHECK-NEXT: uzp1 v2.16b, v3.16b, v2.16b
+; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI11_0
-; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: dup v3.16b, w10
-; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w11
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shl v1.16b, v2.16b, #7
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0]
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
@@ -393,85 +417,89 @@ entry:
define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_32_expand3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: add x9, x0, #64
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x9, x10, #61
-; CHECK-NEXT: subs x11, x10, #64
-; CHECK-NEXT: add x12, x10, #3
-; CHECK-NEXT: csel x9, x9, x11, mi
+; CHECK-NEXT: subs x9, x1, x9
+; CHECK-NEXT: add x10, x9, #3
+; CHECK-NEXT: csel x9, x10, x9, mi
; CHECK-NEXT: asr x11, x9, #2
+; CHECK-NEXT: cset w9, ls
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: cmp x11, #1
; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: csel x10, x12, x10, mi
-; CHECK-NEXT: dup v7.2d, x11
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z17.d, z0.d
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: subs x12, x1, x0
+; CHECK-NEXT: add x13, x12, #3
+; CHECK-NEXT: dup v5.2d, x11
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: asr x10, x10, #2
+; CHECK-NEXT: csel x12, x13, x12, mi
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: dup v16.2d, x10
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
-; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
-; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
-; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
-; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v16.2d, v6.2d
-; CHECK-NEXT: cmhi v5.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
-; CHECK-NEXT: cmhi v6.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v0.2d, v7.2d, v0.2d
-; CHECK-NEXT: uzp1 v7.4s, v21.4s, v20.4s
+; CHECK-NEXT: add z7.d, z7.d, #4 // =0x4
+; CHECK-NEXT: add z17.d, z17.d, #14 // =0xe
+; CHECK-NEXT: add z16.d, z16.d, #2 // =0x2
+; CHECK-NEXT: asr x12, x12, #2
+; CHECK-NEXT: cmhi v18.2d, v5.2d, v0.2d
+; CHECK-NEXT: cmhi v19.2d, v5.2d, v1.2d
+; CHECK-NEXT: cmhi v20.2d, v5.2d, v2.2d
+; CHECK-NEXT: cmhi v21.2d, v5.2d, v3.2d
+; CHECK-NEXT: dup v6.2d, x12
+; CHECK-NEXT: cmhi v22.2d, v5.2d, v4.2d
+; CHECK-NEXT: cmhi v23.2d, v5.2d, v7.2d
+; CHECK-NEXT: cmhi v24.2d, v5.2d, v17.2d
+; CHECK-NEXT: cmhi v5.2d, v5.2d, v16.2d
+; CHECK-NEXT: cmp x12, #1
+; CHECK-NEXT: cmhi v0.2d, v6.2d, v0.2d
+; CHECK-NEXT: cmhi v1.2d, v6.2d, v1.2d
+; CHECK-NEXT: cmhi v2.2d, v6.2d, v2.2d
+; CHECK-NEXT: cmhi v3.2d, v6.2d, v3.2d
+; CHECK-NEXT: cmhi v4.2d, v6.2d, v4.2d
+; CHECK-NEXT: cmhi v17.2d, v6.2d, v17.2d
+; CHECK-NEXT: cmhi v7.2d, v6.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v6.2d, v16.2d
+; CHECK-NEXT: uzp1 v16.4s, v19.4s, v24.4s
+; CHECK-NEXT: uzp1 v19.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
+; CHECK-NEXT: uzp1 v5.4s, v18.4s, v5.4s
+; CHECK-NEXT: uzp1 v1.4s, v1.4s, v17.4s
; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v23.4s, v4.4s
-; CHECK-NEXT: uzp1 v4.4s, v18.4s, v24.4s
-; CHECK-NEXT: uzp1 v5.4s, v5.4s, v22.4s
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s
-; CHECK-NEXT: uzp1 v6.4s, v17.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
-; CHECK-NEXT: uzp1 v3.8h, v4.8h, v3.8h
+; CHECK-NEXT: uzp1 v3.4s, v7.4s, v4.4s
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v6.4s
+; CHECK-NEXT: uzp1 v4.8h, v19.8h, v16.8h
+; CHECK-NEXT: uzp1 v5.8h, v5.8h, v20.8h
; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT: uzp1 v2.8h, v6.8h, v5.8h
-; CHECK-NEXT: uzp1 v0.8h, v7.8h, v0.8h
-; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b
-; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v3.8h
; CHECK-NEXT: dup v3.16b, w10
-; CHECK-NEXT: dup v2.16b, w9
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: uzp1 v2.16b, v5.16b, v4.16b
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.16b, w10
+; CHECK-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI14_0
-; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v1.16b, v2.16b, v3.16b
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
-; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: str h0, [x8, #2]
+; CHECK-NEXT: addv h1, v1.8h
+; CHECK-NEXT: str h0, [x8]
+; CHECK-NEXT: str h1, [x8, #2]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -587,85 +615,89 @@ entry:
define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_64_expand4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x10, x1, x0
+; CHECK-NEXT: add x9, x0, #128
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: sub x9, x10, #121
-; CHECK-NEXT: subs x11, x10, #128
-; CHECK-NEXT: add x12, x10, #7
-; CHECK-NEXT: csel x9, x9, x11, mi
+; CHECK-NEXT: subs x9, x1, x9
+; CHECK-NEXT: add x10, x9, #7
+; CHECK-NEXT: csel x9, x10, x9, mi
; CHECK-NEXT: asr x11, x9, #3
+; CHECK-NEXT: cset w9, ls
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: cmp x11, #1
; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: csel x10, x12, x10, mi
-; CHECK-NEXT: dup v7.2d, x11
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: mov z16.d, z0.d
+; CHECK-NEXT: mov z17.d, z0.d
+; CHECK-NEXT: cset w10, lt
+; CHECK-NEXT: subs x12, x1, x0
+; CHECK-NEXT: add x13, x12, #7
+; CHECK-NEXT: dup v5.2d, x11
; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: asr x10, x10, #3
+; CHECK-NEXT: csel x12, x13, x12, mi
; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
; CHECK-NEXT: add z4.d, z4...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/168565
More information about the llvm-commits
mailing list