[llvm] [AArch64] Fold addressing modes even with other uses. (PR #105666)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 07:15:29 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
The intent behind this patch is to fold more cases into addressing modes if they are "free", even if that means calculating the value in both the addressing mode and a separate instruction. This allows the load to start a cycle earlier and can sometimes help the address sink to a different block.
It requires some additional heuristics to try to ensure that LDPs are still generated, and to prefer `add x0, x0, x1, lsl 3; ldr x, [x0]` as opposed to `lsl x1, x1, 3; ldr x, [x0, x1]; add x0, x0, x1` if the add+lsl is a quick operation.
---
Full diff: https://github.com/llvm/llvm-project/pull/105666.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+20-18)
- (modified) llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll (+116)
- (modified) llvm/test/CodeGen/AArch64/insert-extend.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/reduce-shuffle.ll (+40-40)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index f5f9a62faa0f53..8bd3418283aee1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -666,16 +666,6 @@ static bool isWorthFoldingSHL(SDValue V) {
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal > 3)
return false;
-
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
- const SDNode *Node = V.getNode();
- for (SDNode *UI : Node->uses())
- if (!isa<MemSDNode>(*UI))
- for (SDNode *UII : UI->uses())
- if (!isa<MemSDNode>(*UII))
- return false;
return true;
}
@@ -1234,12 +1224,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
return false;
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1318,12 +1310,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue RHS = N.getOperand(1);
SDLoc DL(N);
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1374,6 +1368,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
return true;
}
+ // If we can create add(a, lsl y, C) from the add and it is quick, leave it as
+ // a separate operation.
+ if (isWorthFoldingALU(LHS, true) || isWorthFoldingALU(RHS, true)) {
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
// Match any non-shifted, non-extend, non-immediate add expression.
Base = LHS;
Offset = RHS;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a0..09797e1736d4db 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -445,6 +445,122 @@ define i64 @addlsl4(i64 %a, i64 %b) {
%r = xor i64 %y, %z
ret i64 %r
}
+
+define ptr @gep_i32_4_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_4_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #2
+; CHECK-NEXT: ldr w9, [x1, x0, lsl #2]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str w9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i32, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i32_8_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_8_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #3
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_16_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_16_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #4
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i128, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses_add(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses_add:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: add x9, x9, #2
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %m = add i64 %l, 2
+ store i64 %m, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_ldp_uses(i64 %b, ptr %p) {
+; CHECK0-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK0-SDAG: // %bb.0:
+; CHECK0-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK0-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK0-SDAG-NEXT: add x8, x8, x9
+; CHECK0-SDAG-NEXT: str x8, [x1]
+; CHECK0-SDAG-NEXT: ret
+;
+; CHECK0-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK0-GISEL: // %bb.0:
+; CHECK0-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK0-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK0-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK0-GISEL-NEXT: mov x0, x8
+; CHECK0-GISEL-NEXT: add x9, x9, x10
+; CHECK0-GISEL-NEXT: str x9, [x1]
+; CHECK0-GISEL-NEXT: ret
+;
+; CHECK3-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK3-SDAG: // %bb.0:
+; CHECK3-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK3-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK3-SDAG-NEXT: add x8, x8, x9
+; CHECK3-SDAG-NEXT: str x8, [x1]
+; CHECK3-SDAG-NEXT: ret
+;
+; CHECK3-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK3-GISEL: // %bb.0:
+; CHECK3-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK3-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK3-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK3-GISEL-NEXT: mov x0, x8
+; CHECK3-GISEL-NEXT: add x9, x9, x10
+; CHECK3-GISEL-NEXT: str x9, [x1]
+; CHECK3-GISEL-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %h = getelementptr inbounds i64, ptr %g, i64 1
+ %m = load i64, ptr %h
+ %n = add i64 %l, %m
+ store i64 %n, ptr %p
+ ret ptr %g
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK0: {{.*}}
; CHECK3: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index 851fb0d03e8aa3..ddf7651b7a690e 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -52,10 +52,10 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -69,22 +69,22 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h
-; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: shll2 v5.4s, v3.8h, #16
; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
+; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: rev64 v4.4s, v0.4s
-; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h
; CHECK-NEXT: rev64 v5.4s, v1.4s
+; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: rev64 v6.4s, v2.4s
-; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
-; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
; CHECK-NEXT: mov v6.s[1], v7.s[0]
; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 325ab444205bf9..25cd6f5ea4d060 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -12,13 +12,13 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: ldr d1, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
+; CHECK-NEXT: ldr d2, [x0, x8]
; CHECK-NEXT: add x10, x10, x8
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: ldr d4, [x10]
-; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d5, [x11]
+; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d7, [x11, x9]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
@@ -26,16 +26,16 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
-; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: shll2 v7.4s, v2.8h, #16
+; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
-; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h
+; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: mov v7.16b, v2.16b
+; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12
; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8
@@ -227,59 +227,59 @@ entry:
define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v2:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sxtw x8, w1
-; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d4, [x0]
; CHECK-NEXT: ldr d5, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: add x12, x10, x8
-; CHECK-NEXT: ldr d6, [x10]
-; CHECK-NEXT: ldr d7, [x11]
-; CHECK-NEXT: ldr d0, [x12, x8]
-; CHECK-NEXT: add x8, x11, x9
-; CHECK-NEXT: ldr d1, [x12]
-; CHECK-NEXT: ldr d2, [x8, x9]
-; CHECK-NEXT: ldr d3, [x8]
-; CHECK-NEXT: usubl v1.8h, v1.8b, v3.8b
-; CHECK-NEXT: usubl v0.8h, v0.8b, v2.8b
+; CHECK-NEXT: ldr d6, [x0, x8]
+; CHECK-NEXT: add x10, x10, x8
+; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d7, [x2, x9]
+; CHECK-NEXT: ldr d0, [x10]
+; CHECK-NEXT: ldr d1, [x11]
+; CHECK-NEXT: ldr d2, [x10, x8]
+; CHECK-NEXT: ldr d3, [x11, x9]
+; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b
-; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v5.4s, v0.8h, #16
+; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: shll2 v7.4s, v3.8h, #16
; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
-; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h
+; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
-; CHECK-NEXT: uzp2 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v17.16b, v0.16b
+; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: mov v7.16b, v3.16b
-; CHECK-NEXT: mov v17.16b, v1.16b
; CHECK-NEXT: zip1 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v6.4s, v3.4s, v2.4s
-; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v18.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: zip2 v16.4s, v1.4s, v0.4s
+; CHECK-NEXT: ext v18.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT: mov v17.s[1], v1.s[0]
; CHECK-NEXT: mov v7.s[3], v2.s[2]
-; CHECK-NEXT: mov v17.s[1], v0.s[0]
-; CHECK-NEXT: uzp2 v2.4s, v4.4s, v0.4s
-; CHECK-NEXT: mov v4.16b, v0.16b
-; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp2 v2.4s, v4.4s, v1.4s
+; CHECK-NEXT: mov v4.16b, v1.16b
; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT: mov v4.s[0], v1.s[1]
+; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v4.s[0], v0.s[1]
; CHECK-NEXT: mov v16.d[1], v7.d[1]
-; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT: ext v0.16b, v0.16b, v18.16b, #12
; CHECK-NEXT: mov v2.d[1], v6.d[1]
-; CHECK-NEXT: mov v0.d[1], v7.d[1]
; CHECK-NEXT: mov v17.d[1], v3.d[1]
+; CHECK-NEXT: mov v1.d[1], v7.d[1]
; CHECK-NEXT: mov v4.d[1], v5.d[1]
-; CHECK-NEXT: mov v1.d[1], v6.d[1]
+; CHECK-NEXT: mov v0.d[1], v6.d[1]
; CHECK-NEXT: add v2.4s, v2.4s, v16.4s
; CHECK-NEXT: add v3.4s, v4.4s, v17.4s
; CHECK-NEXT: rev64 v5.4s, v2.4s
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: mov v5.d[1], v2.d[1]
@@ -449,10 +449,10 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -474,14 +474,14 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: rev64 v5.4s, v1.4s
; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h
; CHECK-NEXT: rev64 v4.4s, v2.4s
-; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
-; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
+; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: mov v6.s[3], v5.s[2]
; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s
``````````
</details>
https://github.com/llvm/llvm-project/pull/105666
More information about the llvm-commits
mailing list