[llvm] [AArch64] Fold addressing modes even with other uses. (PR #105666)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 07:14:56 PDT 2024
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/105666
The intent behind this patch is to fold more cases into addressing modes if they are "free", even if that means calculating the value in both the addressing mode and a separate instruction. This allows the load to start a cycle earlier and can sometimes help the address sink to a different block.
It requires some additional heuristics to try to ensure that LDPs are still generated, and to prefer `add x0, x0, x1, lsl 3; ldr x, [x0]` as opposed to `lsl x1, x1, 3; ldr x, [x0, x1]; add x0, x0, x1` if the add-with-lsl is a quick operation.
>From 7247e30b0cf637a41b185698b1dc7e39ac6bc11e Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 22 Aug 2024 14:53:28 +0100
Subject: [PATCH] [AArch64] Fold into address even with other uses.
The intent behind this patch is to fold more cases into addressing modes if
they are "free", even if that means calculating the value in both the
addressing mode and a separate instruction. This allows the load to start a
cycle earlier and can sometimes help the address sink to a different block.
---
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 38 +++---
.../CodeGen/AArch64/aarch64-fold-lslfast.ll | 116 ++++++++++++++++++
llvm/test/CodeGen/AArch64/insert-extend.ll | 16 +--
llvm/test/CodeGen/AArch64/reduce-shuffle.ll | 80 ++++++------
4 files changed, 184 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index f5f9a62faa0f53..8bd3418283aee1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -666,16 +666,6 @@ static bool isWorthFoldingSHL(SDValue V) {
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal > 3)
return false;
-
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
- const SDNode *Node = V.getNode();
- for (SDNode *UI : Node->uses())
- if (!isa<MemSDNode>(*UI))
- for (SDNode *UII : UI->uses())
- if (!isa<MemSDNode>(*UII))
- return false;
return true;
}
@@ -1234,12 +1224,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
return false;
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1318,12 +1310,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue RHS = N.getOperand(1);
SDLoc DL(N);
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1374,6 +1368,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
return true;
}
+ // If we can create add(a, lsl y, C) from the add and it is quick, leave it as
+ // a separate operation.
+ if (isWorthFoldingALU(LHS, true) || isWorthFoldingALU(RHS, true)) {
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
// Match any non-shifted, non-extend, non-immediate add expression.
Base = LHS;
Offset = RHS;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a0..09797e1736d4db 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -445,6 +445,122 @@ define i64 @addlsl4(i64 %a, i64 %b) {
%r = xor i64 %y, %z
ret i64 %r
}
+
+define ptr @gep_i32_4_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_4_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #2
+; CHECK-NEXT: ldr w9, [x1, x0, lsl #2]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str w9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i32, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i32_8_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_8_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #3
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_16_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_16_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #4
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i128, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses_add(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses_add:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: add x9, x9, #2
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %m = add i64 %l, 2
+ store i64 %m, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_ldp_uses(i64 %b, ptr %p) {
+; CHECK0-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK0-SDAG: // %bb.0:
+; CHECK0-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK0-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK0-SDAG-NEXT: add x8, x8, x9
+; CHECK0-SDAG-NEXT: str x8, [x1]
+; CHECK0-SDAG-NEXT: ret
+;
+; CHECK0-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK0-GISEL: // %bb.0:
+; CHECK0-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK0-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK0-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK0-GISEL-NEXT: mov x0, x8
+; CHECK0-GISEL-NEXT: add x9, x9, x10
+; CHECK0-GISEL-NEXT: str x9, [x1]
+; CHECK0-GISEL-NEXT: ret
+;
+; CHECK3-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK3-SDAG: // %bb.0:
+; CHECK3-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK3-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK3-SDAG-NEXT: add x8, x8, x9
+; CHECK3-SDAG-NEXT: str x8, [x1]
+; CHECK3-SDAG-NEXT: ret
+;
+; CHECK3-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK3-GISEL: // %bb.0:
+; CHECK3-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK3-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK3-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK3-GISEL-NEXT: mov x0, x8
+; CHECK3-GISEL-NEXT: add x9, x9, x10
+; CHECK3-GISEL-NEXT: str x9, [x1]
+; CHECK3-GISEL-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %h = getelementptr inbounds i64, ptr %g, i64 1
+ %m = load i64, ptr %h
+ %n = add i64 %l, %m
+ store i64 %n, ptr %p
+ ret ptr %g
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK0: {{.*}}
; CHECK3: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index 851fb0d03e8aa3..ddf7651b7a690e 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -52,10 +52,10 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -69,22 +69,22 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h
-; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: shll2 v5.4s, v3.8h, #16
; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
+; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: rev64 v4.4s, v0.4s
-; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h
; CHECK-NEXT: rev64 v5.4s, v1.4s
+; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: rev64 v6.4s, v2.4s
-; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
-; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
; CHECK-NEXT: mov v6.s[1], v7.s[0]
; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 325ab444205bf9..25cd6f5ea4d060 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -12,13 +12,13 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: ldr d1, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
+; CHECK-NEXT: ldr d2, [x0, x8]
; CHECK-NEXT: add x10, x10, x8
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: ldr d4, [x10]
-; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d5, [x11]
+; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d7, [x11, x9]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
@@ -26,16 +26,16 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
-; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: shll2 v7.4s, v2.8h, #16
+; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
-; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h
+; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: mov v7.16b, v2.16b
+; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12
; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8
@@ -227,59 +227,59 @@ entry:
define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v2:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sxtw x8, w1
-; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d4, [x0]
; CHECK-NEXT: ldr d5, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: add x12, x10, x8
-; CHECK-NEXT: ldr d6, [x10]
-; CHECK-NEXT: ldr d7, [x11]
-; CHECK-NEXT: ldr d0, [x12, x8]
-; CHECK-NEXT: add x8, x11, x9
-; CHECK-NEXT: ldr d1, [x12]
-; CHECK-NEXT: ldr d2, [x8, x9]
-; CHECK-NEXT: ldr d3, [x8]
-; CHECK-NEXT: usubl v1.8h, v1.8b, v3.8b
-; CHECK-NEXT: usubl v0.8h, v0.8b, v2.8b
+; CHECK-NEXT: ldr d6, [x0, x8]
+; CHECK-NEXT: add x10, x10, x8
+; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d7, [x2, x9]
+; CHECK-NEXT: ldr d0, [x10]
+; CHECK-NEXT: ldr d1, [x11]
+; CHECK-NEXT: ldr d2, [x10, x8]
+; CHECK-NEXT: ldr d3, [x11, x9]
+; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b
-; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v5.4s, v0.8h, #16
+; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: shll2 v7.4s, v3.8h, #16
; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
-; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h
+; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
-; CHECK-NEXT: uzp2 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v17.16b, v0.16b
+; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: mov v7.16b, v3.16b
-; CHECK-NEXT: mov v17.16b, v1.16b
; CHECK-NEXT: zip1 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v6.4s, v3.4s, v2.4s
-; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v18.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: zip2 v16.4s, v1.4s, v0.4s
+; CHECK-NEXT: ext v18.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT: mov v17.s[1], v1.s[0]
; CHECK-NEXT: mov v7.s[3], v2.s[2]
-; CHECK-NEXT: mov v17.s[1], v0.s[0]
-; CHECK-NEXT: uzp2 v2.4s, v4.4s, v0.4s
-; CHECK-NEXT: mov v4.16b, v0.16b
-; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp2 v2.4s, v4.4s, v1.4s
+; CHECK-NEXT: mov v4.16b, v1.16b
; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT: mov v4.s[0], v1.s[1]
+; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v4.s[0], v0.s[1]
; CHECK-NEXT: mov v16.d[1], v7.d[1]
-; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT: ext v0.16b, v0.16b, v18.16b, #12
; CHECK-NEXT: mov v2.d[1], v6.d[1]
-; CHECK-NEXT: mov v0.d[1], v7.d[1]
; CHECK-NEXT: mov v17.d[1], v3.d[1]
+; CHECK-NEXT: mov v1.d[1], v7.d[1]
; CHECK-NEXT: mov v4.d[1], v5.d[1]
-; CHECK-NEXT: mov v1.d[1], v6.d[1]
+; CHECK-NEXT: mov v0.d[1], v6.d[1]
; CHECK-NEXT: add v2.4s, v2.4s, v16.4s
; CHECK-NEXT: add v3.4s, v4.4s, v17.4s
; CHECK-NEXT: rev64 v5.4s, v2.4s
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: mov v5.d[1], v2.d[1]
@@ -449,10 +449,10 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -474,14 +474,14 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: rev64 v5.4s, v1.4s
; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h
; CHECK-NEXT: rev64 v4.4s, v2.4s
-; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
-; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
+; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: mov v6.s[3], v5.s[2]
; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s
More information about the llvm-commits
mailing list