[llvm] [AArch64] Fold addressing modes even with other uses. (PR #105666)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 07:15:29 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
The intent behind this patch is to fold more cases into addressing modes if they are "free", even if that means calculating the value in both the addressing mode and a separate instruction. This allows the load to start a cycle earlier and can sometimes help the address sink to a different block.
It requires some additional heuristics to try to ensure that LDPs are still generated, and to prefer `add x0, x0, x1, lsl 3; ldr x, [x0]` as opposed to `lsl x1, x1, 3; ldr x, [x0, x1]; add x0, x0, x1` if the add+lsl is a quick operation.
---
Full diff: https://github.com/llvm/llvm-project/pull/105666.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+20-18)
- (modified) llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll (+116)
- (modified) llvm/test/CodeGen/AArch64/insert-extend.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/reduce-shuffle.ll (+40-40)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index f5f9a62faa0f53..8bd3418283aee1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -666,16 +666,6 @@ static bool isWorthFoldingSHL(SDValue V) {
unsigned ShiftVal = CSD->getZExtValue();
if (ShiftVal > 3)
return false;
-
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
- const SDNode *Node = V.getNode();
- for (SDNode *UI : Node->uses())
- if (!isa<MemSDNode>(*UI))
- for (SDNode *UII : UI->uses())
- if (!isa<MemSDNode>(*UII))
- return false;
return true;
}
@@ -1234,12 +1224,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
return false;
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1318,12 +1310,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue RHS = N.getOperand(1);
SDLoc DL(N);
- // Check if this particular node is reused in any non-memory related
- // operation. If yes, do not try to fold this node into the address
- // computation, since the computation will be kept.
+ // Check if this particular node is reused in an add node that might make it
+ // better folded into a LDP/STP.
const SDNode *Node = N.getNode();
for (SDNode *UI : Node->uses()) {
- if (!isa<MemSDNode>(*UI))
+ if (UI->getOpcode() == ISD::ADD ||
+ (UI->isMachineOpcode() &&
+ (TII->get(UI->getMachineOpcode()).mayLoad() ||
+ TII->get(UI->getMachineOpcode()).mayStore())))
return false;
}
@@ -1374,6 +1368,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
return true;
}
+ // If we can create add(a, lsl y, C) from the add and it is quick, leave it as
+ // a separate operation.
+ if (isWorthFoldingALU(LHS, true) || isWorthFoldingALU(RHS, true)) {
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ return false;
+ }
+
// Match any non-shifted, non-extend, non-immediate add expression.
Base = LHS;
Offset = RHS;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a0..09797e1736d4db 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -445,6 +445,122 @@ define i64 @addlsl4(i64 %a, i64 %b) {
%r = xor i64 %y, %z
ret i64 %r
}
+
+define ptr @gep_i32_4_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_4_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #2
+; CHECK-NEXT: ldr w9, [x1, x0, lsl #2]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str w9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i32, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i32_8_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i32_8_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #3
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: str w8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i32, ptr %g
+ store i32 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_16_uses(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_16_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x0, x1, x0, lsl #4
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i128, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ store i64 %l, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_uses_add(i64 %b, ptr %p) {
+; CHECK-LABEL: gep_i64_uses_add:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x1, x0, lsl #3
+; CHECK-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK-NEXT: add x9, x9, #2
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: str x9, [x1]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %m = add i64 %l, 2
+ store i64 %m, ptr %p
+ ret ptr %g
+}
+
+define ptr @gep_i64_ldp_uses(i64 %b, ptr %p) {
+; CHECK0-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK0-SDAG: // %bb.0:
+; CHECK0-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK0-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK0-SDAG-NEXT: add x8, x8, x9
+; CHECK0-SDAG-NEXT: str x8, [x1]
+; CHECK0-SDAG-NEXT: ret
+;
+; CHECK0-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK0-GISEL: // %bb.0:
+; CHECK0-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK0-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK0-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK0-GISEL-NEXT: mov x0, x8
+; CHECK0-GISEL-NEXT: add x9, x9, x10
+; CHECK0-GISEL-NEXT: str x9, [x1]
+; CHECK0-GISEL-NEXT: ret
+;
+; CHECK3-SDAG-LABEL: gep_i64_ldp_uses:
+; CHECK3-SDAG: // %bb.0:
+; CHECK3-SDAG-NEXT: add x0, x1, x0, lsl #3
+; CHECK3-SDAG-NEXT: ldp x8, x9, [x0]
+; CHECK3-SDAG-NEXT: add x8, x8, x9
+; CHECK3-SDAG-NEXT: str x8, [x1]
+; CHECK3-SDAG-NEXT: ret
+;
+; CHECK3-GISEL-LABEL: gep_i64_ldp_uses:
+; CHECK3-GISEL: // %bb.0:
+; CHECK3-GISEL-NEXT: add x8, x1, x0, lsl #3
+; CHECK3-GISEL-NEXT: ldr x9, [x1, x0, lsl #3]
+; CHECK3-GISEL-NEXT: ldr x10, [x8, #8]
+; CHECK3-GISEL-NEXT: mov x0, x8
+; CHECK3-GISEL-NEXT: add x9, x9, x10
+; CHECK3-GISEL-NEXT: str x9, [x1]
+; CHECK3-GISEL-NEXT: ret
+ %g = getelementptr inbounds i64, ptr %p, i64 %b
+ %l = load i64, ptr %g
+ %h = getelementptr inbounds i64, ptr %g, i64 1
+ %m = load i64, ptr %h
+ %n = add i64 %l, %m
+ store i64 %n, ptr %p
+ ret ptr %g
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK0: {{.*}}
; CHECK3: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index 851fb0d03e8aa3..ddf7651b7a690e 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -52,10 +52,10 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -69,22 +69,22 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h
-; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: shll2 v5.4s, v3.8h, #16
; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
+; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
; CHECK-NEXT: rev64 v4.4s, v0.4s
-; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h
; CHECK-NEXT: rev64 v5.4s, v1.4s
+; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-NEXT: rev64 v6.4s, v2.4s
-; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
-; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s
; CHECK-NEXT: mov v6.s[1], v7.s[0]
; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 325ab444205bf9..25cd6f5ea4d060 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -12,13 +12,13 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: ldr d1, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
+; CHECK-NEXT: ldr d2, [x0, x8]
; CHECK-NEXT: add x10, x10, x8
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: ldr d4, [x10]
-; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d5, [x11]
+; CHECK-NEXT: ldr d6, [x10, x8]
; CHECK-NEXT: ldr d7, [x11, x9]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
@@ -26,16 +26,16 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
-; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: shll2 v7.4s, v2.8h, #16
+; CHECK-NEXT: shll2 v6.4s, v3.8h, #16
; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
-; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h
+; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: mov v7.16b, v2.16b
+; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12
; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8
@@ -227,59 +227,59 @@ entry:
define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
; CHECK-LABEL: v2:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sxtw x8, w1
-; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d4, [x0]
; CHECK-NEXT: ldr d5, [x2]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: add x12, x10, x8
-; CHECK-NEXT: ldr d6, [x10]
-; CHECK-NEXT: ldr d7, [x11]
-; CHECK-NEXT: ldr d0, [x12, x8]
-; CHECK-NEXT: add x8, x11, x9
-; CHECK-NEXT: ldr d1, [x12]
-; CHECK-NEXT: ldr d2, [x8, x9]
-; CHECK-NEXT: ldr d3, [x8]
-; CHECK-NEXT: usubl v1.8h, v1.8b, v3.8b
-; CHECK-NEXT: usubl v0.8h, v0.8b, v2.8b
+; CHECK-NEXT: ldr d6, [x0, x8]
+; CHECK-NEXT: add x10, x10, x8
+; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: ldr d7, [x2, x9]
+; CHECK-NEXT: ldr d0, [x10]
+; CHECK-NEXT: ldr d1, [x11]
+; CHECK-NEXT: ldr d2, [x10, x8]
+; CHECK-NEXT: ldr d3, [x11, x9]
+; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b
; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b
; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b
-; CHECK-NEXT: shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT: shll2 v5.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v5.4s, v0.8h, #16
+; CHECK-NEXT: shll2 v4.4s, v1.8h, #16
; CHECK-NEXT: shll2 v7.4s, v3.8h, #16
; CHECK-NEXT: shll2 v6.4s, v2.8h, #16
-; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h
+; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h
; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h
-; CHECK-NEXT: uzp2 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v17.16b, v0.16b
+; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: mov v7.16b, v3.16b
-; CHECK-NEXT: mov v17.16b, v1.16b
; CHECK-NEXT: zip1 v5.4s, v3.4s, v2.4s
; CHECK-NEXT: zip2 v6.4s, v3.4s, v2.4s
-; CHECK-NEXT: zip2 v16.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v18.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT: zip2 v16.4s, v1.4s, v0.4s
+; CHECK-NEXT: ext v18.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT: mov v17.s[1], v1.s[0]
; CHECK-NEXT: mov v7.s[3], v2.s[2]
-; CHECK-NEXT: mov v17.s[1], v0.s[0]
-; CHECK-NEXT: uzp2 v2.4s, v4.4s, v0.4s
-; CHECK-NEXT: mov v4.16b, v0.16b
-; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: uzp2 v2.4s, v4.4s, v1.4s
+; CHECK-NEXT: mov v4.16b, v1.16b
; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT: mov v4.s[0], v1.s[1]
+; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v4.s[0], v0.s[1]
; CHECK-NEXT: mov v16.d[1], v7.d[1]
-; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT: ext v0.16b, v0.16b, v18.16b, #12
; CHECK-NEXT: mov v2.d[1], v6.d[1]
-; CHECK-NEXT: mov v0.d[1], v7.d[1]
; CHECK-NEXT: mov v17.d[1], v3.d[1]
+; CHECK-NEXT: mov v1.d[1], v7.d[1]
; CHECK-NEXT: mov v4.d[1], v5.d[1]
-; CHECK-NEXT: mov v1.d[1], v6.d[1]
+; CHECK-NEXT: mov v0.d[1], v6.d[1]
; CHECK-NEXT: add v2.4s, v2.4s, v16.4s
; CHECK-NEXT: add v3.4s, v4.4s, v17.4s
; CHECK-NEXT: rev64 v5.4s, v2.4s
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: mov v5.d[1], v2.d[1]
@@ -449,10 +449,10 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: sxtw x9, w3
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x2]
+; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d3, [x2, x9]
; CHECK-NEXT: add x10, x0, x8
; CHECK-NEXT: add x11, x2, x9
-; CHECK-NEXT: ldr d2, [x10]
-; CHECK-NEXT: ldr d3, [x11]
; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: add x10, x10, x8
; CHECK-NEXT: add x11, x11, x9
@@ -474,14 +474,14 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: rev64 v5.4s, v1.4s
; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h
; CHECK-NEXT: rev64 v4.4s, v2.4s
-; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
-; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
+; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
-; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s
+; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s
+; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: mov v6.s[3], v5.s[2]
; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s
``````````
</details>
https://github.com/llvm/llvm-project/pull/105666
More information about the llvm-commits
mailing list