[llvm] [AArch64] Fix postinc operands for Cortex-A510 scheduling (PR #68518)

via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 8 03:35:00 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

<details>
<summary>Changes</summary>

Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc loads/stores in the Cortex-A510 scheduling model.

I will add the same fix for the other scheduling models too; this one will be the most impactful because Cortex-A510 is the default CPU scheduling model.

---

Patch is 200.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68518.diff


9 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64SchedA510.td (+34-34) 
- (modified) llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll (+9-9) 
- (modified) llvm/test/CodeGen/AArch64/extbinopload.ll (+90-90) 
- (modified) llvm/test/CodeGen/AArch64/ld1postmul.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/tbl-loops.ll (+7-7) 
- (modified) llvm/test/CodeGen/AArch64/vldn_shuffle.ll (+2-2) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s (+1021-1021) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index fab2cda87807554..1afbc5d9102ca96 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -295,16 +295,16 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 
 //    2-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
@@ -312,10 +312,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
 
 //    3-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
@@ -323,10 +323,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
 
 //    4-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
@@ -334,10 +334,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
 def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 
 //---
 // Vector Stores
@@ -347,28 +347,28 @@ def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d
 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 //---
 // Floating Point Conversions, MAC, DIV, SQRT
diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
index cbda7b027587d9c..07fbe5d7310f60f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
@@ -47,10 +47,10 @@ define void @f_undef_1(<8 x i64> %a, ptr %dst) {
 ; CHECK-LABEL: f_undef_1:
 ; CHECK:       // %bb.0: // %BB
 ; CHECK-NEXT:    mov v16.16b, v0.16b
-; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v5.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q1_q2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q3_q4
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v2.16b, v1.16b
 ; CHECK-NEXT:    mov v4.16b, v3.16b
 ; CHECK-NEXT:    mov v17.16b, v16.16b
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 6657b19d24929d8..7d73e1c6c1d7f41 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14320,8 +14320,8 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.b { v0 }[1], [x0], #1
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1lane:
@@ -14345,8 +14345,8 @@ define <8 x i8> @test_v8i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <8 x i
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.b { v0 }[1], [x0], x2
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_reg_ld1lane:
@@ -14413,8 +14413,8 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A)
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], #2
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_imm_ld1lane:
@@ -14439,8 +14439,8 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4 x
 ; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_reg_ld1lane:
@@ -14507,8 +14507,8 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A)
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_imm_ld1lane:
@@ -14533,8 +14533,8 @@ define <2 x i32> @test_v2i32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2 x
 ; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_reg_ld1lane:
@@ -14644,8 +14644,8 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float>
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_imm_ld1lane:
@@ -14670,8 +14670,8 @@ define <2 x float> @test_v2f32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2
 ; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_reg_ld1lane:
@@ -14776,9 +14776,9 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(ptr %bar, ptr %ptr,
 ; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
 ; CHECK-NEXT:    ldr d1, [x3]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    cnt.8b v1, v1
 ; CHECK-NEXT:    uaddlp.4h v1, v1
 ; CHECK-NEXT:    uaddlp.2s v1, v1
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 99f573795489a08..849fc7aa00a8e7e 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -365,15 +365,15 @@ define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ldp s3, s2, [x2]
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll2 v4.4s, v1.8h, #3
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
 ; CHECK-NEXT:    uaddw v2.4s, v2.4s, v3.4h
 ; CHECK-NEXT:    uaddw2 v3.4s, v4.4s, v0.8h
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
@@ -407,10 +407,10 @@ define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    uaddl v1.8h, v2.8b, v3.8b
 ; CHECK-NEXT:    ret
@@ -444,10 +444,10 @@ define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
 ; CHECK-NEXT:    shll v3.4s, v2.4h, #16
@@ -489,18 +489,18 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
-; CHECK-NEXT:    ldp s4, s5, [x4]
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    usubl v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
-; CHECK-NEXT:    ldp s6, s7, [x6]
+; CHECK-NEXT:    ldp s4, s5, [x4]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x5]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
+; CHECK-NEXT:    ldp s6, s7, [x6]
 ; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
+; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x7]
 ; CHECK-NEXT:    usubl v5.8h, v6.8b, v7.8b
 ; CHECK-NEXT:    shll v0.4s, v4.4h, #16
@@ -647,7 +647,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    str s1, [x4]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    ldp s0, s5, [x2]
@@ -664,16 +664,16 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-NEXT:    add x9, x1, #4
 ; CHECK-NEXT:    uzp1 v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    mov v0.b[11], w10
-; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    add x10, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[3], [x3], #4
 ; CHECK-NEXT:    ldr s4, [x0, #12]
 ; CHECK-NEXT:    ldp s3, s16, [x0, #4]
-; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
+; CHECK-NEXT:    ldp s6, s7, [x2, #8]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
 ; CHECK-NEXT:    add x8, x1, #8
 ; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
 ; CHECK-NEXT:    uaddl v2.8h, v3.8b, v4.8b
@@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shuffle:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s2, s7, [x0, #8]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldr s18, [x1, #12]
-; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s3, s16, [x0]
-; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    mov v4.16b, v7.16b
-; CHECK-NEXT:    ldp s6, s17, [x2, #8]
+; CHECK-NEXT:    ldp s0, s1, [x0, #8]
+; CHECK-NEXT:    add x8, x1, #8
+; CHECK-NEXT:    ldr s6, [x1, #12]
+; CHECK-NEXT:    ldp s17, s18, [x2, #8]
+; CHECK-NEXT:    ldp s2, s3, [x2]
+; CHECK-NEXT:    add x9, x3, #8
+; CHECK-NEXT:    mov v4.16b, v1.16b
+; CHECK-NEXT:    ldp s7, s16, [x0]
 ; CHECK-NEXT:    ldr s5, [x3, #12]
-; CHECK-NEXT:    mov v7.s[1], v18.s[0]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT:    mov v4.s[1], v18.s[0]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
+; CHECK-NEXT:    mov v1.s[1], v6.s[0]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    mov v4.s[1], v6.s[0]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x1], #4
 ; CHECK-NEXT:    ld1 { v16.s }[1], [x1]
-; CHECK-NEXT:    mov v4.s[2], v17.s[0]
-; CHECK-NEXT:    mov v17.s[1], v5.s[0]
-; CHECK-NEXT:    uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT:    uaddl v6.8h, v0.8b, v6.8b
-; CHECK-NEXT:    uaddl v7.8h, v16.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v17.8b
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
+; CHECK-NEXT:   ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/68518


More information about the llvm-commits mailing list