[llvm] MachineTraceMetrics: always include instr latency in depth (PR #73550)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 27 09:57:22 PST 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-mc

Author: Ramkumar Ramachandra (artagnon)

<details>
<summary>Changes</summary>

Fix the depth calculation in MachineTraceMetrics to include the latency of the current instruction when all dependencies are transient, or when there are no dependencies. This is the right thing to do, as demonstrated by changes to the machine-trace-metrics-depth-height test: it no longer reports false depths of 0. This simple change results in better CodeGen on the X86, AArch64, PowerPC, AMDGPU, and RISCV targets.

-- 8< --
Based on #<!-- -->73324. Please review only second patch.

---

Patch is 175.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73550.diff


37 Files Affected:

- (modified) llvm/lib/CodeGen/MachineTraceMetrics.cpp (+8-6) 
- (modified) llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir (+20-12) 
- (modified) llvm/test/CodeGen/AArch64/logical_shifted_reg.ll (+7-8) 
- (modified) llvm/test/CodeGen/AArch64/machine_cse.ll (+11-12) 
- (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+128-128) 
- (modified) llvm/test/CodeGen/AArch64/tbl-loops.ll (+94-98) 
- (modified) llvm/test/CodeGen/AMDGPU/early-if-convert.ll (+381-125) 
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+180-363) 
- (modified) llvm/test/CodeGen/PowerPC/machine-combiner.ll (+267-78) 
- (added) llvm/test/CodeGen/RISCV/machine-trace-metrics-depth-height.ll (+177) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll (+12-12) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/alias-static-alloca.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/avx-vinsertf128.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/avx512-regcall-Mask.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/avx512vnni-combine.ll (+6-3) 
- (modified) llvm/test/CodeGen/X86/avx512vnni.ll (+12-6) 
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll (+4-2) 
- (modified) llvm/test/CodeGen/X86/avxvnni-combine.ll (+24-12) 
- (modified) llvm/test/CodeGen/X86/avxvnni.ll (+48-24) 
- (modified) llvm/test/CodeGen/X86/early-ifcvt-remarks.ll (+10-11) 
- (modified) llvm/test/CodeGen/X86/gather-scatter-opaque-ptr-2.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/hipe-cc.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/hipe-cc64.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/imul.ll (+3-4) 
- (modified) llvm/test/CodeGen/X86/lea-opt2.ll (+2-6) 
- (modified) llvm/test/CodeGen/X86/machine-combiner-dbg.mir (+14-9) 
- (modified) llvm/test/CodeGen/X86/machine-trace-metrics-entryBB-critpath.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_gather_scatter.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/mul-constant-i64.ll (+3-4) 
- (modified) llvm/test/CodeGen/X86/no-split-size.ll (+9-9) 
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avx512vnni.ll (+4-2) 
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll (+8-4) 
- (modified) llvm/test/CodeGen/X86/umul-with-overflow.ll (+109-108) 
- (modified) llvm/test/CodeGen/X86/vector-fshr-128.ll (+2-2) 
- (modified) llvm/test/MC/AArch64/local-bounds-single-trap.ll (+35-27) 


``````````diff
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index 3e6f36fe936fff0..34a549494de0906 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -817,6 +817,11 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI,
         .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp);
     Cycle = std::max(Cycle, DepCycle);
   }
+  // If there are no deps, or all the deps are transient, compute the latency of
+  // the instruction.
+  if (!Cycle)
+    Cycle = MTM.SchedModel.computeInstrLatency(&UseMI);
+
   // Remember the instruction depth.
   InstrCycles &MICycles = Cycles[&UseMI];
   MICycles.Depth = Cycle;
@@ -926,12 +931,11 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
       if (I == RegUnits.end())
         continue;
       unsigned DepHeight = I->Cycle;
-      if (!MI.isTransient()) {
+      if (!MI.isTransient())
         // We may not know the UseMI of this dependency, if it came from the
         // live-in list. SchedModel can handle a NULL UseMI.
         DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(),
                                                       I->MI, I->Op);
-      }
       Height = std::max(Height, DepHeight);
       // This regunit is dead above MI.
       RegUnits.erase(I);
@@ -1111,10 +1115,8 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
       // Don't process PHI deps. They depend on the specific predecessor, and
       // we'll get them when visiting the predecessor.
       Deps.clear();
-      bool HasPhysRegs = !MI.isPHI() && getDataDeps(MI, Deps, MTM.MRI);
-
-      // There may also be regunit dependencies to include in the height.
-      if (HasPhysRegs)
+      if (!MI.isPHI() && getDataDeps(MI, Deps, MTM.MRI))
+        // There may also be regunit dependencies to include in the height.
         Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits, MTM.SchedModel,
                                       MTM.TII, MTM.TRI);
 
diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
index 425a23214871d68..1c160232849a35a 100644
--- a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
+++ b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir
@@ -180,28 +180,36 @@ body:             |
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64 = COPY $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   successors: %bb.3(0x30000000), %bb.2(0x50000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY2]], %bb.0, %4, %bb.1
-  ; CHECK-NEXT:   [[LDRBBui:%[0-9]+]]:gpr32common = LDRBBui [[PHI]], 0 :: (load (s8))
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY2]], %bb.0, %4, %bb.3
+  ; CHECK-NEXT:   [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[PHI]], 0 :: (load (s8))
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr32all = COPY $wzr
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr32all = COPY [[COPY3]]
+  ; CHECK-NEXT:   CBZW killed [[LDRBBui]], %bb.3
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 4080, 12, implicit-def $nzcv
   ; CHECK-NEXT:   [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 16711680
   ; CHECK-NEXT:   [[CSELWr:%[0-9]+]]:gpr32common = CSELWr [[COPY]], killed [[MOVi32imm]], 11, implicit $nzcv
   ; CHECK-NEXT:   [[SUBSWri1:%[0-9]+]]:gpr32 = SUBSWri [[CSELWr]], 0, 0, implicit-def $nzcv
   ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr32 = COPY $wzr
   ; CHECK-NEXT:   [[CSELWr1:%[0-9]+]]:gpr32 = CSELWr [[CSELWr]], [[COPY5]], 12, implicit $nzcv
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gpr32 = COPY [[CSELWr1]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gpr32all = COPY [[CSELWr1]]
   ; CHECK-NEXT:   [[SUBSWri2:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 0, 0, implicit-def $nzcv
   ; CHECK-NEXT:   [[CSELWr2:%[0-9]+]]:gpr32 = CSELWr [[COPY]], [[COPY5]], 12, implicit $nzcv
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gpr32 = COPY [[CSELWr2]]
-  ; CHECK-NEXT:   $wzr = SUBSWri [[LDRBBui]], 0, 0, implicit-def $nzcv
-  ; CHECK-NEXT:   [[CSELWr3:%[0-9]+]]:gpr32 = CSELWr [[COPY4]], [[COPY6]], 0, implicit $nzcv
-  ; CHECK-NEXT:   $wzr = SUBSWri [[LDRBBui]], 0, 0, implicit-def $nzcv
-  ; CHECK-NEXT:   [[CSELWr4:%[0-9]+]]:gpr32 = CSELWr [[COPY4]], [[COPY7]], 0, implicit $nzcv
-  ; CHECK-NEXT:   STRBBui [[CSELWr4]], [[COPY1]], 0 :: (store (s8))
-  ; CHECK-NEXT:   early-clobber %20:gpr64sp = STRBBpost [[CSELWr3]], [[PHI]], 1 :: (store (s8))
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gpr32all = COPY [[CSELWr2]]
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr32 = PHI [[COPY4]], %bb.1, [[COPY6]], %bb.2
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:gpr32 = PHI [[COPY4]], %bb.1, [[COPY7]], %bb.2
+  ; CHECK-NEXT:   STRBBui [[PHI2]], [[COPY1]], 0 :: (store (s8))
+  ; CHECK-NEXT:   early-clobber %20:gpr64sp = STRBBpost [[PHI1]], [[PHI]], 1 :: (store (s8))
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gpr64all = COPY %20
   ; CHECK-NEXT:   B %bb.1
   bb.0:
diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
index c8c1e9007c7a0f5..aa54c7441de3a74 100644
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -252,17 +252,16 @@ define void @flag_setting() {
 ; CHECK-NEXT:    ldr x9, [x8]
 ; CHECK-NEXT:    ldr x10, [x10]
 ; CHECK-NEXT:    tst x9, x10
-; CHECK-NEXT:    b.gt .LBB2_4
+; CHECK-NEXT:    b.gt .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %test2
+; CHECK-NEXT:    and x11, x9, x10, asr #12
 ; CHECK-NEXT:    tst x9, x10, lsl #63
-; CHECK-NEXT:    b.lt .LBB2_4
-; CHECK-NEXT:  // %bb.2: // %test3
-; CHECK-NEXT:    and x10, x9, x10, asr #12
-; CHECK-NEXT:    cmp x10, #1
-; CHECK-NEXT:    b.ge .LBB2_4
-; CHECK-NEXT:  // %bb.3: // %other_exit
+; CHECK-NEXT:    ccmp x11, #1, #0, ge
+; CHECK-NEXT:    b.lt .LBB2_3
+; CHECK-NEXT:  .LBB2_2: // %common.ret
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_3: // %other_exit
 ; CHECK-NEXT:    str x9, [x8]
-; CHECK-NEXT:  .LBB2_4: // %common.ret
 ; CHECK-NEXT:    ret
   %val1 = load i64, ptr @var1_64
   %val2 = load i64, ptr @var2_64
diff --git a/llvm/test/CodeGen/AArch64/machine_cse.ll b/llvm/test/CodeGen/AArch64/machine_cse.ll
index 6478f5a37f7826f..d7982670e5678e0 100644
--- a/llvm/test/CodeGen/AArch64/machine_cse.ll
+++ b/llvm/test/CodeGen/AArch64/machine_cse.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s
 ; -tail-dup-placement causes tail duplication during layout. This breaks the
 ; assumptions of the test case as written (specifically, it creates an
@@ -12,10 +13,6 @@
 @e = external global i32
 
 define i32 @combine-sign-comparisons-by-cse(ptr %arg) {
-; CHECK: cmp
-; CHECK: b.ge
-; CHECK-NOT: cmp
-; CHECK: b.le
 
 entry:
   %a = load i32, ptr @a, align 4
@@ -50,10 +47,11 @@ return:
 
 define void @combine_vector_zeros(ptr %p, ptr %q) {
 ; CHECK-LABEL: combine_vector_zeros:
-; CHECK: movi v[[REG:[0-9]+]].2d, #0
-; CHECK-NOT: movi
-; CHECK: str d[[REG]], [x0]
-; CHECK: str q[[REG]], [x1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
 entry:
   store <8 x i8> zeroinitializer, ptr %p
   store <16 x i8> zeroinitializer, ptr %q
@@ -62,10 +60,11 @@ entry:
 
 define void @combine_vector_ones(ptr %p, ptr %q) {
 ; CHECK-LABEL: combine_vector_ones:
-; CHECK: movi v[[REG:[0-9]+]].2d, #0xffffffffffffffff
-; CHECK-NOT: movi
-; CHECK: str d[[REG]], [x0]
-; CHECK: str q[[REG]], [x1]
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
 entry:
   store <2 x i32> <i32 -1, i32 -1>, ptr %p
   store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, ptr %q
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 706aa4ad1b46653..b6f91faeb639ab9 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -2018,153 +2018,153 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b1, [sp, #80]
-; CHECK-NEXT:    add x8, sp, #88
-; CHECK-NEXT:    ldr b2, [sp, #144]
-; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    ldr b0, [sp, #80]
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    add x10, sp, #88
 ; CHECK-NEXT:    ldr b3, [sp, #16]
-; CHECK-NEXT:    add x12, sp, #32
-; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
-; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-NEXT:    ldr b1, [sp, #144]
+; CHECK-NEXT:    add x11, sp, #24
+; CHECK-NEXT:    ld1 { v0.b }[1], [x10]
+; CHECK-NEXT:    ldr b5, [sp, #480]
+; CHECK-NEXT:    add x10, sp, #152
+; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    ldr b6, [sp, #544]
+; CHECK-NEXT:    mov v2.b[1], w1
 ; CHECK-NEXT:    add x9, sp, #96
-; CHECK-NEXT:    add x8, sp, #24
-; CHECK-NEXT:    add x11, sp, #112
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ld1 { v3.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #160
-; CHECK-NEXT:    ldr b4, [sp, #480]
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #104
-; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #168
-; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    add x13, sp, #48
-; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
-; CHECK-NEXT:    add x12, sp, #40
-; CHECK-NEXT:    ldr b5, [sp, #608]
-; CHECK-NEXT:    ld1 { v1.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
-; CHECK-NEXT:    mov v0.b[1], w1
-; CHECK-NEXT:    add x9, sp, #128
-; CHECK-NEXT:    add x14, sp, #184
-; CHECK-NEXT:    ldr b16, [sp, #544]
-; CHECK-NEXT:    ld1 { v3.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #176
-; CHECK-NEXT:    ldr b17, [sp, #672]
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #552
+; CHECK-NEXT:    ldr b7, [sp, #672]
+; CHECK-NEXT:    ld1 { v6.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #680
+; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #160
+; CHECK-NEXT:    add x8, sp, #104
+; CHECK-NEXT:    ld1 { v1.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #32
+; CHECK-NEXT:    mov v2.b[2], w2
+; CHECK-NEXT:    ldr b4, [sp, #608]
+; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #496
+; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
+; CHECK-NEXT:    add x10, sp, #616
+; CHECK-NEXT:    ld1 { v5.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #168
+; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #112
+; CHECK-NEXT:    add x8, sp, #40
+; CHECK-NEXT:    ld1 { v1.b }[3], [x11]
+; CHECK-NEXT:    mov v2.b[3], w3
+; CHECK-NEXT:    ld1 { v3.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
+; CHECK-NEXT:    add x11, sp, #176
+; CHECK-NEXT:    add x8, sp, #624
+; CHECK-NEXT:    add x12, sp, #120
+; CHECK-NEXT:    ldr b16, [sp, #208]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #48
 ; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #488
-; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
-; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
-; CHECK-NEXT:    mov v0.b[2], w2
-; CHECK-NEXT:    add x11, sp, #192
-; CHECK-NEXT:    ld1 { v3.b }[4], [x13]
-; CHECK-NEXT:    add x13, sp, #616
-; CHECK-NEXT:    add x12, sp, #56
-; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x13]
-; CHECK-NEXT:    add x13, sp, #496
-; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x14]
-; CHECK-NEXT:    add x14, sp, #680
-; CHECK-NEXT:    ld1 { v17.b }[1], [x14]
-; CHECK-NEXT:    add x13, sp, #504
-; CHECK-NEXT:    ld1 { v3.b }[5], [x12]
-; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #552
-; CHECK-NEXT:    add x12, sp, #688
-; CHECK-NEXT:    ld1 { v16.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #624
-; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v3.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v0.b }[5], [x12]
+; CHECK-NEXT:    add x15, sp, #184
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    mov v2.b[4], w4
+; CHECK-NEXT:    add x14, sp, #504
+; CHECK-NEXT:    add x11, sp, #56
+; CHECK-NEXT:    ld1 { v1.b }[5], [x15]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x14]
+; CHECK-NEXT:    sshll v18.8h, v16.8b, #0
+; CHECK-NEXT:    ld1 { v3.b }[5], [x11]
 ; CHECK-NEXT:    add x11, sp, #560
-; CHECK-NEXT:    add x8, sp, #136
-; CHECK-NEXT:    ld1 { v17.b }[2], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
-; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v16.b }[2], [x11]
-; CHECK-NEXT:    add x8, sp, #512
-; CHECK-NEXT:    mov v0.b[3], w3
-; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #568
-; CHECK-NEXT:    add x9, sp, #696
-; CHECK-NEXT:    add x11, sp, #632
-; CHECK-NEXT:    ld1 { v17.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #520
-; CHECK-NEXT:    ld1 { v16.b }[3], [x8]
-; CHECK-NEXT:    ld1 { v5.b }[3], [x11]
-; CHECK-NEXT:    add x8, sp, #640
-; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #576
+; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #688
+; CHECK-NEXT:    add x14, sp, #192
+; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    add x12, sp, #512
+; CHECK-NEXT:    ld1 { v1.b }[6], [x14]
+; CHECK-NEXT:    mov v2.b[5], w5
+; CHECK-NEXT:    add x15, sp, #568
+; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
+; CHECK-NEXT:    add x14, sp, #696
+; CHECK-NEXT:    ld1 { v5.b }[4], [x12]
+; CHECK-NEXT:    add x9, sp, #632
+; CHECK-NEXT:    add x10, sp, #200
+; CHECK-NEXT:    ld1 { v6.b }[3], [x15]
+; CHECK-NEXT:    ld1 { v7.b }[3], [x14]
+; CHECK-NEXT:    ld1 { v4.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #520
+; CHECK-NEXT:    mov v17.s[0], v18.s[0]
+; CHECK-NEXT:    add x9, sp, #640
+; CHECK-NEXT:    ld1 { v5.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #576
 ; CHECK-NEXT:    add x11, sp, #704
 ; CHECK-NEXT:    ldr b18, [sp, #736]
-; CHECK-NEXT:    mov v0.b[4], w4
-; CHECK-NEXT:    ld1 { v17.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[4], [x8]
-; CHECK-NEXT:    add x9, sp, #528
+; CHECK-NEXT:    mov v2.b[6], w6
+; CHECK-NEXT:    ld1 { v6.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v7.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
+; CHECK-NEXT:    add x10, sp, #528
 ; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
-; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    add x9, sp, #648
 ; CHECK-NEXT:    add x11, sp, #584
 ; CHECK-NEXT:    add x12, sp, #712
-; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v16.b }[5], [x11]
-; CHECK-NEXT:    ld1 { v17.b }[5], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[5], [x8]
-; CHECK-NEXT:    mov v0.b[5], w5
-; CHECK-NEXT:    add x9, sp, #536
+; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
+; CHECK-NEXT:    add x16, sp, #64
+; CHECK-NEXT:    mov v2.b[7], w7
+; CHECK-NEXT:    add x10, sp, #536
 ; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
-; CHECK-NEXT:    add x8, sp, #656
+; CHECK-NEXT:    ld1 { v3.b }[6], [x16]
+; CHECK-NEXT:    add x9, sp, #656
 ; CHECK-NEXT:    add x11, sp, #592
 ; CHECK-NEXT:    add x12, sp, #720
-; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v16.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v17.b }[6], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[6], [x8]
-; CHECK-NEXT:    ldr b6, [sp, #208]
-; CHECK-NEXT:    add x10, sp, #64
-; CHECK-NEXT:    mov v7.s[0], v18.s[0]
-; CHECK-NEXT:    mov v0.b[6], w6
-; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
+; CHECK-NEXT:    add x8, sp, #72
+; CHECK-NEXT:    mov v16.s[0], v18.s[0]
+; CHECK-NEXT:    add x13, sp, #136
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #664
 ; CHECK-NEXT:    add x9, sp, #600
 ; CHECK-NEXT:    add x10, sp, #728
-; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    ld1 { v16.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
-; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    mov v0.b[7], w7
-; CHECK-NEXT:    add x9, sp, #200
-; CHECK-NEXT:    add x10, sp, #72
-; CHECK-NEXT:    saddw v7.4s, v7.4s, v4.4h
-; CHECK-NEXT:    sshll v6.4s, v6.4h, #0
-; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
-; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
 ; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v0.b }[7], [x13]
+; CHECK-NEXT:    ld1 { v6.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    saddw v17.4s, v17.4s, v2.4h
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v18.s[0], v6.s[0]
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddl2 v6.4s, v17.8h, v16.8h
-; CHECK-NEXT:    saddl2 v4.4s, v5.8h, v4.8h
-; CHECK-NEXT:    saddl v16.4s, v17.4h, v16.4h
-; CHECK-NEXT:    saddw v5.4s, v7.4s, v5.4h
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    saddl2 v17.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddw v0.4s, v18.4s, v0.4h
-; CHECK-NEXT:    saddl2 v7.4s, v3.8h, v2.8h
-; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
-; CHECK-NEXT:    add v5.4s, v5.4s, v16.4s
-; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    add v6.4s, v17.4s, v7.4s
-; CHECK-NEXT:    add v1.4s, v5.4s, v4.4s
+; CHECK-NEXT:    saddw v16.4s, v16.4s, v5.4h
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-NEXT:    saddl2 v18.4s, v3.8h, v1.8h
+; CHECK-NEXT:    saddl v1.4s, v3.4h, v1.4h
+; CHECK-NEXT:    saddl2 v2.4s, v2.8h, v0.8h
+; CHECK-NEXT:    saddw v0.4s, v17.4s, v0.4h
+; CHECK-NEXT:    saddl2 v3.4s, v7.8h, v6.8h
+; CHECK-NEXT:    saddl v6.4s, v7.4h, v6.4h
+; CHECK-NEXT:    saddl2 v5.4s, v4.8h, v5.8h
+; CHECK-NEXT:    saddw v4.4s, v16.4s, v4.4h
+; CHECK-NEXT:    add v2.4s, v2.4s, v18.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v6.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 365fe03ab0b0844..9a0e670d208ffc0 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -145,50 +145,19 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-LABEL: loop2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    subs w8, w2, #1
-; CHECK-NEXT:    b.lt .LBB1_7
+; CHECK-NEXT:    b.lt .LBB1_9
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    cmp w8, #2
-; CHECK-NEXT:    b.ls .LBB1_4
+; CHECK-NEXT:    b.ls .LBB1_6
 ; CHECK-NEXT:  // %bb.2: // %vector.memcheck
 ; CHECK-NEXT:    ubfiz x9, x8, #1, #32
 ; CHECK-NEXT:    add x9, x9, #2
 ; CHECK-NEXT:    add x10, x1, x9, lsl #2
-; CHECK-NEXT:    cmp...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/73550


More information about the llvm-commits mailing list