[llvm] [MachineLICM] Allow hoisting loads from invariant address (PR #70796)
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 31 05:48:37 PDT 2023
https://github.com/igogo-x86 created https://github.com/llvm/llvm-project/pull/70796
Sometimes, loads can appear in a loop after the final run of the LICM
pass. For example, the ExpandMemCmp pass creates loads in a loop, and
one of the operands may be an invariant address.
This patch extends the pre-regalloc stage of MachineLICM by allowing it
to hoist invariant loads out of loops that have no stores or calls and
that therefore permit load reordering.
>From 7f2a45b4b14cd0e507c4cecd242b09ac3ef3446a Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Tue, 31 Oct 2023 10:07:55 +0000
Subject: [PATCH 1/2] Pre-commit tests
---
.../AArch64/machine-licm-hoist-load.ll | 428 ++++++++++++++++++
1 file changed, 428 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
new file mode 100644
index 000000000000000..ec06bddc60c85f4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: one_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cbz x2, .LBB0_2
+; CHECK-NEXT: .LBB0_1: // %for.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x9, [x0], #8
+; CHECK-NEXT: ldr w10, [x1]
+; CHECK-NEXT: ldr w9, [x9]
+; CHECK-NEXT: cmp w9, w10
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x2, x2, #1
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: .LBB0_2: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %cmp4 = icmp eq i64 %N, 0
+ br i1 %cmp4, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.body ]
+ ret i64 %sum.0.lcssa
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
+ %0 = load ptr, ptr %arrayidx, align 8
+ %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.05, %add
+ %inc = add nuw i64 %i.06, 1
+ %exitcond = icmp eq i64 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: two_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cbz x2, .LBB1_6
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB1_6
+; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB1_4 Depth 2
+; CHECK-NEXT: ldr x10, [x0, x9, lsl #3]
+; CHECK-NEXT: mov x11, x3
+; CHECK-NEXT: .LBB1_4: // %for.body4
+; CHECK-NEXT: // Parent Loop BB1_3 Depth=1
+; CHECK-NEXT: // => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ldr x12, [x10], #8
+; CHECK-NEXT: ldr w13, [x1]
+; CHECK-NEXT: ldr w12, [x12]
+; CHECK-NEXT: cmp w12, w13
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x11, x11, #1
+; CHECK-NEXT: b.ne .LBB1_4
+; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge
+; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB1_3
+; CHECK-NEXT: .LBB1_6: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %cmp17 = icmp eq i64 %N, 0
+ %cmp214 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp17, %cmp214
+ br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge
+ %i.019 = phi i64 [ %inc7, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ]
+ %sum.018 = phi i64 [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.body4
+
+for.body4: ; preds = %for.cond1.preheader, %for.body4
+ %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+ %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
+ %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
+ %1 = load ptr, ptr %arrayidx5, align 8
+ %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.115, %add
+ %inc = add nuw i64 %j.016, 1
+ %exitcond = icmp eq i64 %inc, %M
+ br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge, label %for.body4
+
+for.cond1.for.cond.cleanup3_crit_edge: ; preds = %for.body4
+ %inc7 = add nuw i64 %i.019, 1
+ %exitcond22 = icmp eq i64 %inc7, %N
+ br i1 %exitcond22, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond.cleanup3_crit_edge ]
+ ret i64 %sum.0.lcssa
+}
+
+define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: three_dimensional_middle:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: cbz x2, .LBB2_9
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB2_9
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: cbz x4, .LBB2_9
+; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: .LBB2_4: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB2_5 Depth 2
+; CHECK-NEXT: // Child Loop BB2_6 Depth 3
+; CHECK-NEXT: ldr x10, [x8, x9, lsl #3]
+; CHECK-NEXT: mov x11, xzr
+; CHECK-NEXT: .LBB2_5: // %for.cond5.preheader
+; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB2_6 Depth 3
+; CHECK-NEXT: lsl x13, x11, #3
+; CHECK-NEXT: mov x14, x4
+; CHECK-NEXT: ldr x12, [x10, x13]
+; CHECK-NEXT: ldr x13, [x1, x13]
+; CHECK-NEXT: .LBB2_6: // %for.body8
+; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB2_5 Depth=2
+; CHECK-NEXT: // => This Inner Loop Header: Depth=3
+; CHECK-NEXT: ldr x15, [x12], #8
+; CHECK-NEXT: ldr w16, [x13]
+; CHECK-NEXT: ldr w15, [x15]
+; CHECK-NEXT: cmp w15, w16
+; CHECK-NEXT: cinc x0, x0, ne
+; CHECK-NEXT: subs x14, x14, #1
+; CHECK-NEXT: b.ne .LBB2_6
+; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond
+; CHECK-NEXT: // in Loop: Header=BB2_5 Depth=2
+; CHECK-NEXT: add x11, x11, #1
+; CHECK-NEXT: cmp x11, x3
+; CHECK-NEXT: b.ne .LBB2_5
+; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond
+; CHECK-NEXT: // in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB2_4
+; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %cmp33 = icmp eq i64 %N, 0
+ %cmp229 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp33, %cmp229
+ %cmp626 = icmp eq i64 %K, 0
+ %or.cond48 = or i1 %or.cond, %cmp626
+ br i1 %or.cond48, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond
+ %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
+ %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader
+ %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
+ %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
+ %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
+ %1 = load ptr, ptr %arrayidx9, align 8
+ %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031
+ %2 = load ptr, ptr %arrayidx11, align 8
+ br label %for.body8
+
+for.body8: ; preds = %for.body8, %for.cond5.preheader
+ %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
+ %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
+ %3 = load ptr, ptr %arrayidx10, align 8
+ %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.227, %add
+ %inc = add nuw i64 %k.028, 1
+ %exitcond = icmp eq i64 %inc, %K
+ br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
+
+for.cond5.for.cond: ; preds = %for.body8
+ %inc13 = add nuw i64 %j.031, 1
+ %exitcond46 = icmp eq i64 %inc13, %M
+ br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader
+
+for.cond1.for.cond: ; preds = %for.cond5.for.cond
+ %inc16 = add nuw i64 %i.035, 1
+ %exitcond47 = icmp eq i64 %inc16, %N
+ br i1 %exitcond47, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ]
+ ret i64 %sum.0.lcssa
+}
+
+define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
+; CHECK-LABEL: three_dimensional:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: cbz x2, .LBB3_9
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: cbz x3, .LBB3_9
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: cbz x4, .LBB3_9
+; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader
+; CHECK-NEXT: // =>This Loop Header: Depth=1
+; CHECK-NEXT: // Child Loop BB3_5 Depth 2
+; CHECK-NEXT: // Child Loop BB3_6 Depth 3
+; CHECK-NEXT: ldr x10, [x8, x9, lsl #3]
+; CHECK-NEXT: mov x11, xzr
+; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader
+; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
+; CHECK-NEXT: // => This Loop Header: Depth=2
+; CHECK-NEXT: // Child Loop BB3_6 Depth 3
+; CHECK-NEXT: ldr x12, [x10, x11, lsl #3]
+; CHECK-NEXT: mov x13, x4
+; CHECK-NEXT: .LBB3_6: // %for.body8
+; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
+; CHECK-NEXT: // Parent Loop BB3_5 Depth=2
+; CHECK-NEXT: // => This Inner Loop Header: Depth=3
+; CHECK-NEXT: ldr x14, [x12], #8
+; CHECK-NEXT: ldr w15, [x1]
+; CHECK-NEXT: ldr w14, [x14]
+; CHECK-NEXT: cmp w14, w15
+; CHECK-NEXT: cinc x0, x0, ne
+; CHECK-NEXT: subs x13, x13, #1
+; CHECK-NEXT: b.ne .LBB3_6
+; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond
+; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2
+; CHECK-NEXT: add x11, x11, #1
+; CHECK-NEXT: cmp x11, x3
+; CHECK-NEXT: b.ne .LBB3_5
+; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond
+; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: cmp x9, x2
+; CHECK-NEXT: b.ne .LBB3_4
+; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %cmp31 = icmp eq i64 %N, 0
+ %cmp227 = icmp eq i64 %M, 0
+ %or.cond = or i1 %cmp31, %cmp227
+ %cmp624 = icmp eq i64 %K, 0
+ %or.cond46 = or i1 %or.cond, %cmp624
+ br i1 %or.cond46, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.cond1.for.cond
+ %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
+ %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
+ %0 = load ptr, ptr %arrayidx, align 8
+ br label %for.cond5.preheader
+
+for.cond5.preheader: ; preds = %for.cond5.for.cond, %for.cond1.preheader
+ %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
+ %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
+ %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
+ %1 = load ptr, ptr %arrayidx9, align 8
+ br label %for.body8
+
+for.body8: ; preds = %for.body8, %for.cond5.preheader
+ %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
+ %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
+ %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
+ %2 = load ptr, ptr %arrayidx10, align 8
+ %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4)
+ %tobool = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool to i64
+ %spec.select = add i64 %sum.225, %add
+ %inc = add nuw i64 %k.026, 1
+ %exitcond = icmp eq i64 %inc, %K
+ br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
+
+for.cond5.for.cond: ; preds = %for.body8
+ %inc12 = add nuw i64 %j.029, 1
+ %exitcond44 = icmp eq i64 %inc12, %M
+ br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader
+
+for.cond1.for.cond: ; preds = %for.cond5.for.cond
+ %inc15 = add nuw i64 %i.033, 1
+ %exitcond45 = icmp eq i64 %inc15, %N
+ br i1 %exitcond45, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup: ; preds = %for.cond1.for.cond, %entry
+ %sum.0.lcssa = phi i64 [ 0, %entry ], [ %spec.select, %for.cond1.for.cond ]
+ ret i64 %sum.0.lcssa
+}
+
+define i32 @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N, i32 %M, i32 %K) {
+; CHECK-LABEL: one_dimensional_with_store:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp w3, #1
+; CHECK-NEXT: b.lt .LBB4_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov w8, w3
+; CHECK-NEXT: .LBB4_2: // %for.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x9, [x0], #8
+; CHECK-NEXT: ldr w10, [x1]
+; CHECK-NEXT: ldr w9, [x9]
+; CHECK-NEXT: rev w10, w10
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w9, w10
+; CHECK-NEXT: cset w9, hi
+; CHECK-NEXT: cset w10, lo
+; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: sub w9, w9, w10
+; CHECK-NEXT: strb w9, [x2], #1
+; CHECK-NEXT: b.ne .LBB4_2
+; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret i32 0
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
+ %0 = load ptr, ptr %arrayidx, align 8
+ %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4)
+ %conv = trunc i32 %call to i8
+ %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
+ store i8 %conv, ptr %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N, i32 %M, i32 %K) {
+; CHECK-LABEL: one_dimensional_with_call:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: cmp w2, #1
+; CHECK-NEXT: b.lt .LBB5_3
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x21, x0
+; CHECK-NEXT: mov w20, wzr
+; CHECK-NEXT: mov w22, w2
+; CHECK-NEXT: .LBB5_2: // %for.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x8, [x21], #8
+; CHECK-NEXT: ldr w9, [x19]
+; CHECK-NEXT: ldr w8, [x8]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cinc w20, w20, ne
+; CHECK-NEXT: bl func
+; CHECK-NEXT: subs x22, x22, #1
+; CHECK-NEXT: b.ne .LBB5_2
+; CHECK-NEXT: b .LBB5_4
+; CHECK-NEXT: .LBB5_3:
+; CHECK-NEXT: mov w20, wzr
+; CHECK-NEXT: .LBB5_4: // %for.cond.cleanup
+; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %spec.select, %for.body ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
+ %0 = load ptr, ptr %arrayidx, align 8
+ %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
+ %tobool.not = icmp ne i32 %bcmp, 0
+ %add = zext i1 %tobool.not to i32
+ %spec.select = add nuw nsw i32 %sum.05, %add
+ tail call void @func()
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+declare i32 @bcmp(ptr, ptr, i64)
+declare i32 @memcmp(ptr, ptr, i64)
+declare void @func()
>From 149ff643ec6703c0d844a76863b7c7006aa8f204 Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Tue, 31 Oct 2023 10:59:36 +0000
Subject: [PATCH 2/2] [MachineLICM] Allow hoisting loads from invariant address
Sometimes, loads can appear in a loop after the final run of the LICM
pass. For example, the ExpandMemCmp pass creates loads in a loop, and
one of the operands may be an invariant address.
This patch extends the pre-regalloc stage of MachineLICM by allowing it
to hoist invariant loads out of loops that have no stores or calls and
that therefore permit load reordering.
---
llvm/lib/CodeGen/MachineLICM.cpp | 79 +++++++++++++++----
.../AArch64/machine-licm-hoist-load.ll | 75 ++++++++++--------
.../AArch64/ragreedy-local-interval-cost.ll | 2 +-
llvm/test/CodeGen/AArch64/sinksplat.ll | 2 +-
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 4 +-
.../RISCV/rvv/fold-scalar-load-crash.ll | 48 +++++------
.../CodeGen/X86/2009-04-25-CoalescerBug.ll | 7 +-
llvm/test/CodeGen/X86/block-placement.ll | 3 +-
llvm/test/CodeGen/X86/fma-commute-loop.ll | 2 +-
llvm/test/CodeGen/X86/pr49393.ll | 15 ++--
llvm/test/CodeGen/X86/pr53842.ll | 14 ++--
11 files changed, 154 insertions(+), 97 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index e29f28ecaea0dce..f1af74328f0025a 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -72,6 +72,11 @@ static cl::opt<bool>
HoistConstStores("hoist-const-stores",
cl::desc("Hoist invariant stores"),
cl::init(true), cl::Hidden);
+
+static cl::opt<bool> HoistConstLoads("hoist-const-loads",
+ cl::desc("Hoist invariant loads"),
+ cl::init(true), cl::Hidden);
+
// The default threshold of 100 (i.e. if target block is 100 times hotter)
// is based on empirical data on a single target and is subject to tuning.
static cl::opt<unsigned>
@@ -222,9 +227,11 @@ namespace {
void AddToLiveIns(MCRegister Reg, MachineLoop *CurLoop);
- bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop);
+ bool IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad);
- bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop);
+ bool IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad);
bool HasLoopPHIUse(const MachineInstr *MI, MachineLoop *CurLoop);
@@ -277,7 +284,7 @@ namespace {
bool MayCSE(MachineInstr *MI);
unsigned Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
- MachineLoop *CurLoop);
+ MachineLoop *CurLoop, bool SafeToMoveLoad);
void InitCSEMap(MachineBasicBlock *BB);
@@ -494,7 +501,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// operands. FIXME: Consider unfold load folding instructions.
if (Def && !RuledOut) {
int FI = std::numeric_limits<int>::min();
- if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop)) ||
+ if ((!HasNonInvariantUse && IsLICMCandidate(*MI, CurLoop, false)) ||
(TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
Candidates.push_back(CandidateInfo(MI, Def, FI));
}
@@ -772,6 +779,32 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
BackTrace.clear();
InitRegPressure(Preheader);
+ // Compute information about whether it is allowed to move load instruction
+ // out of the current loop or one of the inner loops
+ SmallDenseMap<MachineLoop *, bool> AllowedToHoistLoads;
+ if (HoistConstLoads) {
+ SmallVector<MachineLoop *, 4> Worklist{CurLoop};
+
+ while (!Worklist.empty()) {
+ auto *L = Worklist.pop_back_val();
+ AllowedToHoistLoads[L] = true;
+ Worklist.insert(Worklist.end(), L->getSubLoops().begin(),
+ L->getSubLoops().end());
+ }
+
+ for (auto *MBB : CurLoop->blocks()) {
+ for (auto &MI : *MBB) {
+ if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef())) {
+ for (MachineLoop *L = MLI->getLoopFor(MI.getParent()); L != CurLoop;
+ L = L->getParentLoop())
+ AllowedToHoistLoads[L] = false;
+ AllowedToHoistLoads[CurLoop] = false;
+ break;
+ }
+ }
+ }
+ }
+
// Now perform LICM.
for (MachineDomTreeNode *Node : Scopes) {
MachineBasicBlock *MBB = Node->getBlock();
@@ -780,9 +813,23 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
// Process the block
SpeculationState = SpeculateUnknown;
+
+ auto CanMoveLoad = [](MachineLoop *L) -> bool {
+ dbgs() << L << "\n";
+ for (auto *MBB : L->blocks()) {
+ for (auto &MI : *MBB) {
+ // Taken from MachineInstr::isSafeToMove
+ if (MI.mayStore() || MI.isCall() || (MI.mayLoad() && MI.hasOrderedMemoryRef()))
+ return false;
+ }
+ }
+ return true;
+ };
+
+ bool SafeToMoveLoad = HoistConstLoads && AllowedToHoistLoads[CurLoop];
for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
unsigned HoistRes = HoistResult::NotHoisted;
- HoistRes = Hoist(&MI, Preheader, CurLoop);
+ HoistRes = Hoist(&MI, Preheader, CurLoop, SafeToMoveLoad);
if (HoistRes & HoistResult::NotHoisted) {
// We have failed to hoist MI to outermost loop's preheader. If MI is in
// a subloop, try to hoist it to subloop's preheader.
@@ -793,9 +840,12 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN,
while (!InnerLoopWorkList.empty()) {
MachineLoop *InnerLoop = InnerLoopWorkList.pop_back_val();
+ bool SafeToMoveLoadInner =
+ HoistConstLoads && AllowedToHoistLoads[InnerLoop];
MachineBasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
if (InnerLoopPreheader) {
- HoistRes = Hoist(&MI, InnerLoopPreheader, InnerLoop);
+ HoistRes =
+ Hoist(&MI, InnerLoopPreheader, InnerLoop, SafeToMoveLoadInner);
if (HoistRes & HoistResult::Hoisted)
break;
}
@@ -990,9 +1040,10 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
/// Returns true if the instruction may be a suitable candidate for LICM.
/// e.g. If the instruction is a call, then it's obviously not safe to hoist it.
-bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
+bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad) {
// Check if it's safe to move the instruction.
- bool DontMoveAcrossStore = true;
+ bool DontMoveAcrossStore = !SafeToMoveLoad;
if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
!(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
@@ -1025,9 +1076,9 @@ bool MachineLICMBase::IsLICMCandidate(MachineInstr &I, MachineLoop *CurLoop) {
}
/// Returns true if the instruction is loop invariant.
-bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I,
- MachineLoop *CurLoop) {
- if (!IsLICMCandidate(I, CurLoop)) {
+bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I, MachineLoop *CurLoop,
+ bool SafeToMoveLoad) {
+ if (!IsLICMCandidate(I, CurLoop, SafeToMoveLoad)) {
LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n");
return false;
}
@@ -1305,7 +1356,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI,
MBB->insert(Pos, NewMIs[1]);
// If unfolding produced a load that wasn't loop-invariant or profitable to
// hoist, discard the new instructions and bail.
- if (!IsLoopInvariantInst(*NewMIs[0], CurLoop) ||
+ if (!IsLoopInvariantInst(*NewMIs[0], CurLoop, /*SaveToMovLoad=*/false) ||
!IsProfitableToHoist(*NewMIs[0], CurLoop)) {
NewMIs[0]->eraseFromParent();
NewMIs[1]->eraseFromParent();
@@ -1432,7 +1483,7 @@ bool MachineLICMBase::MayCSE(MachineInstr *MI) {
/// that are safe to hoist, this instruction is called to do the dirty work.
/// It returns true if the instruction is hoisted.
unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
- MachineLoop *CurLoop) {
+ MachineLoop *CurLoop, bool SafeToMoveLoad) {
MachineBasicBlock *SrcBlock = MI->getParent();
// Disable the instruction hoisting due to block hotness
@@ -1444,7 +1495,7 @@ unsigned MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader,
}
// First check whether we should hoist this instruction.
bool HasExtractHoistableLoad = false;
- if (!IsLoopInvariantInst(*MI, CurLoop) ||
+ if (!IsLoopInvariantInst(*MI, CurLoop, SafeToMoveLoad) ||
!IsProfitableToHoist(*MI, CurLoop)) {
// If not, try unfolding a hoistable load.
MI = ExtractHoistableLoad(MI, CurLoop);
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index ec06bddc60c85f4..6b76b03fe00fc81 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -4,18 +4,23 @@
define i64 @one_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: one_dimensional:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz x2, .LBB0_4
+; CHECK-NEXT: // %bb.1: // %for.body.preheader
+; CHECK-NEXT: ldr w9, [x1]
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: cbz x2, .LBB0_2
-; CHECK-NEXT: .LBB0_1: // %for.body
+; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr x9, [x0], #8
-; CHECK-NEXT: ldr w10, [x1]
-; CHECK-NEXT: ldr w9, [x9]
-; CHECK-NEXT: cmp w9, w10
+; CHECK-NEXT: ldr x10, [x0], #8
+; CHECK-NEXT: ldr w10, [x10]
+; CHECK-NEXT: cmp w10, w9
; CHECK-NEXT: cinc x8, x8, ne
; CHECK-NEXT: subs x2, x2, #1
-; CHECK-NEXT: b.ne .LBB0_1
-; CHECK-NEXT: .LBB0_2: // %for.cond.cleanup
+; CHECK-NEXT: b.ne .LBB0_2
+; CHECK-NEXT: // %bb.3: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ret
entry:
@@ -48,22 +53,22 @@ define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: cbz x3, .LBB1_6
; CHECK-NEXT: // %bb.2: // %for.cond1.preheader.preheader
+; CHECK-NEXT: ldr w10, [x1]
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB1_3: // %for.cond1.preheader
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB1_4 Depth 2
-; CHECK-NEXT: ldr x10, [x0, x9, lsl #3]
-; CHECK-NEXT: mov x11, x3
+; CHECK-NEXT: ldr x11, [x0, x9, lsl #3]
+; CHECK-NEXT: mov x12, x3
; CHECK-NEXT: .LBB1_4: // %for.body4
; CHECK-NEXT: // Parent Loop BB1_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr x12, [x10], #8
-; CHECK-NEXT: ldr w13, [x1]
-; CHECK-NEXT: ldr w12, [x12]
-; CHECK-NEXT: cmp w12, w13
+; CHECK-NEXT: ldr x13, [x11], #8
+; CHECK-NEXT: ldr w13, [x13]
+; CHECK-NEXT: cmp w13, w10
; CHECK-NEXT: cinc x8, x8, ne
-; CHECK-NEXT: subs x11, x11, #1
+; CHECK-NEXT: subs x12, x12, #1
; CHECK-NEXT: b.ne .LBB1_4
; CHECK-NEXT: // %bb.5: // %for.cond1.for.cond.cleanup3_crit_edge
; CHECK-NEXT: // in Loop: Header=BB1_3 Depth=1
@@ -132,18 +137,18 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB2_6 Depth 3
-; CHECK-NEXT: lsl x13, x11, #3
+; CHECK-NEXT: lsl x12, x11, #3
; CHECK-NEXT: mov x14, x4
-; CHECK-NEXT: ldr x12, [x10, x13]
-; CHECK-NEXT: ldr x13, [x1, x13]
+; CHECK-NEXT: ldr x13, [x1, x12]
+; CHECK-NEXT: ldr x12, [x10, x12]
+; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: .LBB2_6: // %for.body8
; CHECK-NEXT: // Parent Loop BB2_4 Depth=1
; CHECK-NEXT: // Parent Loop BB2_5 Depth=2
; CHECK-NEXT: // => This Inner Loop Header: Depth=3
; CHECK-NEXT: ldr x15, [x12], #8
-; CHECK-NEXT: ldr w16, [x13]
; CHECK-NEXT: ldr w15, [x15]
-; CHECK-NEXT: cmp w15, w16
+; CHECK-NEXT: cmp w15, w13
; CHECK-NEXT: cinc x0, x0, ne
; CHECK-NEXT: subs x14, x14, #1
; CHECK-NEXT: b.ne .LBB2_6
@@ -214,43 +219,42 @@ for.cond.cleanup: ; preds = %for.cond1.for.cond,
define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: three_dimensional:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: cbz x2, .LBB3_9
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: cbz x3, .LBB3_9
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: cbz x4, .LBB3_9
; CHECK-NEXT: // %bb.3: // %for.cond1.preheader.preheader
+; CHECK-NEXT: ldr w10, [x1]
; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB3_4: // %for.cond1.preheader
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB3_5 Depth 2
; CHECK-NEXT: // Child Loop BB3_6 Depth 3
-; CHECK-NEXT: ldr x10, [x8, x9, lsl #3]
-; CHECK-NEXT: mov x11, xzr
+; CHECK-NEXT: ldr x11, [x0, x9, lsl #3]
+; CHECK-NEXT: mov x12, xzr
; CHECK-NEXT: .LBB3_5: // %for.cond5.preheader
; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB3_6 Depth 3
-; CHECK-NEXT: ldr x12, [x10, x11, lsl #3]
-; CHECK-NEXT: mov x13, x4
+; CHECK-NEXT: ldr x13, [x11, x12, lsl #3]
+; CHECK-NEXT: mov x14, x4
; CHECK-NEXT: .LBB3_6: // %for.body8
; CHECK-NEXT: // Parent Loop BB3_4 Depth=1
; CHECK-NEXT: // Parent Loop BB3_5 Depth=2
; CHECK-NEXT: // => This Inner Loop Header: Depth=3
-; CHECK-NEXT: ldr x14, [x12], #8
-; CHECK-NEXT: ldr w15, [x1]
-; CHECK-NEXT: ldr w14, [x14]
-; CHECK-NEXT: cmp w14, w15
-; CHECK-NEXT: cinc x0, x0, ne
-; CHECK-NEXT: subs x13, x13, #1
+; CHECK-NEXT: ldr x15, [x13], #8
+; CHECK-NEXT: ldr w15, [x15]
+; CHECK-NEXT: cmp w15, w10
+; CHECK-NEXT: cinc x8, x8, ne
+; CHECK-NEXT: subs x14, x14, #1
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: // %bb.7: // %for.cond5.for.cond
; CHECK-NEXT: // in Loop: Header=BB3_5 Depth=2
-; CHECK-NEXT: add x11, x11, #1
-; CHECK-NEXT: cmp x11, x3
+; CHECK-NEXT: add x12, x12, #1
+; CHECK-NEXT: cmp x12, x3
; CHECK-NEXT: b.ne .LBB3_5
; CHECK-NEXT: // %bb.8: // %for.cond1.for.cond
; CHECK-NEXT: // in Loop: Header=BB3_4 Depth=1
@@ -258,6 +262,7 @@ define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: cmp x9, x2
; CHECK-NEXT: b.ne .LBB3_4
; CHECK-NEXT: .LBB3_9: // %for.cond.cleanup
+; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: ret
entry:
%cmp31 = icmp eq i64 %N, 0
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index 419f25c22eb7247..178336870373ec6 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -hoist-const-loads=false < %s | FileCheck %s
@A = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
@B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index ca51c7c85d2c9c3..cb63a4d78dc2524 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -hoist-const-loads=false -o - %s | FileCheck %s
define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
; CHECK-LABEL: smull:
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index f24abb568400099..e1f1f5495e9afda 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -hoist-const-loads=false -o - %s | FileCheck --check-prefix=CHECK-BE %s
; CHECK-LABEL: lCPI0_0:
; CHECK-NEXT: .byte 0 ; 0x0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
index 96fdbfc6d097499..79b1e14b774a4f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
@@ -7,49 +7,49 @@
define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
; RV32-LABEL: test:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi a3, a2, 1
+; RV32-NEXT: th.lbib a3, (a1), -1, 0
+; RV32-NEXT: th.lrb a0, a1, a0, 0
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a3
+; RV32-NEXT: addi a1, a2, 1
; RV32-NEXT: .LBB0_1: # %for.body
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: mv a4, a1
-; RV32-NEXT: th.lbib a5, (a4), -1, 0
-; RV32-NEXT: th.lrb a4, a4, a0, 0
-; RV32-NEXT: vmv.v.x v8, a5
; RV32-NEXT: vmv.s.x v9, zero
-; RV32-NEXT: vsetvli zero, a3, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v8, v9, a2
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vslideup.vx v10, v9, a2
; RV32-NEXT: vsetivli zero, 8, e8, mf2, tu, ma
-; RV32-NEXT: vmv.s.x v8, a4
+; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV32-NEXT: vmseq.vi v8, v8, 0
-; RV32-NEXT: vmv.x.s a4, v8
-; RV32-NEXT: andi a4, a4, 255
-; RV32-NEXT: bnez a4, .LBB0_1
+; RV32-NEXT: vmseq.vi v9, v10, 0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: andi a3, a3, 255
+; RV32-NEXT: bnez a3, .LBB0_1
; RV32-NEXT: # %bb.2: # %if.then381
; RV32-NEXT: li a0, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test:
; RV64: # %bb.0: # %entry
+; RV64-NEXT: th.lbib a3, (a1), -1, 0
; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: addi a3, a2, 1
+; RV64-NEXT: th.lrb a0, a1, a0, 0
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a3
+; RV64-NEXT: addi a1, a2, 1
; RV64-NEXT: .LBB0_1: # %for.body
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
-; RV64-NEXT: mv a4, a1
-; RV64-NEXT: th.lbib a5, (a4), -1, 0
-; RV64-NEXT: th.lrb a4, a4, a0, 0
-; RV64-NEXT: vmv.v.x v8, a5
; RV64-NEXT: vmv.s.x v9, zero
-; RV64-NEXT: vsetvli zero, a3, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v8, v9, a2
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vslideup.vx v10, v9, a2
; RV64-NEXT: vsetivli zero, 8, e8, mf2, tu, ma
-; RV64-NEXT: vmv.s.x v8, a4
+; RV64-NEXT: vmv.s.x v10, a0
; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
-; RV64-NEXT: vmseq.vi v8, v8, 0
-; RV64-NEXT: vmv.x.s a4, v8
-; RV64-NEXT: andi a4, a4, 255
-; RV64-NEXT: bnez a4, .LBB0_1
+; RV64-NEXT: vmseq.vi v9, v10, 0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: andi a3, a3, 255
+; RV64-NEXT: bnez a3, .LBB0_1
; RV64-NEXT: # %bb.2: # %if.then381
; RV64-NEXT: li a0, 0
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
index 8494d87e1e0f2ba..ce28893090c43fe 100644
--- a/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
@@ -5,14 +5,15 @@
define i64 @test(ptr %tmp13) nounwind {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl (%rdi), %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: shrl %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %while.cond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl (%rdi), %eax
-; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: jne .LBB0_1
; CHECK-NEXT: # %bb.2: # %while.end
-; CHECK-NEXT: shrl %eax
; CHECK-NEXT: retq
entry:
br label %while.cond
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index a522f0e9828a054..2f9635db34a3308 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -318,8 +318,7 @@ define void @unnatural_cfg1() {
; CHECK-LABEL: unnatural_cfg1
; CHECK: %entry
; CHECK: %loop.header
-; CHECK: %loop.body2
-; CHECK: %loop.body3
+; CHECK: %loop.body5
entry:
br label %loop.header
diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll
index 833137fa6cd6dce..a22e5d2e5e0c350 100644
--- a/llvm/test/CodeGen/X86/fma-commute-loop.ll
+++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f -hoist-const-loads=false | FileCheck %s
define void @eggs(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, ptr %arg13, ptr %arg14) nounwind {
; CHECK-LABEL: eggs:
diff --git a/llvm/test/CodeGen/X86/pr49393.ll b/llvm/test/CodeGen/X86/pr49393.ll
index f7bc71d29b07b2b..3fb6a82f3fbb2cc 100644
--- a/llvm/test/CodeGen/X86/pr49393.ll
+++ b/llvm/test/CodeGen/X86/pr49393.ll
@@ -5,14 +5,14 @@ define void @f() {
; CHECK-LABEL: f:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB0_1: # %for.cond
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: imull %eax, %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: mulsd %xmm0, %xmm1
; CHECK-NEXT: subsd %xmm0, %xmm1
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %for.cond
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: imull %eax, %eax
; CHECK-NEXT: cwtl
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2sd %eax, %xmm2
@@ -21,9 +21,10 @@ define void @f() {
; CHECK-NEXT: movapd %xmm2, %xmm3
; CHECK-NEXT: mulsd %xmm1, %xmm3
; CHECK-NEXT: mulsd %xmm0, %xmm2
-; CHECK-NEXT: subsd %xmm3, %xmm1
-; CHECK-NEXT: addsd %xmm2, %xmm1
-; CHECK-NEXT: cvttsd2si %xmm1, %eax
+; CHECK-NEXT: movapd %xmm1, %xmm4
+; CHECK-NEXT: subsd %xmm3, %xmm4
+; CHECK-NEXT: addsd %xmm2, %xmm4
+; CHECK-NEXT: cvttsd2si %xmm4, %eax
; CHECK-NEXT: jmp .LBB0_1
entry:
br label %for.cond
diff --git a/llvm/test/CodeGen/X86/pr53842.ll b/llvm/test/CodeGen/X86/pr53842.ll
index 4a3f751eb1d3c92..89f04e3373ae8c3 100644
--- a/llvm/test/CodeGen/X86/pr53842.ll
+++ b/llvm/test/CodeGen/X86/pr53842.ll
@@ -8,16 +8,16 @@ define void @PR53842() {
; CHECK-LABEL: PR53842:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm3, %ymm3
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %ymm3
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %ymm2
-; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; CHECK-NEXT: vpsubq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4
+; CHECK-NEXT: vpsubq %zmm4, %zmm0, %zmm0
; CHECK-NEXT: jmp .LBB0_1
entry:
br label %vector.body
More information about the llvm-commits
mailing list