[llvm] [MachineLICM] Let targets decide if copy-like instructions are cheap (PR #146599)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 1 13:38:45 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
@llvm/pr-subscribers-backend-aarch64
Author: Guy David (guy-david)
<details>
<summary>Changes</summary>
When checking whether it is profitable to hoist an instruction, the pass may override a target's ruling because it assumes that all COPY instructions are cheap. That may not be the case for all micro-architectures, especially when copying between different register classes.
On AArch64 there is a 0% performance difference in LLVM's test-suite with this change. Additionally, very few tests were affected, which suggests the heuristic is not worth keeping.
---
Patch is 41.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146599.diff
9 Files Affected:
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+1-1)
- (modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+21-21)
- (modified) llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll (+139-119)
- (modified) llvm/test/CodeGen/X86/break-false-dep.ll (+22-22)
- (modified) llvm/test/CodeGen/X86/dag-update-nodetomatch.ll (+38-40)
- (removed) llvm/test/CodeGen/X86/memfold-mov32r0.ll (-9)
- (added) llvm/test/CodeGen/X86/memfold-mov32r0.mir (+143)
- (modified) llvm/test/CodeGen/X86/pr57673.ll (+18-18)
- (modified) llvm/test/CodeGen/X86/reverse_branches.ll (+5-5)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index c9079170ca575..70a178f642fb0 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1219,7 +1219,7 @@ bool MachineLICMImpl::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
/// Return true if the instruction is marked "cheap" or the operand latency
/// between its def and a use is one or less.
bool MachineLICMImpl::IsCheapInstruction(MachineInstr &MI) const {
- if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
+ if (TII->isAsCheapAsAMove(MI))
return true;
bool isCheap = false;
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 53126a08db86f..2bd04ac30509e 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -18,14 +18,28 @@ define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: mvn w8, w8
+; CHECK-NEXT: uunpklo z1.h, z0.b
+; CHECK-NEXT: uunpkhi z2.h, z0.b
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEXT: uunpkhi z4.s, z1.h
+; CHECK-NEXT: uunpklo z6.s, z2.h
+; CHECK-NEXT: uunpkhi z16.s, z2.h
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p4.h, p1.b
+; CHECK-NEXT: uunpklo z1.d, z3.s
+; CHECK-NEXT: uunpkhi z2.d, z3.s
; CHECK-NEXT: punpklo p6.h, p0.b
+; CHECK-NEXT: uunpklo z3.d, z4.s
+; CHECK-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: uunpklo z5.d, z6.s
+; CHECK-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEXT: uunpklo z7.d, z16.s
+; CHECK-NEXT: uunpkhi z16.d, z16.s
; CHECK-NEXT: punpklo p1.h, p2.b
; CHECK-NEXT: punpkhi p2.h, p2.b
; CHECK-NEXT: punpklo p3.h, p4.b
@@ -35,28 +49,14 @@ define fastcc i8 @allocno_reload_assign(ptr %p) {
; CHECK-NEXT: punpklo p7.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: uunpklo z1.h, z0.b
-; CHECK-NEXT: uunpklo z2.s, z1.h
-; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: uunpklo z3.d, z2.s
-; CHECK-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEXT: st1b { z3.d }, p1, [z0.d]
+; CHECK-NEXT: st1b { z1.d }, p1, [z0.d]
; CHECK-NEXT: st1b { z2.d }, p2, [z0.d]
-; CHECK-NEXT: uunpklo z2.d, z1.s
-; CHECK-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEXT: st1b { z2.d }, p3, [z0.d]
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEXT: st1b { z1.d }, p4, [z0.d]
-; CHECK-NEXT: uunpklo z1.d, z3.s
-; CHECK-NEXT: st1b { z1.d }, p5, [z0.d]
-; CHECK-NEXT: uunpkhi z1.d, z3.s
-; CHECK-NEXT: st1b { z1.d }, p6, [z0.d]
-; CHECK-NEXT: uunpklo z1.d, z2.s
-; CHECK-NEXT: st1b { z1.d }, p7, [z0.d]
-; CHECK-NEXT: uunpkhi z1.d, z2.s
-; CHECK-NEXT: st1b { z1.d }, p0, [z0.d]
+; CHECK-NEXT: st1b { z3.d }, p3, [z0.d]
+; CHECK-NEXT: st1b { z4.d }, p4, [z0.d]
+; CHECK-NEXT: st1b { z5.d }, p5, [z0.d]
+; CHECK-NEXT: st1b { z6.d }, p6, [z0.d]
+; CHECK-NEXT: st1b { z7.d }, p7, [z0.d]
+; CHECK-NEXT: st1b { z16.d }, p0, [z0.d]
; CHECK-NEXT: str p8, [x0]
; CHECK-NEXT: b .LBB0_1
br label %1
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
index 9cb2d4444b974..6cfb8b0e73f7c 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -1,17 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;; Tests that the ppc-vsx-fma-mutate pass with the schedule-ppc-vsx-fma-mutation-early pass does not hoist xxspltiw out of loops.
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
-; RUN: -mtriple powerpc64-ibm-aix < %s | FileCheck --check-prefixes=CHECK64,AIX64 %s
+; RUN: -mtriple powerpc64-ibm-aix < %s | FileCheck --check-prefixes=AIX64 %s
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
-; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --check-prefixes=CHECK64,LINUX64 %s
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --check-prefixes=LINUX64 %s
; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \
; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \
; RUN: -mtriple powerpc-ibm-aix < %s | FileCheck --check-prefix=CHECK32 %s
define void @bar(ptr noalias nocapture noundef writeonly %__output_a, ptr noalias nocapture noundef readonly %var1321In_a, ptr noalias nocapture noundef readonly %n) {
+; AIX64-LABEL: bar:
+; AIX64: # %bb.0: # %entry
+; AIX64-NEXT: lwz r5, 0(r5)
+; AIX64-NEXT: cmpwi r5, 1
+; AIX64-NEXT: bltlr cr0
+; AIX64-NEXT: # %bb.1: # %for.body.preheader
+; AIX64-NEXT: xxspltiw vs0, 1069066811
+; AIX64-NEXT: xxspltiw vs1, 1170469888
+; AIX64-NEXT: mtctr r5
+; AIX64-NEXT: li r5, 0
+; AIX64-NEXT: .align 5
+; AIX64-NEXT: L..BB0_2: # %for.body
+; AIX64-NEXT: #
+; AIX64-NEXT: lxvx vs2, r4, r5
+; AIX64-NEXT: xvmaddmsp vs2, vs0, vs1
+; AIX64-NEXT: stxvx vs2, r3, r5
+; AIX64-NEXT: addi r5, r5, 16
+; AIX64-NEXT: bdnz L..BB0_2
+; AIX64-NEXT: # %bb.3: # %for.end
+; AIX64-NEXT: blr
+;
+; LINUX64-LABEL: bar:
+; LINUX64: # %bb.0: # %entry
+; LINUX64-NEXT: lwz r5, 0(r5)
+; LINUX64-NEXT: cmpwi r5, 1
+; LINUX64-NEXT: bltlr cr0
+; LINUX64-NEXT: # %bb.1: # %for.body.preheader
+; LINUX64-NEXT: xxspltiw vs0, 1069066811
+; LINUX64-NEXT: xxspltiw vs1, 1170469888
+; LINUX64-NEXT: mtctr r5
+; LINUX64-NEXT: li r5, 0
+; LINUX64-NEXT: .p2align 5
+; LINUX64-NEXT: .LBB0_2: # %for.body
+; LINUX64-NEXT: #
+; LINUX64-NEXT: lxvx vs2, r4, r5
+; LINUX64-NEXT: xvmaddmsp vs2, vs0, vs1
+; LINUX64-NEXT: stxvx vs2, r3, r5
+; LINUX64-NEXT: addi r5, r5, 16
+; LINUX64-NEXT: bdnz .LBB0_2
+; LINUX64-NEXT: # %bb.3: # %for.end
+; LINUX64-NEXT: blr
+;
+; CHECK32-LABEL: bar:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: lwz r5, 0(r5)
+; CHECK32-NEXT: cmpwi r5, 0
+; CHECK32-NEXT: blelr cr0
+; CHECK32-NEXT: # %bb.1: # %for.body.preheader
+; CHECK32-NEXT: xxspltiw vs0, 1069066811
+; CHECK32-NEXT: xxspltiw vs1, 1170469888
+; CHECK32-NEXT: li r6, 0
+; CHECK32-NEXT: li r7, 0
+; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: L..BB0_2: # %for.body
+; CHECK32-NEXT: #
+; CHECK32-NEXT: slwi r8, r7, 4
+; CHECK32-NEXT: addic r7, r7, 1
+; CHECK32-NEXT: addze r6, r6
+; CHECK32-NEXT: lxvx vs2, r4, r8
+; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1
+; CHECK32-NEXT: stxvx vs2, r3, r8
+; CHECK32-NEXT: xor r8, r7, r5
+; CHECK32-NEXT: or. r8, r8, r6
+; CHECK32-NEXT: bne cr0, L..BB0_2
+; CHECK32-NEXT: # %bb.3: # %for.end
+; CHECK32-NEXT: blr
entry:
%0 = load i32, ptr %n, align 4
%cmp11 = icmp sgt i32 %0, 0
@@ -28,7 +95,7 @@ for.body:
%add.ptr.val = load <4 x float>, ptr %add.ptr, align 1
%2 = tail call contract <4 x float> @llvm.fma.v4f32(<4 x float> %add.ptr.val, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 6.270500e+03, float 6.270500e+03, float 6.270500e+03, float 6.270500e+03>)
%add.ptr6 = getelementptr inbounds float, ptr %__output_a, i64 %1
- store <4 x float> %2, ptr %add.ptr6, align 1
+ store <4 x float> %2, ptr %add.ptr6, align 1
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.end, label %for.body
@@ -38,6 +105,74 @@ for.end:
}
define void @foo(i1 %cmp97) #0 {
+; AIX64-LABEL: foo:
+; AIX64: # %bb.0: # %entry
+; AIX64-NEXT: andi. r3, r3, 1
+; AIX64-NEXT: bclr 4, gt, 0
+; AIX64-NEXT: # %bb.1: # %for.body.preheader
+; AIX64-NEXT: xxlxor f0, f0, f0
+; AIX64-NEXT: xxlxor f2, f2, f2
+; AIX64-NEXT: xxmrghd vs1, vs0, vs0
+; AIX64-NEXT: xvcvdpsp vs34, vs1
+; AIX64-NEXT: xxlxor vs1, vs1, vs1
+; AIX64-NEXT: .align 4
+; AIX64-NEXT: L..BB1_2: # %for.body
+; AIX64-NEXT: #
+; AIX64-NEXT: xxmrghd vs2, vs2, vs0
+; AIX64-NEXT: xvcvdpsp vs35, vs2
+; AIX64-NEXT: xxspltiw vs2, 1170469888
+; AIX64-NEXT: vmrgew v3, v3, v2
+; AIX64-NEXT: xvcmpgtsp vs3, vs1, vs35
+; AIX64-NEXT: xvmaddasp vs2, vs35, vs1
+; AIX64-NEXT: xxland vs2, vs3, vs2
+; AIX64-NEXT: xscvspdpn f2, vs2
+; AIX64-NEXT: b L..BB1_2
+;
+; LINUX64-LABEL: foo:
+; LINUX64: # %bb.0: # %entry
+; LINUX64-NEXT: andi. r3, r3, 1
+; LINUX64-NEXT: bclr 4, gt, 0
+; LINUX64-NEXT: # %bb.1: # %for.body.preheader
+; LINUX64-NEXT: xxlxor f0, f0, f0
+; LINUX64-NEXT: xxlxor f2, f2, f2
+; LINUX64-NEXT: xxspltd vs1, vs0, 0
+; LINUX64-NEXT: xvcvdpsp vs34, vs1
+; LINUX64-NEXT: xxlxor vs1, vs1, vs1
+; LINUX64-NEXT: .p2align 4
+; LINUX64-NEXT: .LBB1_2: # %for.body
+; LINUX64-NEXT: #
+; LINUX64-NEXT: xxmrghd vs2, vs0, vs2
+; LINUX64-NEXT: xvcvdpsp vs35, vs2
+; LINUX64-NEXT: xxspltiw vs2, 1170469888
+; LINUX64-NEXT: vmrgew v3, v2, v3
+; LINUX64-NEXT: xvcmpgtsp vs3, vs1, vs35
+; LINUX64-NEXT: xvmaddasp vs2, vs35, vs1
+; LINUX64-NEXT: xxland vs2, vs3, vs2
+; LINUX64-NEXT: xxsldwi vs2, vs2, vs2, 3
+; LINUX64-NEXT: xscvspdpn f2, vs2
+; LINUX64-NEXT: b .LBB1_2
+;
+; CHECK32-LABEL: foo:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: andi. r3, r3, 1
+; CHECK32-NEXT: bclr 4, gt, 0
+; CHECK32-NEXT: # %bb.1: # %for.body.preheader
+; CHECK32-NEXT: lwz r3, L..C0(r2) # %const.0
+; CHECK32-NEXT: xxlxor f1, f1, f1
+; CHECK32-NEXT: xxlxor vs0, vs0, vs0
+; CHECK32-NEXT: xscvdpspn vs35, f1
+; CHECK32-NEXT: lxv vs34, 0(r3)
+; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: L..BB1_2: # %for.body
+; CHECK32-NEXT: #
+; CHECK32-NEXT: xscvdpspn vs36, f1
+; CHECK32-NEXT: xxspltiw vs1, 1170469888
+; CHECK32-NEXT: vperm v4, v4, v3, v2
+; CHECK32-NEXT: xvcmpgtsp vs2, vs0, vs36
+; CHECK32-NEXT: xvmaddasp vs1, vs36, vs0
+; CHECK32-NEXT: xxland vs1, vs2, vs1
+; CHECK32-NEXT: xscvspdpn f1, vs1
+; CHECK32-NEXT: b L..BB1_2
entry:
br i1 %cmp97, label %for.body, label %for.end
@@ -57,122 +192,7 @@ for.end: ; preds = %entry
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>)
-
-; CHECK64: bar:
-; CHECK64: # %bb.0: # %entry
-; CHECK64-NEXT: lwz r5, 0(r5)
-; CHECK64-NEXT: cmpwi r5, 1
-; CHECK64-NEXT: bltlr cr0
-; CHECK64-NEXT: # %bb.1: # %for.body.preheader
-; CHECK64-NEXT: xxspltiw vs0, 1069066811
-; CHECK64-NEXT: xxspltiw vs1, 1170469888
-; CHECK64-NEXT: mtctr r5
-; CHECK64-NEXT: li r5, 0
-; CHECK64-NEXT: {{.*}}align 5
-; CHECK64-NEXT: [[L2_bar:.*]]: # %for.body
-; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK64-NEXT: lxvx vs2, r4, r5
-; CHECK64-NEXT: xvmaddmsp vs2, vs0, vs1
-; CHECK64-NEXT: stxvx vs2, r3, r5
-; CHECK64-NEXT: addi r5, r5, 16
-; CHECK64-NEXT: bdnz [[L2_bar]]
-; CHECK64-NEXT: # %bb.3: # %for.end
-; CHECK64-NEXT: blr
-
-; AIX64: .foo:
-; AIX64-NEXT: # %bb.0: # %entry
-; AIX64-NEXT: andi. r3, r3, 1
-; AIX64-NEXT: bclr 4, gt, 0
-; AIX64-NEXT: # %bb.1: # %for.body.preheader
-; AIX64-NEXT: xxlxor f0, f0, f0
-; AIX64-NEXT: xxlxor vs1, vs1, vs1
-; AIX64-NEXT: xxlxor f2, f2, f2
-; AIX64-NEXT: .align 4
-; AIX64-NEXT: L..BB1_2: # %for.body
-; AIX64-NEXT: # =>This Inner Loop Header: Depth=1
-; AIX64-NEXT: xxmrghd vs2, vs2, vs0
-; AIX64-NEXT: xvcvdpsp vs34, vs2
-; AIX64-NEXT: xxmrghd vs2, vs0, vs0
-; AIX64-NEXT: xvcvdpsp vs35, vs2
-; AIX64-NEXT: xxspltiw vs2, 1170469888
-; AIX64-NEXT: vmrgew v2, v2, v3
-; AIX64-NEXT: xvcmpgtsp vs3, vs1, vs34
-; AIX64-NEXT: xvmaddasp vs2, vs34, vs1
-; AIX64-NEXT: xxland vs2, vs3, vs2
-; AIX64-NEXT: xscvspdpn f2, vs2
-; AIX64-NEXT: b L..BB1_2
-
-; LINUX64: foo: # @foo
-; LINUX64-NEXT: .Lfunc_begin1:
-; LINUX64-NEXT: .cfi_startproc
-; LINUX64-NEXT: # %bb.0: # %entry
-; LINUX64-NEXT: andi. r3, r3, 1
-; LINUX64-NEXT: bclr 4, gt, 0
-; LINUX64-NEXT: # %bb.1: # %for.body.preheader
-; LINUX64-NEXT: xxlxor f0, f0, f0
-; LINUX64-NEXT: xxlxor vs1, vs1, vs1
-; LINUX64-NEXT: xxlxor f2, f2, f2
-; LINUX64-NEXT: .p2align 4
-; LINUX64-NEXT: .LBB1_2: # %for.body
-; LINUX64-NEXT: # =>This Inner Loop Header: Depth=1
-; LINUX64-NEXT: xxmrghd vs2, vs0, vs2
-; LINUX64-NEXT: xvcvdpsp vs34, vs2
-; LINUX64-NEXT: xxspltd vs2, vs0, 0
-; LINUX64-NEXT: xvcvdpsp vs35, vs2
-; LINUX64-NEXT: xxspltiw vs2, 1170469888
-; LINUX64-NEXT: vmrgew v2, v3, v2
-; LINUX64-NEXT: xvcmpgtsp vs3, vs1, vs34
-; LINUX64-NEXT: xvmaddasp vs2, vs34, vs1
-; LINUX64-NEXT: xxland vs2, vs3, vs2
-; LINUX64-NEXT: xxsldwi vs2, vs2, vs2, 3
-; LINUX64-NEXT: xscvspdpn f2, vs2
-; LINUX64-NEXT: b .LBB1_2
-
-; CHECK32: .bar:
-; CHECK32-NEXT: # %bb.0: # %entry
-; CHECK32-NEXT: lwz r5, 0(r5)
-; CHECK32-NEXT: cmpwi r5, 0
-; CHECK32-NEXT: blelr cr0
-; CHECK32-NEXT: # %bb.1: # %for.body.preheader
-; CHECK32-NEXT: xxspltiw vs0, 1069066811
-; CHECK32-NEXT: xxspltiw vs1, 1170469888
-; CHECK32-NEXT: li r6, 0
-; CHECK32-NEXT: li r7, 0
-; CHECK32-NEXT: .align 4
-; CHECK32-NEXT: [[L2_foo:.*]]: # %for.body
-; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT: slwi r8, r7, 4
-; CHECK32-NEXT: addic r7, r7, 1
-; CHECK32-NEXT: addze r6, r6
-; CHECK32-NEXT: lxvx vs2, r4, r8
-; CHECK32-NEXT: xvmaddmsp vs2, vs0, vs1
-; CHECK32-NEXT: stxvx vs2, r3, r8
-; CHECK32-NEXT: xor r8, r7, r5
-; CHECK32-NEXT: or. r8, r8, r6
-; CHECK32-NEXT: bne cr0, [[L2_foo]]
-
-; CHECK32: .foo:
-; CHECK32-NEXT: # %bb.0: # %entry
-; CHECK32-NEXT: andi. r3, r3, 1
-; CHECK32-NEXT: bclr 4, gt, 0
-; CHECK32-NEXT: # %bb.1: # %for.body.preheader
-; CHECK32-NEXT: lwz r3, L..C0(r2) # %const.0
-; CHECK32-NEXT: xxlxor f1, f1, f1
-; CHECK32-NEXT: xxlxor vs0, vs0, vs0
-; CHECK32-NEXT: xscvdpspn vs35, f1
-; CHECK32-NEXT: lxv vs34, 0(r3)
-; CHECK32-NEXT: .align 4
-; CHECK32-NEXT: L..BB1_2: # %for.body
-; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT: xscvdpspn vs36, f1
-; CHECK32-NEXT: xxspltiw vs1, 1170469888
-; CHECK32-NEXT: vperm v4, v4, v3, v2
-; CHECK32-NEXT: xvcmpgtsp vs2, vs0, vs36
-; CHECK32-NEXT: xvmaddasp vs1, vs36, vs0
-; CHECK32-NEXT: xxland vs1, vs2, vs1
-; CHECK32-NEXT: xscvspdpn f1, vs1
-; CHECK32-NEXT: b L..BB1_2
diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll
index 6943622fac7f2..a6ad3018e052c 100644
--- a/llvm/test/CodeGen/X86/break-false-dep.ll
+++ b/llvm/test/CodeGen/X86/break-false-dep.ll
@@ -472,17 +472,17 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill
; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0
; SSE-WIN-NEXT: .seh_endprologue
-; SSE-WIN-NEXT: xorl %eax, %eax
-; SSE-WIN-NEXT: leaq v(%rip), %rcx
-; SSE-WIN-NEXT: leaq x(%rip), %rdx
-; SSE-WIN-NEXT: leaq y(%rip), %r8
-; SSE-WIN-NEXT: leaq z(%rip), %r9
-; SSE-WIN-NEXT: leaq w(%rip), %r10
+; SSE-WIN-NEXT: leaq v(%rip), %rax
+; SSE-WIN-NEXT: leaq x(%rip), %rcx
+; SSE-WIN-NEXT: leaq y(%rip), %rdx
+; SSE-WIN-NEXT: leaq z(%rip), %r8
+; SSE-WIN-NEXT: leaq w(%rip), %r9
+; SSE-WIN-NEXT: xorl %r10d, %r10d
; SSE-WIN-NEXT: .p2align 4
; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader
; SSE-WIN-NEXT: # =>This Loop Header: Depth=1
; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2
-; SSE-WIN-NEXT: movq %rcx, %r11
+; SSE-WIN-NEXT: movq %rax, %r11
; SSE-WIN-NEXT: xorl %esi, %esi
; SSE-WIN-NEXT: .p2align 4
; SSE-WIN-NEXT: .LBB8_2: # %for.body3
@@ -490,10 +490,10 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2
; SSE-WIN-NEXT: xorps %xmm0, %xmm0
; SSE-WIN-NEXT: cvtsi2sdl (%r11), %xmm0
+; SSE-WIN-NEXT: mulsd (%rsi,%rcx), %xmm0
; SSE-WIN-NEXT: mulsd (%rsi,%rdx), %xmm0
; SSE-WIN-NEXT: mulsd (%rsi,%r8), %xmm0
-; SSE-WIN-NEXT: mulsd (%rsi,%r9), %xmm0
-; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r10)
+; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%r9)
; SSE-WIN-NEXT: #APP
; SSE-WIN-NEXT: #NO_APP
; SSE-WIN-NEXT: addq $8, %rsi
@@ -502,8 +502,8 @@ define dso_local void @loopdep3() {
; SSE-WIN-NEXT: jne .LBB8_2
; SSE-WIN-NEXT: # %bb.3: # %for.inc14
; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1
-; SSE-WIN-NEXT: incl %eax
-; SSE-WIN-NEXT: cmpl $100000, %eax # imm = 0x186A0
+; SSE-WIN-NEXT: incl %r10d
+; SSE-WIN-NEXT: cmpl $100000, %r10d # imm = 0x186A0
; SSE-WIN-NEXT: jne .LBB8_1
; SSE-WIN-NEXT: # %bb.4: # %for.end16
; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload
@@ -550,17 +550,17 @@ define dso_local void @loopdep3() {
; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
; AVX-NEXT: .seh_savexmm %xmm6, 0
; AVX-NEXT: .seh_endprologue
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: leaq v(%rip), %rcx
-; AVX-NEXT: leaq x(%rip), %rdx
-; AVX-NEXT: leaq y(%rip), %r8
-; AVX-NEXT: leaq z(%rip), %r9
-; AVX-NEXT: leaq w(%rip), %r10
+; AVX-NEXT: leaq v(%rip), %rax
+; AVX-NEXT: leaq x(%rip), %rcx
+; AVX-NEXT: leaq y(%rip), %rdx
+; AVX-NEXT: leaq z(%rip), %r8
+; AVX-NEXT: leaq w(%rip), %r9
+; AVX-NEXT: xorl %r10d, %r10d
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB8_1: # %for.cond1.preheader
; AVX-NEXT: # =>This Loop Header: Depth=1
; AVX-NEXT: # Child Loop BB8_2 Depth 2
-; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: movq %rax, %r11
; AVX-NEXT: xorl %esi, %esi
; AVX-NEXT: .p2align 4
; AVX-NEXT: .LBB8_2: # %for.body3
@@ -568,10 +568,10 @@ define dso_local void @loopdep3() {
; AVX-NEXT: # => This Inner Loop Header: Depth=2
; AVX-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX-NEXT: vcvtsi2sdl (%r11), %xmm5, %xmm0
+; AVX-NEXT: vmulsd (%rsi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vmulsd (%rsi,%rdx), %xmm0, %xmm0
; AVX-NEXT: vmulsd (%rsi,%r8), %xmm0, %xmm0
-; AVX-NEXT: vmulsd (%rsi,%r9), %xmm0, %xmm0
-; AVX-NEXT: vmovsd %xmm0, (%rsi,%r10)
+; AVX-NEXT: vmovsd %xmm0, (%rsi,%r9)
; AVX-NEXT: #APP
; AVX-NEXT: #NO_APP
; AVX-NEXT: addq $8, %rsi
@@ -580,8 +580,8 @@ define dso_local void @loopdep3() {
; AVX-NEXT: jne .LBB8_2
; AVX-NEXT: # %bb.3: # %for.inc14
; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1
-; AVX-NEXT: incl %eax
-; AVX-NEXT: cmpl $100000, %eax # imm = 0x186A0
+; AVX-NEXT: incl %r10d
+; AVX-NEXT: cmpl $100000, %r10d # imm = 0x186A0
; AVX-NEXT: jne .LBB8_1
; AVX-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/146599
More information about the llvm-commits
mailing list