[llvm] 5afa0fa - [X86] Prevent APX NDD compression when it creates a partial write (#132051)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 21 09:50:16 PDT 2025
Author: Daniel Zabawa
Date: 2025-03-22T00:50:12+08:00
New Revision: 5afa0fa9a6ba482cdc87945b71f5cd626b754d8f
URL: https://github.com/llvm/llvm-project/commit/5afa0fa9a6ba482cdc87945b71f5cd626b754d8f
DIFF: https://github.com/llvm/llvm-project/commit/5afa0fa9a6ba482cdc87945b71f5cd626b754d8f.diff
LOG: [X86] Prevent APX NDD compression when it creates a partial write (#132051)
APX NDD instructions may be compressed when the result is also a source.
For 8/16b instructions, this may create partial register write hazards
if a previous super-register def is within the partial reg update
clearance, or incorrect code if the super-register is not dead.
This change prevents compression when the super-register is marked as an
implicit define, which the virtual rewriter already adds in the case
where a subregister is defined but the super-register is not dead.
The BreakFalseDeps interface is also updated to add implicit
super-register defs for NDD instructions that would incur partial-write
stalls if compressed to legacy ops.
Added:
llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
Modified:
llvm/lib/Target/X86/X86CompressEVEX.cpp
llvm/lib/Target/X86/X86InstrInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index b80c21b008f4b..7883f720ffa79 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -237,6 +237,23 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
return 0;
return I->NewOpc;
};
+
+ // Redundant NDD ops cannot be safely compressed if either:
+ // - the legacy op would introduce a partial write that BreakFalseDeps
+ // identified as a potential stall, or
+ // - the op is writing to a subregister of a live register, i.e. the
+ // full (zeroed) result is used.
+ // Both cases are indicated by an implicit def of the superregister.
+ if (IsRedundantNDD) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (Dst &&
+ (X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) {
+ Register Super = getX86SubSuperRegister(Dst, 64);
+ if (MI.definesRegister(Super, /*TRI=*/nullptr))
+ IsRedundantNDD = false;
+ }
+ }
+
// NonNF -> NF only if it's not a compressible NDD instruction and eflags is
// dead.
unsigned NewOpc = IsRedundantNDD
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c8ef4d97bf088..c89c23d92c6ed 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6793,19 +6793,37 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+
+ if (OpNum != 0)
+ return 0;
+
+ // NDD ops with 8/16b results may appear to be partial register
+ // updates after register allocation.
+ bool HasNDDPartialWrite = false;
+ if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
+ Register Reg = MI.getOperand(0).getReg();
+ if (!Reg.isVirtual())
+ HasNDDPartialWrite =
+ X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
+ }
+
+ if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
return 0;
- // If MI is marked as reading Reg, the partial register update is wanted.
+ // Check if the result register is also used as a source.
+ // For non-NDD ops, this means a partial update is wanted, hence we return 0.
+ // For NDD ops, this means it is possible to compress the instruction
+ // to a legacy form in CompressEVEX, which would create an unwanted partial
+ // update, so we return the clearance.
const MachineOperand &MO = MI.getOperand(0);
Register Reg = MO.getReg();
- if (Reg.isVirtual()) {
- if (MO.readsReg() || MI.readsVirtualRegister(Reg))
- return 0;
- } else {
- if (MI.readsRegister(Reg, TRI))
- return 0;
- }
+ bool ReadsReg = false;
+ if (Reg.isVirtual())
+ ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
+ else
+ ReadsReg = MI.readsRegister(Reg, TRI);
+ if (ReadsReg != HasNDDPartialWrite)
+ return 0;
// If any instructions in the clearance range are reading Reg, insert a
// dependency breaking instruction, which is inexpensive and is likely to
@@ -7229,6 +7247,17 @@ void X86InstrInfo::breakPartialRegDependency(
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
MI.addRegisterKilled(Reg, TRI, true);
+ } else if ((X86::GR16RegClass.contains(Reg) ||
+ X86::GR8RegClass.contains(Reg)) &&
+ X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
+ // This case is only expected for NDD ops which appear to be partial
+ // writes, but are not due to the zeroing of the upper part. Here
+ // we add an implicit def of the superregister, which prevents
+ // CompressEVEX from converting this to a legacy form.
+ Register SuperReg = getX86SubSuperRegister(Reg, 64);
+ MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
+ if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
+ BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
}
}
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
new file mode 100644
index 0000000000000..5be5ca8d71947
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,RC1 %s
+#
+# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
+# if compressed to a legacy op. MIR has been modified to force different register assignments.
+#
+# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
+# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
+#
+# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
+# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
+#
+--- |
+ define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+ ; RCDEFAULT-LABEL: partial_write:
+ ; RCDEFAULT: # %bb.0: # %entry
+ ; RCDEFAULT-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
+ ; RCDEFAULT-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
+ ; RCDEFAULT-NEXT: addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8]
+ ; RCDEFAULT-NEXT: retq # encoding: [0xc3]
+ ;
+ ; RC1-LABEL: partial_write:
+ ; RC1: # %bb.0: # %entry
+ ; RC1-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
+ ; RC1-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
+ ; RC1-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8]
+ ; RC1-NEXT: retq # encoding: [0xc3]
+ entry:
+ %add = add nsw i32 %b, %a
+ store i32 %add, ptr %p, align 4
+ %add1 = trunc i32 %add to i16
+ %add2 = add i16 %add1, %x
+ ret i16 %add2
+ }
+
+ define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+ ; CHECK-LABEL: no_partial_write:
+ ; CHECK: # %bb.0: # %entry
+ ; CHECK-NEXT: addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
+ ; CHECK-NEXT: movl %edx, (%rdi) # encoding: [0x89,0x17]
+ ; CHECK-NEXT: addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
+ ; CHECK-NEXT: retq # encoding: [0xc3]
+ entry:
+ %add = add nsw i32 %b, %a
+ store i32 %add, ptr %p, align 4
+ %add1 = trunc i32 %add to i16
+ %add2 = add i16 %add1, %x
+ ret i16 %add2
+ }
+ attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
+...
+---
+name: partial_write
+tracksRegLiveness: true
+noVRegs: true
+noPhis: true
+isSSA: false
+body: |
+ bb.0.entry:
+ liveins: $ecx, $edx, $esi, $rdi
+ renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
+ renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+ RET64 $ax
+...
+---
+name: no_partial_write
+tracksRegLiveness: true
+noVRegs: true
+noPhis: true
+isSSA: false
+body: |
+ bb.0.entry:
+ liveins: $ecx, $edx, $esi, $rdi
+
+ renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
+ renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+ RET64 $ax
+...
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
new file mode 100644
index 0000000000000..d1c2c40bed494
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
@@ -0,0 +1,84 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RC1 %s
+#
+# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
+# if compressed to a legacy op. MIR has been modified to force different register assignments.
+#
+# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
+# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
+#
+# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
+# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
+#
+--- |
+ define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+ entry:
+ %add = add nsw i32 %b, %a
+ store i32 %add, ptr %p, align 4
+ %add1 = trunc i32 %add to i16
+ %add2 = add i16 %add1, %x
+ ret i16 %add2
+ }
+
+ define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+ entry:
+ %add = add nsw i32 %b, %a
+ store i32 %add, ptr %p, align 4
+ %add1 = trunc i32 %add to i16
+ %add2 = add i16 %add1, %x
+ ret i16 %add2
+ }
+ attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
+...
+---
+name: partial_write
+tracksRegLiveness: true
+noVRegs: true
+noPhis: true
+isSSA: false
+body: |
+ bb.0.entry:
+ liveins: $ecx, $edx, $esi, $rdi
+ ; RCDEFAULT-LABEL: name: partial_write
+ ; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi
+ ; RCDEFAULT-NEXT: {{ $}}
+ ; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ ; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
+ ; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax
+ ; RCDEFAULT-NEXT: RET64 $ax
+ ;
+ ; RC1-LABEL: name: partial_write
+ ; RC1: liveins: $ecx, $edx, $esi, $rdi
+ ; RC1-NEXT: {{ $}}
+ ; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ ; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
+ ; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+ ; RC1-NEXT: RET64 $ax
+ renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
+ renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+ RET64 $ax
+...
+---
+name: no_partial_write
+tracksRegLiveness: true
+noVRegs: true
+noPhis: true
+isSSA: false
+body: |
+ bb.0.entry:
+ liveins: $ecx, $edx, $esi, $rdi
+
+ ; CHECK-LABEL: name: no_partial_write
+ ; CHECK: liveins: $ecx, $edx, $esi, $rdi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ ; CHECK-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
+ ; CHECK-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+ ; CHECK-NEXT: RET64 $ax
+ renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+ MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
+ renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+ RET64 $ax
+...
More information about the llvm-commits
mailing list