[llvm] [X86] Prevent APX NDD compression when it creates a partial write (PR #132051)

Daniel Zabawa via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 21 07:52:19 PDT 2025


https://github.com/daniel-zabawa updated https://github.com/llvm/llvm-project/pull/132051

>From ae57ca5afd70dcd6501b1b8576dad1806f6d15ad Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Wed, 19 Mar 2025 08:13:16 -0700
Subject: [PATCH 1/4] [X86] Prevent APX NDD compression when it creates a
 partial write

APX NDD instructions may be compressed when the result is also a source.
For 8/16b instructions, this may create partial register write hazards
if a previous super-register def is within the partial reg update
clearance, or incorrect code if the super-register is not dead.

This change prevents compression when the super-register is marked
as an implicit define, which the virtual rewriter already adds in
the case where a subregister is defined but the super-register is not
dead.

The BreakFalseDeps interface is also updated to add implicit
super-register defs for NDD instructions that would incur partial-write
stalls if compressed to legacy ops.
---
 llvm/lib/Target/X86/X86CompressEVEX.cpp       | 17 ++++
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 53 ++++++++--
 .../CodeGen/X86/apx/ndd-false-deps-asm.mir    | 89 +++++++++++++++++
 llvm/test/CodeGen/X86/apx/ndd-false-deps.mir  | 96 +++++++++++++++++++
 4 files changed, 246 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
 create mode 100644 llvm/test/CodeGen/X86/apx/ndd-false-deps.mir

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index b80c21b008f4b..6c22bd9c46b4f 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -237,6 +237,23 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
       return 0;
     return I->NewOpc;
   };
+
+  // Redundant NDD ops cannot be safely compressed if either:
+  // - the legacy op would introduce a partial write that BreakFalseDeps
+  // identified as a potential stall, or
+  // - the op is writing to a subregister of a live register, i.e. the
+  // full (zeroed) result is used.
+  // Both cases are indicated by an implicit def of the superregister.
+  if (IsRedundantNDD) {
+    Register Dst = MI.getOperand(0).getReg();
+    if (Dst &&
+        (X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) {
+      Register Super = getX86SubSuperRegister(Dst, ST.is64Bit() ? 64 : 32);
+      if (MI.definesRegister(Super, /*TRI=*/nullptr))
+        IsRedundantNDD = false;
+    }
+  }
+
   // NonNF -> NF only if it's not a compressible NDD instruction and eflags is
   // dead.
   unsigned NewOpc = IsRedundantNDD
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 5c65171dd83b0..a8a533de30cbf 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6793,19 +6793,42 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
 unsigned X86InstrInfo::getPartialRegUpdateClearance(
     const MachineInstr &MI, unsigned OpNum,
     const TargetRegisterInfo *TRI) const {
-  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
+
+  // With the NDD/ZU features, ISel may generate NDD/ZU ops which
+  // appear to perform partial writes. We detect these based on flags
+  // and register class.
+  bool HasNDDPartialWrite = false;
+  if (OpNum == 0 && (Subtarget.hasNDD() || Subtarget.hasZU()) &&
+      X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
+    Register Reg = MI.getOperand(0).getReg();
+    if (Reg.isVirtual()) {
+      auto &MRI = MI.getParent()->getParent()->getRegInfo();
+      if (auto *TRC = MRI.getRegClassOrNull(Reg))
+        HasNDDPartialWrite = (TRC->getID() == X86::GR16RegClassID ||
+                              TRC->getID() == X86::GR8RegClassID);
+    } else
+      HasNDDPartialWrite =
+          X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
+  }
+
+  if (OpNum != 0 ||
+      !(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
     return 0;
 
-  // If MI is marked as reading Reg, the partial register update is wanted.
+  // For non-NDD ops, if MI is marked as reading Reg, the partial register
+  // update is wanted, hence we return 0.
+  // For NDD ops, if MI is marked as reading Reg, then it is possible to
+  // compress to a legacy form in CompressEVEX, which would create an
+  // unwanted partial update, so we return the clearance.
   const MachineOperand &MO = MI.getOperand(0);
   Register Reg = MO.getReg();
-  if (Reg.isVirtual()) {
-    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
-      return 0;
-  } else {
-    if (MI.readsRegister(Reg, TRI))
-      return 0;
-  }
+  bool ReadsReg = false;
+  if (Reg.isVirtual())
+    ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
+  else
+    ReadsReg = MI.readsRegister(Reg, TRI);
+  if (ReadsReg != HasNDDPartialWrite)
+    return 0;
 
   // If any instructions in the clearance range are reading Reg, insert a
   // dependency breaking instruction, which is inexpensive and is likely to
@@ -7229,6 +7252,18 @@ void X86InstrInfo::breakPartialRegDependency(
         .addReg(Reg, RegState::Undef)
         .addReg(Reg, RegState::Undef);
     MI.addRegisterKilled(Reg, TRI, true);
+  } else if ((X86::GR16RegClass.contains(Reg) ||
+              X86::GR8RegClass.contains(Reg)) &&
+             X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
+    // This case is only expected for NDD ops which appear to be partial
+    // writes, but are not due to the zeroing of the upper part. Here
+    // we add an implicit def of the superregister, which prevents
+    // CompressEVEX from converting this to a legacy form.
+    Register SuperReg =
+        getX86SubSuperRegister(Reg, Subtarget.is64Bit() ? 64 : 32);
+    MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
+    if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
+      BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
   }
 }
 
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
new file mode 100644
index 0000000000000..e297e87413ada
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RC1 %s
+#
+# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
+# if compressed to a legacy op. MIR has been modified to force different register assignments.
+#
+# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
+# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
+#
+# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
+# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
+#
+--- |
+  define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+  ; RCDEFAULT-LABEL: partial_write:
+  ; RCDEFAULT:       # %bb.0: # %entry
+  ; RCDEFAULT-NEXT:    addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
+  ; RCDEFAULT-NEXT:    movl %eax, (%rdi) # encoding: [0x89,0x07]
+  ; RCDEFAULT-NEXT:    addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8]
+  ; RCDEFAULT-NEXT:    retq # encoding: [0xc3]
+  ;
+  ; RC1-LABEL: partial_write:
+  ; RC1:       # %bb.0: # %entry
+  ; RC1-NEXT:    addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
+  ; RC1-NEXT:    movl %eax, (%rdi) # encoding: [0x89,0x07]
+  ; RC1-NEXT:    addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8]
+  ; RC1-NEXT:    retq # encoding: [0xc3]
+  entry:
+    %add = add nsw i32 %b, %a
+    store i32 %add, ptr %p, align 4, !tbaa !1
+    %add1 = trunc i32 %add to i16
+    %add2 = add i16 %add1, %x
+    ret i16 %add2
+  }
+
+  define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+  ; ASM-LABEL: no_partial_write:
+  ; ASM:       # %bb.0: # %entry
+  ; ASM-NEXT:    addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
+  ; ASM-NEXT:    movl %edx, (%rdi) # encoding: [0x89,0x17]
+  ; ASM-NEXT:    addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
+  ; ASM-NEXT:    retq # encoding: [0xc3]
+  entry:
+    %add = add nsw i32 %b, %a
+    store i32 %add, ptr %p, align 4, !tbaa !1
+    %add1 = trunc i32 %add to i16
+    %add2 = add i16 %add1, %x
+    ret i16 %add2
+  }
+  attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!2, !2, i64 0}
+  !2 = !{!"int", !3, i64 0}
+  !3 = !{!"omnipotent char", !4, i64 0}
+  !4 = !{!"Simple C/C++ TBAA"}
+...
+---
+name:            partial_write
+tracksRegLiveness: true
+noVRegs:         true
+noPhis:          true
+isSSA:           false
+body:             |
+  bb.0.entry:
+    liveins: $ecx, $edx, $esi, $rdi
+    renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+    RET64 $ax
+...
+---
+name:            no_partial_write
+tracksRegLiveness: true
+noVRegs:         true
+noPhis:          true
+isSSA:           false
+body:             |
+  bb.0.entry:
+    liveins: $ecx, $edx, $esi, $rdi
+
+    renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
+    renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+    RET64 $ax
+...
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
new file mode 100644
index 0000000000000..376a89180290d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
@@ -0,0 +1,96 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -o - | FileCheck --check-prefixes=MIR,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -o - | FileCheck --check-prefixes=MIR,RC1 %s
+#
+# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
+# if compressed to a legacy op. MIR has been modified to force different register assignments.
+#
+# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
+# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
+#
+# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
+# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
+#
+--- |
+  define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+  entry:
+    %add = add nsw i32 %b, %a
+    store i32 %add, ptr %p, align 4, !tbaa !1
+    %add1 = trunc i32 %add to i16
+    %add2 = add i16 %add1, %x
+    ret i16 %add2
+  }
+
+  define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
+  entry:
+    %add = add nsw i32 %b, %a
+    store i32 %add, ptr %p, align 4, !tbaa !1
+    %add1 = trunc i32 %add to i16
+    %add2 = add i16 %add1, %x
+    ret i16 %add2
+  }
+  attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!2, !2, i64 0}
+  !2 = !{!"int", !3, i64 0}
+  !3 = !{!"omnipotent char", !4, i64 0}
+  !4 = !{!"Simple C/C++ TBAA"}
+...
+---
+name:            partial_write
+tracksRegLiveness: true
+noVRegs:         true
+noPhis:          true
+isSSA:           false
+body:             |
+  bb.0.entry:
+    liveins: $ecx, $edx, $esi, $rdi
+    ; RCDEFAULT-LABEL: name: partial_write
+    ; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi
+    ; RCDEFAULT-NEXT: {{  $}}
+    ; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    ; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    ; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax
+    ; RCDEFAULT-NEXT: RET64 $ax
+    ;
+    ; RC1-LABEL: name: partial_write
+    ; RC1: liveins: $ecx, $edx, $esi, $rdi
+    ; RC1-NEXT: {{  $}}
+    ; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    ; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    ; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+    ; RC1-NEXT: RET64 $ax
+    renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
+    RET64 $ax
+...
+---
+name:            no_partial_write
+tracksRegLiveness: true
+noVRegs:         true
+noPhis:          true
+isSSA:           false
+body:             |
+  bb.0.entry:
+    liveins: $ecx, $edx, $esi, $rdi
+
+    ; MIR-LABEL: name: no_partial_write
+    ; MIR: liveins: $ecx, $edx, $esi, $rdi
+    ; MIR-NEXT: {{  $}}
+    ; MIR-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    ; MIR-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
+    ; MIR-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+    ; MIR-NEXT: RET64 $ax
+    renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
+    renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+    RET64 $ax
+...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# MIR: {{.*}}
+# RC1: {{.*}}
+# RCDEFAULT: {{.*}}

>From 1a6499421a06bed7c6a67d4d9a79bf43024d8a03 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Thu, 20 Mar 2025 11:55:59 -0700
Subject: [PATCH 2/4] add verify-machineinstrs to tests, remove unneeded checks

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp          |  2 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp             | 12 ++++++------
 llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index 6c22bd9c46b4f..7883f720ffa79 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -248,7 +248,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
     Register Dst = MI.getOperand(0).getReg();
     if (Dst &&
         (X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) {
-      Register Super = getX86SubSuperRegister(Dst, ST.is64Bit() ? 64 : 32);
+      Register Super = getX86SubSuperRegister(Dst, 64);
       if (MI.definesRegister(Super, /*TRI=*/nullptr))
         IsRedundantNDD = false;
     }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index a8a533de30cbf..d3fcaab8cb631 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6794,12 +6794,14 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
     const MachineInstr &MI, unsigned OpNum,
     const TargetRegisterInfo *TRI) const {
 
+  if (OpNum != 0)
+    return 0;
+
   // With the NDD/ZU features, ISel may generate NDD/ZU ops which
   // appear to perform partial writes. We detect these based on flags
   // and register class.
   bool HasNDDPartialWrite = false;
-  if (OpNum == 0 && (Subtarget.hasNDD() || Subtarget.hasZU()) &&
-      X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
+  if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
     Register Reg = MI.getOperand(0).getReg();
     if (Reg.isVirtual()) {
       auto &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -6811,8 +6813,7 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
           X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
   }
 
-  if (OpNum != 0 ||
-      !(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
+  if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
     return 0;
 
   // For non-NDD ops, if MI is marked as reading Reg, the partial register
@@ -7259,8 +7260,7 @@ void X86InstrInfo::breakPartialRegDependency(
     // writes, but are not due to the zeroing of the upper part. Here
     // we add an implicit def of the superregister, which prevents
     // CompressEVEX from converting this to a legacy form.
-    Register SuperReg =
-        getX86SubSuperRegister(Reg, Subtarget.is64Bit() ? 64 : 32);
+    Register SuperReg = getX86SubSuperRegister(Reg, 64);
     MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
     if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
       BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
index e297e87413ada..4edaa1b007bed 100644
--- a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RCDEFAULT %s
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RC1 %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -verify-machineinstrs -partial-reg-update-clearance=1 -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RC1 %s
 #
 # Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
 # if compressed to a legacy op. MIR has been modified to force different register assignments.

>From 53d490aa00300efccdda23c24699d3f7005274a9 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Fri, 21 Mar 2025 06:44:53 -0700
Subject: [PATCH 3/4] remove vreg case from NDD check, reword comments and
 remove chaff from test

---
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 22 +++++++-------------
 llvm/test/CodeGen/X86/apx/ndd-false-deps.mir |  4 ----
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d3fcaab8cb631..c1091e454e043 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6797,18 +6797,12 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
   if (OpNum != 0)
     return 0;
 
-  // With the NDD/ZU features, ISel may generate NDD/ZU ops which
-  // appear to perform partial writes. We detect these based on flags
-  // and register class.
+  // NDD ops with 8/16b results may appear to be partial register
+  // updates after register allocation.
   bool HasNDDPartialWrite = false;
   if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
     Register Reg = MI.getOperand(0).getReg();
-    if (Reg.isVirtual()) {
-      auto &MRI = MI.getParent()->getParent()->getRegInfo();
-      if (auto *TRC = MRI.getRegClassOrNull(Reg))
-        HasNDDPartialWrite = (TRC->getID() == X86::GR16RegClassID ||
-                              TRC->getID() == X86::GR8RegClassID);
-    } else
+    if (!Reg.isVirtual())
       HasNDDPartialWrite =
           X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
   }
@@ -6816,11 +6810,11 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
   if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
     return 0;
 
-  // For non-NDD ops, if MI is marked as reading Reg, the partial register
-  // update is wanted, hence we return 0.
-  // For NDD ops, if MI is marked as reading Reg, then it is possible to
-  // compress to a legacy form in CompressEVEX, which would create an
-  // unwanted partial update, so we return the clearance.
+  // Check if the result register is also used as a source.
+  // For non-NDD ops, this means a partial update is wanted, hence we return 0.
+  // For NDD ops, this means it is possible to compress the instruction
+  // to a legacy form in CompressEVEX, which would create an unwanted partial
+  // update, so we return the clearance.
   const MachineOperand &MO = MI.getOperand(0);
   Register Reg = MO.getReg();
   bool ReadsReg = false;
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
index 376a89180290d..a8f484f3e8509 100644
--- a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
@@ -90,7 +90,3 @@ body:             |
     renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
     RET64 $ax
 ...
-## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-# MIR: {{.*}}
-# RC1: {{.*}}
-# RCDEFAULT: {{.*}}

>From f485dc2735125a27ca5e9c5a0c75848748aba635 Mon Sep 17 00:00:00 2001
From: "Zabawa, Daniel" <daniel.zabawa at intel.com>
Date: Fri, 21 Mar 2025 07:52:01 -0700
Subject: [PATCH 4/4] cleanup of tests

---
 .../CodeGen/X86/apx/ndd-false-deps-asm.mir    | 32 ++++++----------
 llvm/test/CodeGen/X86/apx/ndd-false-deps.mir  | 38 ++++++++-----------
 2 files changed, 27 insertions(+), 43 deletions(-)

diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
index 4edaa1b007bed..5be5ca8d71947 100644
--- a/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps-asm.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RCDEFAULT %s
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -verify-machineinstrs -partial-reg-update-clearance=1 -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RC1 %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -show-mc-encoding -o - | FileCheck --check-prefixes=CHECK,RC1 %s
 #
 # Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
 # if compressed to a legacy op. MIR has been modified to force different register assignments.
@@ -28,35 +28,27 @@
   ; RC1-NEXT:    retq # encoding: [0xc3]
   entry:
     %add = add nsw i32 %b, %a
-    store i32 %add, ptr %p, align 4, !tbaa !1
+    store i32 %add, ptr %p, align 4
     %add1 = trunc i32 %add to i16
     %add2 = add i16 %add1, %x
     ret i16 %add2
   }
 
   define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
-  ; ASM-LABEL: no_partial_write:
-  ; ASM:       # %bb.0: # %entry
-  ; ASM-NEXT:    addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
-  ; ASM-NEXT:    movl %edx, (%rdi) # encoding: [0x89,0x17]
-  ; ASM-NEXT:    addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
-  ; ASM-NEXT:    retq # encoding: [0xc3]
+  ; CHECK-LABEL: no_partial_write:
+  ; CHECK:       # %bb.0: # %entry
+  ; CHECK-NEXT:    addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
+  ; CHECK-NEXT:    movl %edx, (%rdi) # encoding: [0x89,0x17]
+  ; CHECK-NEXT:    addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
+  ; CHECK-NEXT:    retq # encoding: [0xc3]
   entry:
     %add = add nsw i32 %b, %a
-    store i32 %add, ptr %p, align 4, !tbaa !1
+    store i32 %add, ptr %p, align 4
     %add1 = trunc i32 %add to i16
     %add2 = add i16 %add1, %x
     ret i16 %add2
   }
   attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
-
-  !llvm.module.flags = !{!0}
-
-  !0 = !{i32 1, !"wchar_size", i32 4}
-  !1 = !{!2, !2, i64 0}
-  !2 = !{!"int", !3, i64 0}
-  !3 = !{!"omnipotent char", !4, i64 0}
-  !4 = !{!"Simple C/C++ TBAA"}
 ...
 ---
 name:            partial_write
@@ -68,7 +60,7 @@ body:             |
   bb.0.entry:
     liveins: $ecx, $edx, $esi, $rdi
     renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
     renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
     RET64 $ax
 ...
@@ -83,7 +75,7 @@ body:             |
     liveins: $ecx, $edx, $esi, $rdi
 
     renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
     renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
     RET64 $ax
 ...
diff --git a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
index a8f484f3e8509..d1c2c40bed494 100644
--- a/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
+++ b/llvm/test/CodeGen/X86/apx/ndd-false-deps.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -o - | FileCheck --check-prefixes=MIR,RCDEFAULT %s
-# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -o - | FileCheck --check-prefixes=MIR,RC1 %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RCDEFAULT %s
+# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -verify-machineinstrs -o - | FileCheck --check-prefixes=CHECK,RC1 %s
 #
 # Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
 # if compressed to a legacy op. MIR has been modified to force different register assignments.
@@ -15,7 +15,7 @@
   define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
   entry:
     %add = add nsw i32 %b, %a
-    store i32 %add, ptr %p, align 4, !tbaa !1
+    store i32 %add, ptr %p, align 4
     %add1 = trunc i32 %add to i16
     %add2 = add i16 %add1, %x
     ret i16 %add2
@@ -24,20 +24,12 @@
   define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
   entry:
     %add = add nsw i32 %b, %a
-    store i32 %add, ptr %p, align 4, !tbaa !1
+    store i32 %add, ptr %p, align 4
     %add1 = trunc i32 %add to i16
     %add2 = add i16 %add1, %x
     ret i16 %add2
   }
   attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
-
-  !llvm.module.flags = !{!0}
-
-  !0 = !{i32 1, !"wchar_size", i32 4}
-  !1 = !{!2, !2, i64 0}
-  !2 = !{!"int", !3, i64 0}
-  !3 = !{!"omnipotent char", !4, i64 0}
-  !4 = !{!"Simple C/C++ TBAA"}
 ...
 ---
 name:            partial_write
@@ -52,7 +44,7 @@ body:             |
     ; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi
     ; RCDEFAULT-NEXT: {{  $}}
     ; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    ; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    ; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
     ; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax
     ; RCDEFAULT-NEXT: RET64 $ax
     ;
@@ -60,11 +52,11 @@ body:             |
     ; RC1: liveins: $ecx, $edx, $esi, $rdi
     ; RC1-NEXT: {{  $}}
     ; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    ; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    ; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
     ; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
     ; RC1-NEXT: RET64 $ax
     renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p)
     renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
     RET64 $ax
 ...
@@ -78,15 +70,15 @@ body:             |
   bb.0.entry:
     liveins: $ecx, $edx, $esi, $rdi
 
-    ; MIR-LABEL: name: no_partial_write
-    ; MIR: liveins: $ecx, $edx, $esi, $rdi
-    ; MIR-NEXT: {{  $}}
-    ; MIR-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    ; MIR-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
-    ; MIR-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
-    ; MIR-NEXT: RET64 $ax
+    ; CHECK-LABEL: name: no_partial_write
+    ; CHECK: liveins: $ecx, $edx, $esi, $rdi
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
+    ; CHECK-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
+    ; CHECK-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
+    ; CHECK-NEXT: RET64 $ax
     renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
-    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
+    MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p)
     renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
     RET64 $ax
 ...



More information about the llvm-commits mailing list