[llvm] 1b86ad2 - Use 15 byte long nops on modern Intel processors

Fri Mar 13 10:51:33 PDT 2020

Author: Philip Reames
Date: 2020-03-13T10:51:09-07:00
New Revision: 1b86ad27a7d38c3751c1cbe65827cc6819dfb4fd

URL: https://github.com/llvm/llvm-project/commit/1b86ad27a7d38c3751c1cbe65827cc6819dfb4fd
DIFF: https://github.com/llvm/llvm-project/commit/1b86ad27a7d38c3751c1cbe65827cc6819dfb4fd.diff

LOG: Use 15 byte long nops on modern Intel processors

Back in D42616, we switched our default nop length from 15 to 10 bytes because some platforms have painful decode stalls when encountering multiple instruction prefixes. (10 byte long nops come from the fact that prefixes are used to pad after 8 bytes, and some platforms have issues w/more than two prefixes.)

Based on Agner's guides, it appears to be the case that modern Intel (SandyBridge and later) can decode an arbitrary number of prefixes without issue. Intel's guide only provides up to 9 bytes; I read that as providing a safe default for all their chips. Older chips and Atom series have serious decode stalls. I can't find a conclusive reference beyond those two.

Differential Revision: https://reviews.llvm.org/D75945

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86.td
    llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
    llvm/test/MC/X86/align-via-relaxation.s
    llvm/test/MC/X86/x86_long_nop.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 931367ce4de5..6c4ab8f5a6f3 100644

--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -551,7 +551,8 @@ def ProcessorFeatures {
                                                   FeatureSlow3OpsLEA,
                                                   FeatureFastScalarFSQRT,
                                                   FeatureFastSHLDRotate,
-                                                  FeatureMergeToThreeWayBranch];
+                                                  FeatureMergeToThreeWayBranch,
+                                                  FeatureFast15ByteNOP];
   list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
                                                 FeaturePOPCNTFalseDeps];
   list<SubtargetFeature> SNBInheritableFeatures =

diff  --git a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
index 292fa6b18223..7db20187d45d 100644
--- a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
+++ b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions.ll
@@ -69,8 +69,12 @@ define void @patchpoint(i64 %a, i64 %b) {
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    #noautopadding
 ; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
+; CHECK-NEXT:    .byte 102
 ; CHECK-NEXT:    nopw %cs:512(%rax,%rax)
-; CHECK-NEXT:    nopl 8(%rax,%rax)
 ; CHECK-NEXT:    #autopadding
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8

diff  --git a/llvm/test/MC/X86/align-via-relaxation.s b/llvm/test/MC/X86/align-via-relaxation.s
index 7d91eaacf9a9..f81e4c73e4f2 100644
--- a/llvm/test/MC/X86/align-via-relaxation.s
+++ b/llvm/test/MC/X86/align-via-relaxation.s
@@ -32,8 +32,7 @@ foo:
   # that would require a further round of relaxation
   # CHECK: <bar>:
   # CHECK: 22: eb fe                          jmp -2 <bar>
-  # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00  nopw %cs:(%rax,%rax)
-  # CHECK: 2e: 66 90                          nop
+  # CHECK: 24: 66 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
   # CHECK: 30: 0f 0b                          ud2
 
 bar:  
@@ -48,8 +47,8 @@ nobypass:
   # CHECK: <loop_preheader>:
   # CHECK: 45: 48 85 c0                       testq %rax, %rax
   # CHECK: 48: 0f 8e 22 00 00 00              jle 34 <loop_exit>
-  # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00  nopw %cs:(%rax,%rax)
-  # CHECK: 58: 0f 1f 84 00 00 00 00 00        nopl (%rax,%rax)
+  # CHECK: 4e: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
+  # CHECK: 5d: 0f 1f 00                       nopl (%rax)
   # CHECK: <loop_header>:
   # CHECK: 60: 48 83 e8 01                    subq $1, %rax
   # CHECK: 64: 48 85 c0                       testq %rax, %rax

diff  --git a/llvm/test/MC/X86/x86_long_nop.s b/llvm/test/MC/X86/x86_long_nop.s
index 68d46d0d6e16..1676bde58255 100644
--- a/llvm/test/MC/X86/x86_long_nop.s
+++ b/llvm/test/MC/X86/x86_long_nop.s
@@ -17,12 +17,12 @@
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=nehalem %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=westmere %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=sandybridge %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=ivybridge %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=haswell %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=broadwell %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=skylake %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=skx %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=sandybridge %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=ivybridge %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=haswell %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=broadwell %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=skylake %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=skx %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=knl %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s
 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=knm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s