[llvm] 8ae2a18 - [X86] Use proxy scheduler models for bdver3/bdver4 cpus (#114873)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 04:33:07 PST 2024
Author: Simon Pilgrim
Date: 2024-11-13T12:33:04Z
New Revision: 8ae2a18736c15e0d0d9d0893b21bce4f3bf581c9
URL: https://github.com/llvm/llvm-project/commit/8ae2a18736c15e0d0d9d0893b21bce4f3bf581c9
DIFF: https://github.com/llvm/llvm-project/commit/8ae2a18736c15e0d0d9d0893b21bce4f3bf581c9.diff
LOG: [X86] Use proxy scheduler models for bdver3/bdver4 cpus (#114873)
We don't have specific models for the bdver3/bdver4 CPUs, but we can use the
bdver2/znver1 models as proxy stand-ins - these days the models are more
useful for analysis than for perfect instruction scheduling, so these
should be fine.
While they don't accurately represent the bdver3/bdver4 architectures
(specifically the different FP-pipe layout), they give more accurate
latencies/throughputs (compared against Agner Fog's numbers) than the
default SandyBridge model, and enable PostRA scheduling, which all recent
AMD models have benefitted from.
I had to use the znver1 model for bdver4 so that we have AVX2
instruction coverage (none of the TBM/XOP/LWP/FMA4 instructions have
explicit schedules, so this shouldn't be a problem) - bdver4 and znver1
both double-pump 256-bit instructions, so this works pretty well.
This patch is based on a discussion at the LLVM developers' meeting
(devmtg) regarding how easily we can provide an actual scheduler model
(or at least an approximation) for more of the X86 CPU targets - we can
then add specific models if the (unlikely) need arises.
Added:
Modified:
llvm/lib/Target/X86/X86.td
llvm/test/CodeGen/X86/lwp-intrinsics.ll
llvm/test/CodeGen/X86/rotate_vec.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 509632183dc016..75224d5b26e33f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1912,11 +1912,13 @@ def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
ProcessorFeatures.BdVer2Tuning>;
// Steamroller
-def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
- ProcessorFeatures.BdVer3Tuning>;
+// NOTE: BdVer2Model is only an approx model for Steamroller.
+def : ProcModel<"bdver3", BdVer2Model, ProcessorFeatures.BdVer3Features,
+ ProcessorFeatures.BdVer3Tuning>;
// Excavator
-def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
- ProcessorFeatures.BdVer4Tuning>;
+// NOTE: Znver1Model is only an approx model for Excavator (with AVX2).
+def : ProcModel<"bdver4", Znver1Model, ProcessorFeatures.BdVer4Features,
+ ProcessorFeatures.BdVer4Tuning>;
def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
ProcessorFeatures.ZNTuning>;
diff --git a/llvm/test/CodeGen/X86/lwp-intrinsics.ll b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
index d3ce7f5dbc4d66..6f32b09c838f08 100644
--- a/llvm/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+lwp | FileCheck %s --check-prefixes=X86,X86_LWP
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver1 | FileCheck %s --check-prefixes=X86,X86_BDVER1
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=X86,X86_BDVER2
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver3 | FileCheck %s --check-prefixes=X86,X86_BDVER3
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=X86,X86_BDVER4
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver1 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver3 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=X86,X86_BDVER
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+lwp | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver1 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=X64
@@ -49,41 +49,14 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
; X86_LWP-NEXT: setb %al
; X86_LWP-NEXT: retl
;
-; X86_BDVER1-LABEL: test_lwpins32_rri:
-; X86_BDVER1: # %bb.0:
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER1-NEXT: addl %ecx, %ecx
-; X86_BDVER1-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER1-NEXT: setb %al
-; X86_BDVER1-NEXT: retl
-;
-; X86_BDVER2-LABEL: test_lwpins32_rri:
-; X86_BDVER2: # %bb.0:
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER2-NEXT: addl %ecx, %ecx
-; X86_BDVER2-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER2-NEXT: setb %al
-; X86_BDVER2-NEXT: retl
-;
-; X86_BDVER3-LABEL: test_lwpins32_rri:
-; X86_BDVER3: # %bb.0:
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER3-NEXT: addl %ecx, %ecx
-; X86_BDVER3-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER3-NEXT: setb %al
-; X86_BDVER3-NEXT: retl
-;
-; X86_BDVER4-LABEL: test_lwpins32_rri:
-; X86_BDVER4: # %bb.0:
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER4-NEXT: addl %ecx, %ecx
-; X86_BDVER4-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER4-NEXT: setb %al
-; X86_BDVER4-NEXT: retl
+; X86_BDVER-LABEL: test_lwpins32_rri:
+; X86_BDVER: # %bb.0:
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER-NEXT: addl %ecx, %ecx
+; X86_BDVER-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER-NEXT: setb %al
+; X86_BDVER-NEXT: retl
;
; X64-LABEL: test_lwpins32_rri:
; X64: # %bb.0:
@@ -124,37 +97,13 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
; X86_LWP-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
; X86_LWP-NEXT: retl
;
-; X86_BDVER1-LABEL: test_lwpval32_rri:
-; X86_BDVER1: # %bb.0:
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER1-NEXT: addl %ecx, %ecx
-; X86_BDVER1-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER1-NEXT: retl
-;
-; X86_BDVER2-LABEL: test_lwpval32_rri:
-; X86_BDVER2: # %bb.0:
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER2-NEXT: addl %ecx, %ecx
-; X86_BDVER2-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER2-NEXT: retl
-;
-; X86_BDVER3-LABEL: test_lwpval32_rri:
-; X86_BDVER3: # %bb.0:
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER3-NEXT: addl %ecx, %ecx
-; X86_BDVER3-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER3-NEXT: retl
-;
-; X86_BDVER4-LABEL: test_lwpval32_rri:
-; X86_BDVER4: # %bb.0:
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER4-NEXT: addl %ecx, %ecx
-; X86_BDVER4-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER4-NEXT: retl
+; X86_BDVER-LABEL: test_lwpval32_rri:
+; X86_BDVER: # %bb.0:
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER-NEXT: addl %ecx, %ecx
+; X86_BDVER-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER-NEXT: retl
;
; X64-LABEL: test_lwpval32_rri:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll
index 11d62c307a1ddf..a5349cb33193ff 100644
--- a/llvm/test/CodeGen/X86/rotate_vec.ll
+++ b/llvm/test/CodeGen/X86/rotate_vec.ll
@@ -162,21 +162,13 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
}
define <8 x i16> @or_fshl_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; XOPAVX1-LABEL: or_fshl_v8i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: or_fshl_v8i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: or_fshl_v8i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; XOP-NEXT: vpsrlw $11, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $5, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: or_fshl_v8i16:
; AVX512: # %bb.0:
@@ -193,21 +185,13 @@ define <8 x i16> @or_fshl_v8i16(<8 x i16> %x, <8 x i16> %y) {
}
define <4 x i32> @or_fshl_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; XOPAVX1-LABEL: or_fshl_v4i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrld $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpslld $21, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: or_fshl_v4i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpslld $21, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsrld $11, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: or_fshl_v4i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; XOP-NEXT: vpsrld $11, %xmm0, %xmm0
+; XOP-NEXT: vpslld $21, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: or_fshl_v4i32:
; AVX512: # %bb.0:
More information about the llvm-commits
mailing list