[llvm] [X86] Use proxy scheduler models for bdver3/bdver4 cpus (PR #114873)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 13 03:40:47 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/114873
>From a95ec51ee1538c332b8164b3bcdaca359e77a433 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 4 Nov 2024 21:33:19 +0000
Subject: [PATCH] [X86] Use proxy scheduler models for bdver3/bdver4 cpus
We don't have specific models for bdver3/bdver4 CPUs, but we can use the bdver2/znver1 models as proxy stand-ins - these days the models are more useful for analysis than for perfect instruction scheduling, so these should be fine.
While they don't accurately represent the bdver3/bdver4 architecture (specifically the different fp-pipe layout), they give more accurate latency/throughputs (vs Agner) than the default SandyBridge model, and enable PostRA scheduling which all recent AMD models have benefitted from.
I had to use the znver1 model for bdver4 so that we have AVX2 instruction coverage (none of the TBM/XOP/LWP instructions have explicit schedules, so this shouldn't be a problem) - they both double-pump 256-bit instructions so this works pretty well.
This patch is based off a discussion at the devmtg regarding how easily we can provide an actual scheduler model (or at least approximation) to more of the X86 cpu targets.
---
llvm/lib/Target/X86/X86.td | 10 +--
llvm/test/CodeGen/X86/lwp-intrinsics.ll | 89 ++++++-------------------
llvm/test/CodeGen/X86/rotate_vec.ll | 44 ++++--------
3 files changed, 39 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 509632183dc016..75224d5b26e33f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1912,11 +1912,13 @@ def : ProcModel<"bdver1", BdVer2Model, ProcessorFeatures.BdVer1Features,
def : ProcModel<"bdver2", BdVer2Model, ProcessorFeatures.BdVer2Features,
ProcessorFeatures.BdVer2Tuning>;
// Steamroller
-def : Proc<"bdver3", ProcessorFeatures.BdVer3Features,
- ProcessorFeatures.BdVer3Tuning>;
+// NOTE: BdVer2Model is only an approx model for Steamroller.
+def : ProcModel<"bdver3", BdVer2Model, ProcessorFeatures.BdVer3Features,
+ ProcessorFeatures.BdVer3Tuning>;
// Excavator
-def : Proc<"bdver4", ProcessorFeatures.BdVer4Features,
- ProcessorFeatures.BdVer4Tuning>;
+// NOTE: Znver1Model is only an approx model for Excavator (with AVX2).
+def : ProcModel<"bdver4", Znver1Model, ProcessorFeatures.BdVer4Features,
+ ProcessorFeatures.BdVer4Tuning>;
def : ProcModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures,
ProcessorFeatures.ZNTuning>;
diff --git a/llvm/test/CodeGen/X86/lwp-intrinsics.ll b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
index d3ce7f5dbc4d66..6f32b09c838f08 100644
--- a/llvm/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+lwp | FileCheck %s --check-prefixes=X86,X86_LWP
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver1 | FileCheck %s --check-prefixes=X86,X86_BDVER1
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=X86,X86_BDVER2
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver3 | FileCheck %s --check-prefixes=X86,X86_BDVER3
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=X86,X86_BDVER4
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver1 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver3 | FileCheck %s --check-prefixes=X86,X86_BDVER
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=X86,X86_BDVER
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+lwp | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver1 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=X64
@@ -49,41 +49,14 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
; X86_LWP-NEXT: setb %al
; X86_LWP-NEXT: retl
;
-; X86_BDVER1-LABEL: test_lwpins32_rri:
-; X86_BDVER1: # %bb.0:
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER1-NEXT: addl %ecx, %ecx
-; X86_BDVER1-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER1-NEXT: setb %al
-; X86_BDVER1-NEXT: retl
-;
-; X86_BDVER2-LABEL: test_lwpins32_rri:
-; X86_BDVER2: # %bb.0:
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER2-NEXT: addl %ecx, %ecx
-; X86_BDVER2-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER2-NEXT: setb %al
-; X86_BDVER2-NEXT: retl
-;
-; X86_BDVER3-LABEL: test_lwpins32_rri:
-; X86_BDVER3: # %bb.0:
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER3-NEXT: addl %ecx, %ecx
-; X86_BDVER3-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER3-NEXT: setb %al
-; X86_BDVER3-NEXT: retl
-;
-; X86_BDVER4-LABEL: test_lwpins32_rri:
-; X86_BDVER4: # %bb.0:
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER4-NEXT: addl %ecx, %ecx
-; X86_BDVER4-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86_BDVER4-NEXT: setb %al
-; X86_BDVER4-NEXT: retl
+; X86_BDVER-LABEL: test_lwpins32_rri:
+; X86_BDVER: # %bb.0:
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER-NEXT: addl %ecx, %ecx
+; X86_BDVER-NEXT: lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER-NEXT: setb %al
+; X86_BDVER-NEXT: retl
;
; X64-LABEL: test_lwpins32_rri:
; X64: # %bb.0:
@@ -124,37 +97,13 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
; X86_LWP-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
; X86_LWP-NEXT: retl
;
-; X86_BDVER1-LABEL: test_lwpval32_rri:
-; X86_BDVER1: # %bb.0:
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER1-NEXT: addl %ecx, %ecx
-; X86_BDVER1-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER1-NEXT: retl
-;
-; X86_BDVER2-LABEL: test_lwpval32_rri:
-; X86_BDVER2: # %bb.0:
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER2-NEXT: addl %ecx, %ecx
-; X86_BDVER2-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER2-NEXT: retl
-;
-; X86_BDVER3-LABEL: test_lwpval32_rri:
-; X86_BDVER3: # %bb.0:
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER3-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER3-NEXT: addl %ecx, %ecx
-; X86_BDVER3-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER3-NEXT: retl
-;
-; X86_BDVER4-LABEL: test_lwpval32_rri:
-; X86_BDVER4: # %bb.0:
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER4-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_BDVER4-NEXT: addl %ecx, %ecx
-; X86_BDVER4-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86_BDVER4-NEXT: retl
+; X86_BDVER-LABEL: test_lwpval32_rri:
+; X86_BDVER: # %bb.0:
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER-NEXT: addl %ecx, %ecx
+; X86_BDVER-NEXT: lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER-NEXT: retl
;
; X64-LABEL: test_lwpval32_rri:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll
index 11d62c307a1ddf..a5349cb33193ff 100644
--- a/llvm/test/CodeGen/X86/rotate_vec.ll
+++ b/llvm/test/CodeGen/X86/rotate_vec.ll
@@ -162,21 +162,13 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
}
define <8 x i16> @or_fshl_v8i16(<8 x i16> %x, <8 x i16> %y) {
-; XOPAVX1-LABEL: or_fshl_v8i16:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpsllw $5, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: or_fshl_v8i16:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsllw $5, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsrlw $11, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: or_fshl_v8i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; XOP-NEXT: vpsrlw $11, %xmm0, %xmm0
+; XOP-NEXT: vpsllw $5, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: or_fshl_v8i16:
; AVX512: # %bb.0:
@@ -193,21 +185,13 @@ define <8 x i16> @or_fshl_v8i16(<8 x i16> %x, <8 x i16> %y) {
}
define <4 x i32> @or_fshl_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; XOPAVX1-LABEL: or_fshl_v4i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrld $11, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpslld $21, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: or_fshl_v4i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpslld $21, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpsrld $11, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; XOPAVX2-NEXT: retq
+; XOP-LABEL: or_fshl_v4i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; XOP-NEXT: vpsrld $11, %xmm0, %xmm0
+; XOP-NEXT: vpslld $21, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; XOP-NEXT: retq
;
; AVX512-LABEL: or_fshl_v4i32:
; AVX512: # %bb.0:
More information about the llvm-commits
mailing list