[llvm] [X86] Enable TuningSlowDivide64 on Barcelona/Bobcat/Bulldozer/Ryzen Families (PR #91277)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon May 6 15:14:41 PDT 2024
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/91277
Despite most AMD CPUs having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible.
All AMD CPUs appear to have been missed when we added the "idivq-to-divl" attribute - this now matches Intel CPU behaviour (and the x86-64-v2/v3/v4 levels).
Unfortunately, the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for the divl/divq pairs.
Fixes #90985
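For reference, the bypass being enabled here conceptually works as sketched below (a minimal C illustration only - the function name and exact form are mine, not LLVM's actual codegen): when the "idivq-to-divl" tuning is set, the 64-bit divide is guarded by a check that both operands fit in 32 bits, so the cheaper 32-bit divide can be used on the common path.

#include <stdint.h>

/* Illustrative sketch (assumption: not LLVM's actual output) of the
 * TuningSlowDivide64 / "idivq-to-divl" bypass for unsigned division. */
uint64_t udiv_bypass(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0)
    return (uint32_t)a / (uint32_t)b;  /* both fit in 32 bits: divl */
  return a / b;                        /* full 64-bit divide: divq */
}

This matches the guard emitted in the test below (orq/shrq $32/je before choosing divl or divq).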
From 48d26c24882b88be469cf36f9f41b516931bdf8f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 6 May 2024 23:12:30 +0100
Subject: [PATCH] [X86] Enable TuningSlowDivide64 on
Barcelona/Bobcat/Bulldozer/Ryzen Families
Despite most AMD CPUs having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible.
All AMD CPUs appear to have been missed when we added the "idivq-to-divl" attribute - this now matches most Intel CPU behaviour (and the x86-64-v2/v3/v4 levels).
Unfortunately, the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for the divl/divq pairs.
Fixes #90985
---
llvm/lib/Target/X86/X86.td | 5 +
.../CodeGen/X86/bypass-slow-division-64.ll | 139 +++---------------
2 files changed, 28 insertions(+), 116 deletions(-)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e731947893de9..aaf1756e858208 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1341,6 +1341,7 @@ def ProcessorFeatures {
FeatureCMOV,
FeatureX86_64];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
@@ -1363,6 +1364,7 @@ def ProcessorFeatures {
list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
@@ -1385,6 +1387,7 @@ def ProcessorFeatures {
TuningFastVectorShiftMasks,
TuningFastMOVBE,
TuningSBBDepBreaking,
+ TuningSlowDivide64,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1409,6 +1412,7 @@ def ProcessorFeatures {
FeatureLWP,
FeatureLAHFSAHF64];
list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+ TuningSlowDivide64,
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
@@ -1488,6 +1492,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVariablePerLaneShuffle,
TuningFastMOVBE,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 66d7082d9b7c55..0c46501e4b9717 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Check that 64-bit division is bypassed correctly.
; RUN: llc < %s -mtriple=x86_64-- -mattr=-idivq-to-divl | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+idivq-to-divl | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
@@ -13,17 +12,17 @@
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; AMD
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; Additional tests for 64-bit divide bypass
@@ -40,22 +39,8 @@ define i64 @sdiv_quotient(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_quotient:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB0_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB0_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = sdiv i64 %a, %b
ret i64 %result
}
@@ -92,23 +77,8 @@ define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB3_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: movq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB3_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: movl %edx, %eax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = srem i64 %a, %b
ret i64 %result
}
@@ -147,25 +117,8 @@ define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_quotient_and_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB6_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB6_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%resultdiv = sdiv i64 %a, %b
%resultrem = srem i64 %a, %b
%result = add i64 %resultdiv, %resultrem
@@ -213,22 +166,8 @@ define i64 @udiv_quotient(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_quotient:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB9_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB9_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = udiv i64 %a, %b
ret i64 %result
}
@@ -265,23 +204,8 @@ define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB12_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: movq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB12_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: movl %edx, %eax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = urem i64 %a, %b
ret i64 %result
}
@@ -320,25 +244,8 @@ define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_quotient_and_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB15_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB15_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%resultdiv = udiv i64 %a, %b
%resultrem = urem i64 %a, %b
%result = add i64 %resultdiv, %resultrem