[llvm] c55899f - [DAGCombiner] Hoist funnel shifts from logic operation
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 5 14:02:36 PDT 2022
Author: Filipp Zhinkin
Date: 2022-08-05T17:02:22-04:00
New Revision: c55899f763b6d4510fd77711af3b605fd444e7fa
URL: https://github.com/llvm/llvm-project/commit/c55899f763b6d4510fd77711af3b605fd444e7fa
DIFF: https://github.com/llvm/llvm-project/commit/c55899f763b6d4510fd77711af3b605fd444e7fa.diff
LOG: [DAGCombiner] Hoist funnel shifts from logic operation
Hoist funnel shift from logic op:
logic_op (FSH x0, x1, s), (FSH y0, y1, s) --> FSH (logic_op x0, y0), (logic_op x1, y1), s
The transformation improves code generated for some cases related to
issue https://github.com/llvm/llvm-project/issues/49541.
Reducing the number of funnel shifts can also improve throughput on x86 CPUs by utilizing more
available ports: https://quick-bench.com/q/gC7AKkJJsDZzRrs_JWDzm9t_iDM
Transformation correctness checks:
https://alive2.llvm.org/ce/z/TKPULH
https://alive2.llvm.org/ce/z/UvTd_9
https://alive2.llvm.org/ce/z/j8qW3_
https://alive2.llvm.org/ce/z/7Wq7gE
https://alive2.llvm.org/ce/z/Xr5w8R
https://alive2.llvm.org/ce/z/D5xe_E
https://alive2.llvm.org/ce/z/2yBZiy
Differential Revision: https://reviews.llvm.org/D130994
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
llvm/test/CodeGen/X86/icmp-shift-opt.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1a37c45de7c4b..6b46c28961e50 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5326,6 +5326,21 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
+ // For funnel shifts FSHL/FSHR:
+ // logic_op (OP x, x1, s), (OP y, y1, s) -->
+ // --> OP (logic_op x, y), (logic_op x1, y1), s
+ if ((HandOpcode == ISD::FSHL || HandOpcode == ISD::FSHR) &&
+ N0.getOperand(2) == N1.getOperand(2)) {
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+ SDValue X1 = N0.getOperand(1);
+ SDValue Y1 = N1.getOperand(1);
+ SDValue S = N0.getOperand(2);
+ SDValue Logic0 = DAG.getNode(LogicOpcode, DL, VT, X, Y);
+ SDValue Logic1 = DAG.getNode(LogicOpcode, DL, VT, X1, Y1);
+ return DAG.getNode(HandOpcode, DL, VT, Logic0, Logic1, S);
+ }
+
// Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
// Only perform this optimization up until type legalization, before
// LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
diff --git a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
index b20cca63aa805..fb875837cb836 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-logic-fold.ll
@@ -7,12 +7,11 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone
define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshl_from_or:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: orq %rdx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shldq %cl, %rsi, %rdi
-; X64-NEXT: shldq %cl, %rax, %rdx
-; X64-NEXT: orq %rdi, %rdx
-; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shldq %cl, %rsi, %rax
; X64-NEXT: retq
%fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
%fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -23,12 +22,11 @@ define i64 @hoist_fshl_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
define i64 @hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshl_from_and:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rcx, %rsi
+; X64-NEXT: andq %rdx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shldq %cl, %rsi, %rdi
-; X64-NEXT: shldq %cl, %rax, %rdx
-; X64-NEXT: andq %rdi, %rdx
-; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shldq %cl, %rsi, %rax
; X64-NEXT: retq
%fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
%fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -39,12 +37,11 @@ define i64 @hoist_fshl_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
define i64 @hoist_fshl_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshl_from_xor:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq %rcx, %rsi
+; X64-NEXT: xorq %rdx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shldq %cl, %rsi, %rdi
-; X64-NEXT: shldq %cl, %rax, %rdx
-; X64-NEXT: xorq %rdi, %rdx
-; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: shldq %cl, %rsi, %rax
; X64-NEXT: retq
%fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %s)
%fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 %s)
@@ -69,10 +66,10 @@ define i64 @fshl_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n
define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
; X64-LABEL: hoist_fshl_from_or_const_shift:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shldq $15, %rsi, %rdi
-; X64-NEXT: shldq $15, %rcx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: shldq $15, %rsi, %rax
; X64-NEXT: retq
%fshl.0 = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 15)
%fshl.1 = call i64 @llvm.fshl.i64(i64 %c, i64 %d, i64 15)
@@ -83,11 +80,11 @@ define i64 @hoist_fshl_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounw
define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshr_from_or:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: orq %rdx, %rdi
+; X64-NEXT: orq %rcx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shrdq %cl, %rdi, %rsi
-; X64-NEXT: shrdq %cl, %rdx, %rax
-; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: shrdq %cl, %rdi, %rax
; X64-NEXT: retq
%fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
%fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -98,11 +95,11 @@ define i64 @hoist_fshr_from_or(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshr_from_and:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: andq %rdx, %rdi
+; X64-NEXT: andq %rcx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shrdq %cl, %rdi, %rsi
-; X64-NEXT: shrdq %cl, %rdx, %rax
-; X64-NEXT: andq %rsi, %rax
+; X64-NEXT: shrdq %cl, %rdi, %rax
; X64-NEXT: retq
%fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
%fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -113,11 +110,11 @@ define i64 @hoist_fshr_from_and(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind
define i64 @hoist_fshr_from_xor(i64 %a, i64 %b, i64 %c, i64 %d, i64 %s) nounwind {
; X64-LABEL: hoist_fshr_from_xor:
; X64: # %bb.0:
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: xorq %rdx, %rdi
+; X64-NEXT: xorq %rcx, %rax
; X64-NEXT: movl %r8d, %ecx
-; X64-NEXT: shrdq %cl, %rdi, %rsi
-; X64-NEXT: shrdq %cl, %rdx, %rax
-; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: shrdq %cl, %rdi, %rax
; X64-NEXT: retq
%fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %s)
%fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 %s)
@@ -142,10 +139,10 @@ define i64 @fshr_or_with_different_shift_value(i64 %a, i64 %b, i64 %c, i64 %d) n
define i64 @hoist_fshr_from_or_const_shift(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
; X64-LABEL: hoist_fshr_from_or_const_shift:
; X64: # %bb.0:
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shldq $49, %rsi, %rdi
-; X64-NEXT: shldq $49, %rcx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: shldq $49, %rsi, %rax
; X64-NEXT: retq
%fshr.0 = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 15)
%fshr.1 = call i64 @llvm.fshr.i64(i64 %c, i64 %d, i64 15)
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 23524d23c3ba7..582fae8cf63c4 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -25,12 +25,11 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %edx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: shldl $4, %edx, %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: shrdl $28, %edx, %ebp
; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: shrdl $28, %ebx, %ebp
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %exit
; X86-NEXT: movl %edi, (%eax)
@@ -73,19 +72,15 @@ exit:
define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
; X86-LABEL: opt_setcc_srl_eq_zero:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: shrdl $17, %ecx, %eax
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: shldl $15, %edx, %esi
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: shrdl $17, %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: shldl $15, %edx, %eax
; X86-NEXT: sete %al
-; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: opt_setcc_srl_eq_zero:
@@ -102,19 +97,15 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
; X86-LABEL: opt_setcc_srl_ne_zero:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: shrdl $17, %ecx, %eax
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: shldl $15, %edx, %esi
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: shrdl $17, %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: shldl $15, %edx, %eax
; X86-NEXT: setne %al
-; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: opt_setcc_srl_ne_zero:
@@ -131,19 +122,13 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
; X86-LABEL: opt_setcc_shl_eq_zero:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: shldl $17, %edx, %esi
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: shldl $17, %ecx, %edx
-; X86-NEXT: shldl $17, %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shll $17, %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: sete %al
-; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: opt_setcc_shl_eq_zero:
@@ -160,19 +145,13 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
; X86-LABEL: opt_setcc_shl_ne_zero:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: shldl $17, %edx, %esi
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: shldl $17, %ecx, %edx
-; X86-NEXT: shldl $17, %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shll $17, %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl %ecx, %eax
; X86-NEXT: setne %al
-; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: opt_setcc_shl_ne_zero:
@@ -243,13 +222,11 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
; X86-LABEL: opt_setcc_expanded_shl_correct_shifts:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: shldl $17, %ecx, %edx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: shldl $17, %eax, %ecx
-; X86-NEXT: orl %edx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
More information about the llvm-commits
mailing list