[llvm-bugs] [Bug 43205] New: [SimplifyCFG] Performance regression in mergeConditionalStoreToAddress
via llvm-bugs
llvm-bugs at lists.llvm.org
Tue Sep 3 07:37:19 PDT 2019
https://bugs.llvm.org/show_bug.cgi?id=43205
Bug ID: 43205
Summary: [SimplifyCFG] Performance regression in
mergeConditionalStoreToAddress
Product: libraries
Version: trunk
Hardware: Other
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: Scalar Optimizations
Assignee: unassignedbugs at nondot.org
Reporter: kpdev42 at gmail.com
CC: llvm-bugs at lists.llvm.org
Created attachment 22465
--> https://bugs.llvm.org/attachment.cgi?id=22465&action=edit
Test case for the regression
I have encountered a performance regression on trunk for AArch64.
Unfortunately, the test case is not very simple.
But in general this regression can be described as follows:
Clang tries to replace branches with select instructions, but in this test case
that leads to a greater number of instructions and the code becomes 1.5x slower
(on my arm64 Mate10 device).
__attribute__((noinline)) int do_select(const int max_iters_count,
const unsigned long in,
const unsigned long out,
const unsigned long ex,
const unsigned long bit_init_val,
const unsigned long mask)
{
int retval = 0;
for(int k =0 ; k < max_iters_count; k++)
{
fd_set_bits *fds = gv_fds;
unsigned long *rinp = fds->res_in;
for(int i= 0; i < g_max_i; ++i, ++rinp)
{
unsigned long bit = bit_init_val;
unsigned long res_in = 0;
//===== INNER LOOP, WHICH HAS A PROBLEM ========
for(int j = 0; j < BITS_PER_LONG; ++j, bit <<= 1)
{
if (in & bit) {
res_in |= bit;
retval++;
fds->proc = NULL;
}
if (mask & POLLOUT_SET) {
fds->proc = NULL;
}
}
*rinp = res_in;
}
}
return retval;
}
Source code with test case is attached to the message (select.c).
Asm for body of inner loop :
latest clang:
78c: 8a010043 and x3, x2, x1
790: ea01005f tst x2, x1
794: aa030165 orr x5, x11, x3
798: 1a800400 cinc w0, w0, ne
79c: 9a8203e3 csel x3, xzr, x2, eq
7a0: b4000045 cbz x5, 7a8 <do_select+0x60>
7a4: f9001dbf str xzr, [x13,#56]
7a8: aa030210 orr x16, x16, x3
latest clang with -mllvm -phi-node-folding-threshold=1:
78c: ea01025f tst x18, x1
790: 54000080 b.eq 7a0 <do_select+0x58>
794: aa120210 orr x16, x16, x18
798: 11000400 add w0, w0, #0x1
79c: f9001dbf str xzr, [x13,#56]
7a0: b400004b cbz x11, 7a8 <do_select+0x60>
7a4: f9001dbf str xzr, [x13,#56]
I have tested this regression with plenty of input data, and there are no
cases where clang with the default phi-node-folding-threshold=2
is better than clang with phi-node-folding-threshold=1.
The additional (suboptimal) instructions are generated in the SimplifyCFG pass:
1) The function mergeConditionalStoreToAddress transforms the blocks, moving the
store instructions out of two blocks
===============================================================================================================
IR before mergeConditionalStoreToAddress (pay attention to blocks if.end, if.then
and if.then15)
; Function Attrs: noinline norecurse nounwind
define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64
%ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #
0 !dbg !59 {
entry:
call void @llvm.dbg.value(metadata i32 %max_iters_count, metadata !65,
metadata !DIExpression()), !dbg !86
call void @llvm.dbg.value(metadata i64 %in, metadata !66, metadata
!DIExpression()), !dbg !87
call void @llvm.dbg.value(metadata i64 %out, metadata !67, metadata
!DIExpression()), !dbg !88
call void @llvm.dbg.value(metadata i64 %ex, metadata !68, metadata
!DIExpression()), !dbg !89
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !69, metadata
!DIExpression()), !dbg !90
call void @llvm.dbg.value(metadata i64 %mask, metadata !70, metadata
!DIExpression()), !dbg !91
call void @llvm.dbg.value(metadata i32 0, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !72, metadata
!DIExpression()), !dbg !93
%cmp52 = icmp sgt i32 %max_iters_count, 0, !dbg !94
br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !95
for.body.lr.ph: ; preds = %entry
%and13 = and i64 %mask, 780
%tobool14 = icmp eq i64 %and13, 0
br label %for.body, !dbg !95
for.cond.cleanup: ; preds = %for.cond.cleanup5,
%entry
%retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ], !dbg !92
call void @llvm.dbg.value(metadata i32 %retval1.0.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
ret i32 %retval1.0.lcssa, !dbg !96
for.body: ; preds = %for.cond.cleanup5,
%for.body.lr.ph
%retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ]
%k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ]
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %k.053, metadata
!72, metadata !DIExpression()), !dbg !93
%0 = load volatile %struct.fd_set_bits*, %struct.fd_set_bits** @gv_fds111,
align 8, !dbg !97, !tbaa !98
call void @llvm.dbg.value(metadata %struct.fd_set_bits* %0, metadata !74,
metadata !DIExpression()), !dbg !102
%res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits*
%0, i64 0, i32 3, !dbg !103
%1 = load i64*, i64** %res_in, align 8, !dbg !103, !tbaa !104
call void @llvm.dbg.value(metadata i64* %1, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 0, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
%2 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp348 = icmp eq i64 %2, 0, !dbg !112
br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph,
!dbg !113
for.cond8.preheader.lr.ph: ; preds = %for.body
%proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0,
i64 0, i32 7
br label %for.cond8.preheader, !dbg !113
for.cond8.preheader: ; preds =
%for.cond8.preheader.lr.ph, %for.cond.cleanup11
%indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next,
%for.cond.cleanup11 ]
%rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr,
%for.cond.cleanup11 ]
%retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [
%retval1.3.lcssa, %for.cond.cleanup11 ]
call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i64* %rinp.050, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 0, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
br label %for.body12, !dbg !117
for.cond.cleanup5: ; preds =
%for.cond.cleanup11, %for.body
%retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa,
%for.cond.cleanup11 ], !dbg !92
%inc23 = add nuw nsw i32 %k.053, 1, !dbg !118
call void @llvm.dbg.value(metadata i32 %retval1.1.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %inc23, metadata !72, metadata
!DIExpression()), !dbg !93
%exitcond56 = icmp eq i32 %inc23, %max_iters_count, !dbg !94
br i1 %exitcond56, label %for.cond.cleanup, label %for.body, !dbg !95,
!llvm.loop !119
for.cond.cleanup11: ; preds = %for.inc
%retval1.3.lcssa = phi i32 [ %retval1.3, %for.inc ], !dbg !121
%res_in7.1.lcssa = phi i64 [ %res_in7.1, %for.inc ], !dbg !122
call void @llvm.dbg.value(metadata i64 %res_in7.1.lcssa, metadata !83,
metadata !DIExpression()), !dbg !115
store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8, !dbg !123, !tbaa !110
%indvars.iv.next = add nuw i64 %indvars.iv, 1, !dbg !124
%incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1, !dbg !125
call void @llvm.dbg.value(metadata i32 undef, metadata !78, metadata
!DIExpression(DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !108
call void @llvm.dbg.value(metadata i64* %incdec.ptr, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.3.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
%3 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp3 = icmp ugt i64 %3, %indvars.iv.next, !dbg !112
br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5, !dbg !113,
!llvm.loop !126
for.body12: ; preds = %for.inc,
%for.cond8.preheader
%j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %for.inc ]
%res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %for.inc ]
%bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %for.inc
]
%retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3,
%for.inc ]
call void @llvm.dbg.value(metadata i32 %j.047, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.046, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit.044, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.243, metadata !71, metadata
!DIExpression()), !dbg !92
%and = and i64 %bit.044, %in, !dbg !128
%tobool = icmp eq i64 %and, 0, !dbg !128
br i1 %tobool, label %if.end, label %if.then, !dbg !132
if.then: ; preds = %for.body12
%or = or i64 %res_in7.046, %bit.044, !dbg !133
call void @llvm.dbg.value(metadata i64 %or, metadata !83, metadata
!DIExpression()), !dbg !115
%inc = add nsw i32 %retval1.243, 1, !dbg !135
call void @llvm.dbg.value(metadata i32 %inc, metadata !71, metadata
!DIExpression()), !dbg !92
store i8* null, i8** %proc, align 8, !dbg !136, !tbaa !137
br label %if.end, !dbg !138
if.end: ; preds = %for.body12,
%if.then
%retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ], !dbg
!121
%res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ], !dbg
!122
br i1 %tobool14, label %for.inc, label %if.then15, !dbg !139
if.then15: ; preds = %if.end
store i8* null, i8** %proc, align 8, !dbg !140, !tbaa !137
br label %for.inc, !dbg !143
for.inc: ; preds = %if.end, %if.then15
%inc18 = add nuw nsw i32 %j.047, 1, !dbg !144
%shl = shl i64 %bit.044, 1, !dbg !145
call void @llvm.dbg.value(metadata i32 %inc18, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.1, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %shl, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.3, metadata !71, metadata
!DIExpression()), !dbg !92
%exitcond = icmp eq i32 %inc18, 64, !dbg !146
br i1 %exitcond, label %for.cond.cleanup11, label %for.body12, !dbg !117,
!llvm.loop !147
}
===============================================================================================================
IR after mergeConditionalStoreToAddress (pay attention to blocks if.end, if.then
and if.then15 and how they were transformed)
; Function Attrs: noinline norecurse nounwind
define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64
%ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #0 !dbg !59 {
entry:
call void @llvm.dbg.value(metadata i32 %max_iters_count, metadata !65,
metadata !DIExpression()), !dbg !86
call void @llvm.dbg.value(metadata i64 %in, metadata !66, metadata
!DIExpression()), !dbg !87
call void @llvm.dbg.value(metadata i64 %out, metadata !67, metadata
!DIExpression()), !dbg !88
call void @llvm.dbg.value(metadata i64 %ex, metadata !68, metadata
!DIExpression()), !dbg !89
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !69, metadata
!DIExpression()), !dbg !90
call void @llvm.dbg.value(metadata i64 %mask, metadata !70, metadata
!DIExpression()), !dbg !91
call void @llvm.dbg.value(metadata i32 0, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !72, metadata
!DIExpression()), !dbg !93
%cmp52 = icmp sgt i32 %max_iters_count, 0, !dbg !94
br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !95
for.body.lr.ph: ; preds = %entry
%and13 = and i64 %mask, 780
%tobool14 = icmp eq i64 %and13, 0
br label %for.body, !dbg !95
for.cond.cleanup: ; preds = %for.cond.cleanup5,
%entry
%retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ], !dbg !92
call void @llvm.dbg.value(metadata i32 %retval1.0.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
ret i32 %retval1.0.lcssa, !dbg !96
for.body: ; preds = %for.cond.cleanup5,
%for.body.lr.ph
%retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ]
%k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ]
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %k.053, metadata !72, metadata
!DIExpression()), !dbg !93
%0 = load volatile %struct.fd_set_bits*, %struct.fd_set_bits** @gv_fds111,
align 8, !dbg !97, !tbaa !98
call void @llvm.dbg.value(metadata %struct.fd_set_bits* %0, metadata !74,
metadata !DIExpression()), !dbg !102
%res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits*
%0, i64 0, i32 3, !dbg !103
%1 = load i64*, i64** %res_in, align 8, !dbg !103, !tbaa !104
call void @llvm.dbg.value(metadata i64* %1, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 0, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
%2 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp348 = icmp eq i64 %2, 0, !dbg !112
br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph,
!dbg !113
for.cond8.preheader.lr.ph: ; preds = %for.body
%proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0,
i64 0, i32 7
br label %for.cond8.preheader, !dbg !113
for.cond8.preheader: ; preds =
%for.cond8.preheader.lr.ph, %for.cond.cleanup11
%indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next,
%for.cond.cleanup11 ]
%rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr,
%for.cond.cleanup11 ]
%retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [
%retval1.3.lcssa, %for.cond.cleanup11 ]
call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i64* %rinp.050, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 0, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
br label %for.body12, !dbg !117
for.cond.cleanup5: ; preds =
%for.cond.cleanup11, %for.body
%retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa,
%for.cond.cleanup11 ], !dbg !92
%inc23 = add nuw nsw i32 %k.053, 1, !dbg !118
call void @llvm.dbg.value(metadata i32 %retval1.1.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %inc23, metadata !72, metadata
!DIExpression()), !dbg !93
%exitcond56 = icmp eq i32 %inc23, %max_iters_count, !dbg !94
br i1 %exitcond56, label %for.cond.cleanup, label %for.body, !dbg !95,
!llvm.loop !119
for.cond.cleanup11: ; preds = %8
%retval1.3.lcssa = phi i32 [ %retval1.3, %8 ], !dbg !121
%res_in7.1.lcssa = phi i64 [ %res_in7.1, %8 ], !dbg !122
call void @llvm.dbg.value(metadata i64 %res_in7.1.lcssa, metadata !83,
metadata !DIExpression()), !dbg !115
store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8, !dbg !123, !tbaa !110
%indvars.iv.next = add nuw i64 %indvars.iv, 1, !dbg !124
%incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1, !dbg !125
call void @llvm.dbg.value(metadata i32 undef, metadata !78, metadata
!DIExpression(DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !108
call void @llvm.dbg.value(metadata i64* %incdec.ptr, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.3.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
%3 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp3 = icmp ugt i64 %3, %indvars.iv.next, !dbg !112
br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5, !dbg !113,
!llvm.loop !126
for.body12: ; preds = %8,
%for.cond8.preheader
%j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %8 ]
%res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %8 ]
%bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %8 ]
%retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3,
%8 ]
call void @llvm.dbg.value(metadata i32 %j.047, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.046, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit.044, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.243, metadata !71, metadata
!DIExpression()), !dbg !92
%and = and i64 %bit.044, %in, !dbg !128
%tobool = icmp eq i64 %and, 0, !dbg !128
br i1 %tobool, label %if.end, label %if.then, !dbg !132
if.then: ; preds = %for.body12
%or = or i64 %res_in7.046, %bit.044, !dbg !133
call void @llvm.dbg.value(metadata i64 %or, metadata !83, metadata
!DIExpression()), !dbg !115
%inc = add nsw i32 %retval1.243, 1, !dbg !135
call void @llvm.dbg.value(metadata i32 %inc, metadata !71, metadata
!DIExpression()), !dbg !92
br label %if.end, !dbg !136
if.end: ; preds = %for.body12,
%if.then
%retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ], !dbg
!121
%res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ], !dbg
!122
br i1 %tobool14, label %for.inc, label %if.then15, !dbg !137
if.then15: ; preds = %if.end
br label %for.inc, !dbg !138
for.inc: ; preds = %if.end, %if.then15
%simplifycfg.merge = phi i8* [ null, %if.then15 ], [ null, %if.end ]
%4 = xor i1 %tobool, true, !dbg !141
%5 = xor i1 %tobool14, true, !dbg !141
%6 = or i1 %4, %5, !dbg !141
br i1 %6, label %7, label %8, !dbg !141
; <label>:7: ; preds = %for.inc
store i8* %simplifycfg.merge, i8** %proc, align 8, !dbg !141, !tbaa !142
br label %8, !dbg !141
; <label>:8: ; preds = %for.inc, %7
%inc18 = add nuw nsw i32 %j.047, 1, !dbg !141
%shl = shl i64 %bit.044, 1, !dbg !143
call void @llvm.dbg.value(metadata i32 %inc18, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.1, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %shl, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.3, metadata !71, metadata
!DIExpression()), !dbg !92
%exitcond = icmp eq i32 %inc18, 64, !dbg !144
br i1 %exitcond, label %for.cond.cleanup11, label %for.body12, !dbg !117,
!llvm.loop !145
}
2) The function FoldTwoEntryPHINode tries to merge these blocks and inserts two
select instructions, which leads to the regression
===============================================================================================================
IR after FoldTwoEntryPHINode (note that the instructions of block if.then were
merged into for.body12, which leads to the regression)
; Function Attrs: noinline norecurse nounwind
define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64
%ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #0 !dbg !59 {
entry:
call void @llvm.dbg.value(metadata i32 %max_iters_count, metadata !65,
metadata !DIExpression()), !dbg !86
call void @llvm.dbg.value(metadata i64 %in, metadata !66, metadata
!DIExpression()), !dbg !87
call void @llvm.dbg.value(metadata i64 %out, metadata !67, metadata
!DIExpression()), !dbg !88
call void @llvm.dbg.value(metadata i64 %ex, metadata !68, metadata
!DIExpression()), !dbg !89
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !69, metadata
!DIExpression()), !dbg !90
call void @llvm.dbg.value(metadata i64 %mask, metadata !70, metadata
!DIExpression()), !dbg !91
call void @llvm.dbg.value(metadata i32 0, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !72, metadata
!DIExpression()), !dbg !93
%cmp52 = icmp sgt i32 %max_iters_count, 0, !dbg !94
br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup, !dbg !95
for.body.lr.ph: ; preds = %entry
%and13 = and i64 %mask, 780
%tobool14 = icmp eq i64 %and13, 0
br label %for.body, !dbg !95
for.cond.cleanup: ; preds = %for.cond.cleanup5,
%entry
%retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ], !dbg !92
call void @llvm.dbg.value(metadata i32 %retval1.0.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
ret i32 %retval1.0.lcssa, !dbg !96
for.body: ; preds = %for.cond.cleanup5,
%for.body.lr.ph
%retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa,
%for.cond.cleanup5 ]
%k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ]
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %k.053, metadata !72, metadata
!DIExpression()), !dbg !93
%0 = load volatile %struct.fd_set_bits*, %struct.fd_set_bits** @gv_fds111,
align 8, !dbg !97, !tbaa !98
call void @llvm.dbg.value(metadata %struct.fd_set_bits* %0, metadata !74,
metadata !DIExpression()), !dbg !102
%res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits*
%0, i64 0, i32 3, !dbg !103
%1 = load i64*, i64** %res_in, align 8, !dbg !103, !tbaa !104
call void @llvm.dbg.value(metadata i64* %1, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 0, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i32 %retval1.054, metadata !71, metadata
!DIExpression()), !dbg !92
%2 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp348 = icmp eq i64 %2, 0, !dbg !112
br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph,
!dbg !113
for.cond8.preheader.lr.ph: ; preds = %for.body
%proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0,
i64 0, i32 7
br label %for.cond8.preheader, !dbg !113
for.cond8.preheader: ; preds =
%for.cond8.preheader.lr.ph, %for.cond.cleanup11
%indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next,
%for.cond.cleanup11 ]
%rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr,
%for.cond.cleanup11 ]
%retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [
%retval1.3.lcssa, %for.cond.cleanup11 ]
call void @llvm.dbg.value(metadata i64 %indvars.iv, metadata !78, metadata
!DIExpression()), !dbg !108
call void @llvm.dbg.value(metadata i64* %rinp.050, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 0, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 0, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit_init_val, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.149, metadata !71, metadata
!DIExpression()), !dbg !92
br label %for.body12, !dbg !117
for.cond.cleanup5: ; preds =
%for.cond.cleanup11, %for.body
%retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa,
%for.cond.cleanup11 ], !dbg !92
%inc23 = add nuw nsw i32 %k.053, 1, !dbg !118
call void @llvm.dbg.value(metadata i32 %retval1.1.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
call void @llvm.dbg.value(metadata i32 %inc23, metadata !72, metadata
!DIExpression()), !dbg !93
%exitcond56 = icmp eq i32 %inc23, %max_iters_count, !dbg !94
br i1 %exitcond56, label %for.cond.cleanup, label %for.body, !dbg !95,
!llvm.loop !119
for.cond.cleanup11: ; preds = %8
%retval1.3.lcssa = phi i32 [ %retval1.3, %8 ], !dbg !121
%res_in7.1.lcssa = phi i64 [ %res_in7.1, %8 ], !dbg !122
call void @llvm.dbg.value(metadata i64 %res_in7.1.lcssa, metadata !83,
metadata !DIExpression()), !dbg !115
store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8, !dbg !123, !tbaa !110
%indvars.iv.next = add nuw i64 %indvars.iv, 1, !dbg !124
%incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1, !dbg !125
call void @llvm.dbg.value(metadata i32 undef, metadata !78, metadata
!DIExpression(DW_OP_plus_uconst, 1, DW_OP_stack_value)), !dbg !108
call void @llvm.dbg.value(metadata i64* %incdec.ptr, metadata !77, metadata
!DIExpression()), !dbg !107
call void @llvm.dbg.value(metadata i32 %retval1.3.lcssa, metadata !71,
metadata !DIExpression()), !dbg !92
%3 = load volatile i64, i64* @g_max_i, align 8, !dbg !109, !tbaa !110
%cmp3 = icmp ugt i64 %3, %indvars.iv.next, !dbg !112
br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5, !dbg !113,
!llvm.loop !126
for.body12: ; preds = %8,
%for.cond8.preheader
%j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %8 ]
%res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %8 ]
%bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %8 ]
%retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3,
%8 ]
call void @llvm.dbg.value(metadata i32 %j.047, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.046, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %bit.044, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.243, metadata !71, metadata
!DIExpression()), !dbg !92
%and = and i64 %bit.044, %in, !dbg !128
%tobool = icmp eq i64 %and, 0, !dbg !128
%or = or i64 %res_in7.046, %bit.044, !dbg !132
%inc = add nsw i32 %retval1.243, 1, !dbg !132
%retval1.3 = select i1 %tobool, i32 %retval1.243, i32 %inc, !dbg !132
%res_in7.1 = select i1 %tobool, i64 %res_in7.046, i64 %or, !dbg !132
br label %if.end, !dbg !132
if.then: ; No predecessors!
br label %if.end, !dbg !132
if.end: ; preds = %for.body12,
%if.then
br i1 %tobool14, label %for.inc, label %if.then15, !dbg !133
if.then15: ; preds = %if.end
br label %for.inc, !dbg !134
for.inc: ; preds = %if.end, %if.then15
%simplifycfg.merge = phi i8* [ null, %if.then15 ], [ null, %if.end ]
%4 = xor i1 %tobool, true, !dbg !137
%5 = xor i1 %tobool14, true, !dbg !137
%6 = or i1 %4, %5, !dbg !137
br i1 %6, label %7, label %8, !dbg !137
; <label>:7: ; preds = %for.inc
store i8* %simplifycfg.merge, i8** %proc, align 8, !dbg !137, !tbaa !138
br label %8, !dbg !137
; <label>:8: ; preds = %for.inc, %7
%inc18 = add nuw nsw i32 %j.047, 1, !dbg !137
%shl = shl i64 %bit.044, 1, !dbg !139
call void @llvm.dbg.value(metadata i32 %inc18, metadata !84, metadata
!DIExpression()), !dbg !114
call void @llvm.dbg.value(metadata i64 %res_in7.1, metadata !83, metadata
!DIExpression()), !dbg !115
call void @llvm.dbg.value(metadata i64 %shl, metadata !80, metadata
!DIExpression()), !dbg !116
call void @llvm.dbg.value(metadata i32 %retval1.3, metadata !71, metadata
!DIExpression()), !dbg !92
%exitcond = icmp eq i32 %inc18, 64, !dbg !140
br i1 %exitcond, label %for.cond.cleanup11, label %for.body12, !dbg !117,
!llvm.loop !141
}
===============================================================================================================
After FoldTwoEntryPHINode, one IR instruction (`or`) is split into two
machine instructions (`csel` and `orr`).
This regression can be avoided in two ways:
1) By passing the option -mllvm -phi-node-folding-threshold=1 (the default
threshold is 2) to both the compiler and the linker (because I use LTO). However,
this will influence some other transformations too.
2) By changing the source code of SimplifyCFG.cpp, adding some heuristics that
increase the cost of folding these basic blocks (depending on the backend).
In this case the changes can be applied to the lambda IsWorthwhile in the function
mergeConditionalStoreToAddress (SimplifyCFG.cpp).
Something like this (it is just a sketch of the patch):
auto IsWorthwhile = [&](BasicBlock *BB) {
if (!BB)
return true;
// Heuristic: if the block can be if-converted/phi-folded and the
// instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
// thread this store.
unsigned N = 0;
for (auto &I : BB->instructionsWithoutDebug()) {
// Cheap instructions viable for folding.
if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) ||
- isa<StoreInst>(I))
+ isa<StoreInst>(I)) {
++N;
+ // Check if this "cheap" instruction has additional cost
+ // for blocks folding
+ if (TTI.isExpensiveForFolding(I)) {
+ ++N;
+ }
}
But maybe this solution is too naive and the real cause of the regression lies
somewhere in another optimization
that cannot handle this pattern correctly. I would appreciate any ideas about
it.
As far as I can see, this bug is similar to
https://bugs.llvm.org/show_bug.cgi?id=22616 but I am not sure whether it is a
duplicate.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20190903/cac331da/attachment-0001.html>
More information about the llvm-bugs
mailing list