[PATCH] D108935: [SimplifyCFG] Add bonus when seeing vector ops to branch fold to common dest
Arthur Eubanks via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 30 16:33:47 PDT 2021
aeubanks added inline comments.
================
Comment at: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll:268-269
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[IF_END:%.*]], label [[RETURN:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
----------------
lebedev.ri wrote:
> Looks like there are other folds that need to be upated?
> `foldtwoentryphinode` and `speculativelyexecutebb` i guess.
>
Bumping up the threshold in `SpeculativelyExecuteBB()` does fix this issue.
but the first pass of simplifycfg makes this
```
define float @test_separate_anyof_v4sf(<4 x float> %t) {
entry:
%vecext = extractelement <4 x float> %t, i32 0
%conv = fpext float %vecext to double
%cmp = fcmp olt double %conv, 0.000000e+00
%vecext2 = extractelement <4 x float> %t, i32 1
%conv3 = fpext float %vecext2 to double
%cmp4 = fcmp olt double %conv3, 0.000000e+00
%or.cond = select i1 %cmp, i1 true, i1 %cmp4
%vecext7 = extractelement <4 x float> %t, i32 2
%conv8 = fpext float %vecext7 to double
%cmp9 = fcmp olt double %conv8, 0.000000e+00
%or.cond1 = select i1 %or.cond, i1 true, i1 %cmp9
%vecext12 = extractelement <4 x float> %t, i32 3
%conv13 = fpext float %vecext12 to double
%cmp14 = fcmp olt double %conv13, 0.000000e+00
%or.cond2 = select i1 %or.cond1, i1 true, i1 %cmp14
br i1 %or.cond2, label %return, label %if.end
if.end: ; preds = %entry
%vecext16 = extractelement <4 x float> %t, i32 0
%conv17 = fpext float %vecext16 to double
%cmp18 = fcmp ogt double %conv17, 1.000000e+00
%vecext21 = extractelement <4 x float> %t, i32 1
%conv22 = fpext float %vecext21 to double
%cmp23 = fcmp ogt double %conv22, 1.000000e+00
%or.cond3 = select i1 %cmp18, i1 true, i1 %cmp23
%vecext26 = extractelement <4 x float> %t, i32 2
%conv27 = fpext float %vecext26 to double
%cmp28 = fcmp ogt double %conv27, 1.000000e+00
%or.cond4 = select i1 %or.cond3, i1 true, i1 %cmp28
%vecext31 = extractelement <4 x float> %t, i32 3
%conv32 = fpext float %vecext31 to double
%cmp33 = fcmp ogt double %conv32, 1.000000e+00
%or.cond5 = select i1 %or.cond4, i1 true, i1 %cmp33
br i1 %or.cond5, label %return, label %if.end36
if.end36: ; preds = %if.end
%vecext37 = extractelement <4 x float> %t, i32 0
%vecext38 = extractelement <4 x float> %t, i32 1
%add = fadd float %vecext37, %vecext38
br label %return
return: ; preds = %if.end, %entry, %if.end36
%retval.0 = phi float [ %add, %if.end36 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.end ]
ret float %retval.0
}
```
then the last simplifycfg sees
```
define float @test_separate_anyof_v4sf(<4 x float> %t) local_unnamed_addr {
entry:
%vecext = extractelement <4 x float> %t, i32 0
%cmp = fcmp olt float %vecext, 0.000000e+00
%vecext2 = extractelement <4 x float> %t, i32 1
%cmp4 = fcmp olt float %vecext2, 0.000000e+00
%or.cond = select i1 %cmp, i1 true, i1 %cmp4
%vecext7 = extractelement <4 x float> %t, i32 2
%cmp9 = fcmp olt float %vecext7, 0.000000e+00
%or.cond1 = select i1 %or.cond, i1 true, i1 %cmp9
%vecext12 = extractelement <4 x float> %t, i32 3
%cmp14 = fcmp olt float %vecext12, 0.000000e+00
%or.cond2 = select i1 %or.cond1, i1 true, i1 %cmp14
br i1 %or.cond2, label %return, label %if.end
if.end: ; preds = %entry
%cmp18 = fcmp ogt float %vecext, 1.000000e+00
%cmp23 = fcmp ogt float %vecext2, 1.000000e+00
%or.cond3 = select i1 %cmp18, i1 true, i1 %cmp23
%cmp28 = fcmp ogt float %vecext7, 1.000000e+00
%or.cond4 = select i1 %or.cond3, i1 true, i1 %cmp28
%cmp33 = fcmp ogt float %vecext12, 1.000000e+00
%or.cond5 = select i1 %or.cond4, i1 true, i1 %cmp33
%add = fadd float %vecext, %vecext2
%spec.select = select i1 %or.cond5, float 0.000000e+00, float %add
br label %return
return: ; preds = %if.end, %entry
%retval.0 = phi float [ 0.000000e+00, %entry ], [ %spec.select, %if.end ]
ret float %retval.0
}
```
and refuses to merge any blocks unless we make the allowed number of instructions to speculate in `SpeculativelyExecuteBB()` very large
is the previous IR really better than the new IR? or vice versa for the other changed test below?
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D108935/new/
https://reviews.llvm.org/D108935
More information about the llvm-commits
mailing list