[PATCH] D108935: [SimplifyCFG] Add bonus when seeing vector ops to branch fold to common dest

Mon Aug 30 16:33:47 PDT 2021

aeubanks added inline comments.

================
Comment at: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll:268-269
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i4 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[IF_END:%.*]], label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[T_FR]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
----------------
lebedev.ri wrote:
> Looks like there are other folds that need to be upated?
> `foldtwoentryphinode` and `speculativelyexecutebb` i guess.
> 
Bumping up the threshold in `SpeculativelyExecuteBB()` does fix this issue.

but the first pass of simplifycfg makes this
```
define float @test_separate_anyof_v4sf(<4 x float> %t) {
entry:
  %vecext = extractelement <4 x float> %t, i32 0
  %conv = fpext float %vecext to double
  %cmp = fcmp olt double %conv, 0.000000e+00
  %vecext2 = extractelement <4 x float> %t, i32 1
  %conv3 = fpext float %vecext2 to double
  %cmp4 = fcmp olt double %conv3, 0.000000e+00
  %or.cond = select i1 %cmp, i1 true, i1 %cmp4
  %vecext7 = extractelement <4 x float> %t, i32 2
  %conv8 = fpext float %vecext7 to double
  %cmp9 = fcmp olt double %conv8, 0.000000e+00
  %or.cond1 = select i1 %or.cond, i1 true, i1 %cmp9
  %vecext12 = extractelement <4 x float> %t, i32 3
  %conv13 = fpext float %vecext12 to double
  %cmp14 = fcmp olt double %conv13, 0.000000e+00
  %or.cond2 = select i1 %or.cond1, i1 true, i1 %cmp14
  br i1 %or.cond2, label %return, label %if.end

if.end:                                           ; preds = %entry
  %vecext16 = extractelement <4 x float> %t, i32 0
  %conv17 = fpext float %vecext16 to double
  %cmp18 = fcmp ogt double %conv17, 1.000000e+00
  %vecext21 = extractelement <4 x float> %t, i32 1
  %conv22 = fpext float %vecext21 to double
  %cmp23 = fcmp ogt double %conv22, 1.000000e+00
  %or.cond3 = select i1 %cmp18, i1 true, i1 %cmp23
  %vecext26 = extractelement <4 x float> %t, i32 2
  %conv27 = fpext float %vecext26 to double
  %cmp28 = fcmp ogt double %conv27, 1.000000e+00
  %or.cond4 = select i1 %or.cond3, i1 true, i1 %cmp28
  %vecext31 = extractelement <4 x float> %t, i32 3
  %conv32 = fpext float %vecext31 to double
  %cmp33 = fcmp ogt double %conv32, 1.000000e+00
  %or.cond5 = select i1 %or.cond4, i1 true, i1 %cmp33
  br i1 %or.cond5, label %return, label %if.end36

if.end36:                                         ; preds = %if.end
  %vecext37 = extractelement <4 x float> %t, i32 0
  %vecext38 = extractelement <4 x float> %t, i32 1
  %add = fadd float %vecext37, %vecext38
  br label %return

return:                                           ; preds = %if.end, %entry, %if.end36
  %retval.0 = phi float [ %add, %if.end36 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.end ]
  ret float %retval.0
}
```

then the last simplifycfg sees

```
define float @test_separate_anyof_v4sf(<4 x float> %t) local_unnamed_addr {
entry:
  %vecext = extractelement <4 x float> %t, i32 0
  %cmp = fcmp olt float %vecext, 0.000000e+00
  %vecext2 = extractelement <4 x float> %t, i32 1
  %cmp4 = fcmp olt float %vecext2, 0.000000e+00
  %or.cond = select i1 %cmp, i1 true, i1 %cmp4
  %vecext7 = extractelement <4 x float> %t, i32 2
  %cmp9 = fcmp olt float %vecext7, 0.000000e+00
  %or.cond1 = select i1 %or.cond, i1 true, i1 %cmp9
  %vecext12 = extractelement <4 x float> %t, i32 3
  %cmp14 = fcmp olt float %vecext12, 0.000000e+00
  %or.cond2 = select i1 %or.cond1, i1 true, i1 %cmp14
  br i1 %or.cond2, label %return, label %if.end

if.end:                                           ; preds = %entry
  %cmp18 = fcmp ogt float %vecext, 1.000000e+00
  %cmp23 = fcmp ogt float %vecext2, 1.000000e+00
  %or.cond3 = select i1 %cmp18, i1 true, i1 %cmp23
  %cmp28 = fcmp ogt float %vecext7, 1.000000e+00
  %or.cond4 = select i1 %or.cond3, i1 true, i1 %cmp28
  %cmp33 = fcmp ogt float %vecext12, 1.000000e+00
  %or.cond5 = select i1 %or.cond4, i1 true, i1 %cmp33
  %add = fadd float %vecext, %vecext2
  %spec.select = select i1 %or.cond5, float 0.000000e+00, float %add
  br label %return

return:                                           ; preds = %if.end, %entry
  %retval.0 = phi float [ 0.000000e+00, %entry ], [ %spec.select, %if.end ]
  ret float %retval.0
}
```

and refuses to merge any blocks unless we make the allowed number of instructions to speculate in `SpeculativelyExecuteBB()` very large

is the previous IR really better than the new IR? or vice versa for the other changed test below?

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D108935/new/

https://reviews.llvm.org/D108935