[PATCH] D89479: [SimplifyCFG] Be more conservative when speculating in loops. (WIP)

Thu Oct 15 13:45:09 PDT 2020

fhahn added a comment.

In D89479#2333065 <https://reviews.llvm.org/D89479#2333065>, @spatel wrote:

> Another reason that we would likely want a finer-grain solution: recent AMD implementations appear to have full-speed lzcnt (1 cycle and full throughput according to Agner's tables for Jaguar and Ryzen).

Yeah, the issue here is really the throughput/number of execution units available together with the number of cycles. I guess we could ask TTI about that and get roughly sane results?

================
Comment at: llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll:416
 ; ALL-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0
+; ALL-NEXT:    br i1 [[TOBOOL]], label [[COND_END]], label [[COND_TRUE:%.*]]
+; ALL:       cond.true:
----------------
craig.topper wrote:
> Do you have a better example more like the loops you're seeing performance issues on?
> 
> This one looks kind of silly since %x is loop invariant.
I can make it more complex. The original IR from the benchmark is below (I can also provide a runnable version, but it requires downloading some Swift libraries for macOS):

```
; @wobble(i64 %arg, %struct.blam* %arg1) -> i64
;
; Returns 0 when the bound %tmp8 is zero. Otherwise runs a loop that advances
; an i64 offset (%tmp24) by a per-iteration step (%tmp55) while the new offset
; is signed-less-than %tmp8, and returns the number of iterations executed.
; Traps if the iteration counter overflows.
;
; The two @llvm.ctlz.i8 calls (in %bb35 and %bb50) sit in conditionally
; executed blocks inside the loop.
; NOTE(review): presumably these are the speculation candidates this example
; is meant to illustrate -- confirm against the review thread.
; NOTE(review): the named struct types, attribute groups (#3, #5) and metadata
; nodes (!16-!20) referenced here are not defined in this excerpt.
define hidden swiftcc i64 @wobble(i64 %arg, %struct.blam* %arg1) local_unnamed_addr #3 {
bb:
  ; Stack temporary holding two packed %struct.pluto values; only written and
  ; read on the %bb31 path.
  %tmp = alloca <{ %struct.pluto, %struct.pluto }>, align 8
  ; The pointer argument is reinterpreted as an integer and its bits are
  ; inspected below.
  %tmp2 = ptrtoint %struct.blam* %arg1 to i64
  ; %tmp4 = (bit 61 of %tmp2 is clear); 2305843009213693952 == 1 << 61.
  %tmp3 = and i64 %tmp2, 2305843009213693952
  %tmp4 = icmp eq i64 %tmp3, 0
  ; %tmp6 = bits 56..59 of %tmp2.
  %tmp5 = lshr i64 %tmp2, 56
  %tmp6 = and i64 %tmp5, 15
  ; %tmp7 = low 48 bits of %arg; 281474976710655 == (1 << 48) - 1.
  %tmp7 = and i64 %arg, 281474976710655
  ; Loop bound: low 48 bits of %arg when bit 61 is clear, else the 4-bit field.
  %tmp8 = select i1 %tmp4, i64 %tmp7, i64 %tmp6
  ; A zero bound skips the loop and returns 0 via %bb64.
  %tmp9 = icmp eq i64 %tmp8, 0
  br i1 %tmp9, label %bb64, label %bb10, !prof !16, !misexpect !17

; Loop preheader: everything computed here is loop-invariant.
bb10:                                             ; preds = %bb
  ; %tmp12 = (bit 60 of %tmp2 is clear); 1152921504606846976 == 1 << 60.
  %tmp11 = and i64 %tmp2, 1152921504606846976
  %tmp12 = icmp eq i64 %tmp11, 0
  %tmp13 = bitcast <{ %struct.pluto, %struct.pluto }>* %tmp to i8*
  ; %tmp14 = low 56 bits of %tmp2; 72057594037927935 == (1 << 56) - 1.
  %tmp14 = and i64 %tmp2, 72057594037927935
  ; Addresses of the first field of each of the two %struct.pluto members.
  %tmp15 = getelementptr inbounds <{ %struct.pluto, %struct.pluto }>, <{ %struct.pluto, %struct.pluto }>* %tmp, i64 0, i32 0, i32 0
  %tmp16 = getelementptr inbounds <{ %struct.pluto, %struct.pluto }>, <{ %struct.pluto, %struct.pluto }>* %tmp, i64 0, i32 1, i32 0
  ; The same stack slot viewed as an array of %struct.barney for byte loads.
  %tmp17 = bitcast <{ %struct.pluto, %struct.pluto }>* %tmp to %struct.barney*
  ; %tmp19 = (bit 60 of %arg is clear).
  %tmp18 = and i64 %arg, 1152921504606846976
  %tmp19 = icmp eq i64 %tmp18, 0
  ; %tmp21 = (low 60 bits of %tmp2) + 32; used as a base address in %bb44.
  %tmp20 = and i64 %tmp2, 1152921504606846975
  %tmp21 = add nuw nsw i64 %tmp20, 32
  br label %bb22

; Loop header. %tmp23 counts iterations; %tmp24 is the running offset.
bb22:                                             ; preds = %bb59, %bb10
  %tmp23 = phi i64 [ 0, %bb10 ], [ %tmp60, %bb59 ]
  %tmp24 = phi i64 [ 0, %bb10 ], [ %tmp56, %bb59 ]
  ; Branch on a loop-invariant condition: bit 60 of %tmp2 clear -> %bb25,
  ; set -> out-of-line call in %bb26.
  br i1 %tmp12, label %bb25, label %bb26

bb25:                                             ; preds = %bb22
  ; Loop-invariant: bit 61 of %tmp2 clear -> %bb30, set -> %bb31.
  br i1 %tmp4, label %bb30, label %bb31

; Step is obtained from the second result of @snork, called with the offset
; shifted left by 16.
bb26:                                             ; preds = %bb22
  %tmp27 = shl i64 %tmp24, 16
  %tmp28 = tail call swiftcc { i32, i64 } @snork(i64 %tmp27, i64 %arg, %struct.blam* %arg1)
  %tmp29 = extractvalue { i32, i64 } %tmp28, 1
  br label %bb54

bb30:                                             ; preds = %bb25
  ; Loop-invariant: bit 60 of %arg clear -> fetch the base via @bar in %bb41,
  ; otherwise use the precomputed %tmp21 in %bb44.
  br i1 %tmp19, label %bb41, label %bb44, !prof !16, !misexpect !18

; Bytes are read from the stack temporary: store %arg and the masked pointer
; bits into it, then load the byte at index %tmp24.
bb31:                                             ; preds = %bb25
  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %tmp13)
  store i64 %arg, i64* %tmp15, align 8
  store i64 %tmp14, i64* %tmp16, align 8
  %tmp32 = getelementptr inbounds %struct.barney, %struct.barney* %tmp17, i64 %tmp24, i32 0
  %tmp33 = load i8, i8* %tmp32, align 1
  ; Signed compare against -1: true when the byte's top bit is clear, in which
  ; case the step is 1 (see the phi in %bb39).
  %tmp34 = icmp sgt i8 %tmp33, -1
  br i1 %tmp34, label %bb39, label %bb35

; Top bit set: step = count of leading one bits of the byte, computed as
; ctlz(~byte). The i1 false flag requests a defined result for a zero input.
bb35:                                             ; preds = %bb31
  %tmp36 = xor i8 %tmp33, -1
  %tmp37 = tail call i8 @llvm.ctlz.i8(i8 %tmp36, i1 false), !range !19
  %tmp38 = zext i8 %tmp37 to i64
  br label %bb39

bb39:                                             ; preds = %bb35, %bb31
  %tmp40 = phi i64 [ 1, %bb31 ], [ %tmp38, %bb35 ]
  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %tmp13)
  br label %bb54

; Base address comes from the first result of @bar.
bb41:                                             ; preds = %bb30
  %tmp42 = tail call swiftcc { i64, i64 } @bar(i64 %arg, %struct.blam* %arg1)
  %tmp43 = extractvalue { i64, i64 } %tmp42, 0
  br label %bb44

; Load the byte at index %tmp24 from the selected base address.
bb44:                                             ; preds = %bb41, %bb30
  %tmp45 = phi i64 [ %tmp43, %bb41 ], [ %tmp21, %bb30 ]
  %tmp46 = inttoptr i64 %tmp45 to %struct.barney*
  %tmp47 = getelementptr inbounds %struct.barney, %struct.barney* %tmp46, i64 %tmp24, i32 0
  %tmp48 = load i8, i8* %tmp47, align 1
  ; Top bit clear -> step of 1 (see the phi in %bb54); otherwise %bb50.
  %tmp49 = icmp sgt i8 %tmp48, -1
  br i1 %tmp49, label %bb54, label %bb50

; Same ctlz(~byte) computation as %bb35, for the byte loaded in %bb44.
bb50:                                             ; preds = %bb44
  %tmp51 = xor i8 %tmp48, -1
  %tmp52 = tail call i8 @llvm.ctlz.i8(i8 %tmp51, i1 false), !range !19
  %tmp53 = zext i8 %tmp52 to i64
  br label %bb54

; Join point: %tmp55 is the step chosen by whichever path ran.
bb54:                                             ; preds = %bb50, %bb44, %bb39, %bb26
  %tmp55 = phi i64 [ %tmp29, %bb26 ], [ %tmp40, %bb39 ], [ 1, %bb44 ], [ %tmp53, %bb50 ]
  ; Next offset.
  %tmp56 = add i64 %tmp55, %tmp24
  ; Checked increment of the iteration counter; overflow goes to the trap block.
  %tmp57 = tail call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %tmp23, i64 1)
  %tmp58 = extractvalue { i64, i1 } %tmp57, 1
  br i1 %tmp58, label %bb66, label %bb59, !prof !16, !misexpect !18

; Loop latch: continue while the next offset is signed-less-than the bound.
bb59:                                             ; preds = %bb54
  %tmp60 = extractvalue { i64, i1 } %tmp57, 0
  %tmp61 = icmp slt i64 %tmp56, %tmp8
  br i1 %tmp61, label %bb22, label %bb62, !prof !20, !misexpect !17

; Loop exit: the incremented counter is the return value.
bb62:                                             ; preds = %bb59
  %tmp63 = extractvalue { i64, i1 } %tmp57, 0
  br label %bb64

bb64:                                             ; preds = %bb62, %bb
  %tmp65 = phi i64 [ 0, %bb ], [ %tmp63, %bb62 ]
  ret i64 %tmp65

; Counter overflow: empty side-effecting inline asm followed by a trap.
bb66:                                             ; preds = %bb54
  tail call void asm sideeffect "", "n"(i32 0) #5
  tail call void @llvm.trap()
  unreachable
}
```

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D89479/new/

https://reviews.llvm.org/D89479