[PATCH] D92668: [SLP]Merge reorder and reuse shuffles.

Wed Dec 30 16:27:58 PST 2020

rupprecht added a comment.

Finally got a reasonably sized reduction. It's probably so long because I'm just using clang to build, and if it gets smaller than clang optimizes too much of it away; if I used opt to control the passes I could probably get it smaller. Anyway, hope this is good enough to demonstrate the issue:

  ; ModuleID = './repro.ll'
  source_filename = "repro.ll"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"

  %struct.widget = type { %struct.baz }
  %struct.baz = type { i32 (...)**, i32, i32, i32, i32, i8* }
  %struct.snork = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8*, %struct.spam*, i8*, i8* }
  %struct.spam = type { i8, i8, i8, i32, i32, i32, i32 }
  %struct.zot = type { i8 }
  %struct.wombat = type { i8 }

  define void @wombat(%struct.widget* %arg) local_unnamed_addr align 32 personality i8* undef {
  bb:
    %tmp = alloca float, align 4
    %tmp1 = alloca float, align 4
    %tmp2 = alloca float, align 4
    %tmp3 = alloca float, align 4
    %tmp4 = alloca %struct.snork, align 8
    %tmp7 = getelementptr inbounds %struct.widget, %struct.widget* %arg, i64 0, i32 0, i32 2
    %tmp8 = getelementptr inbounds %struct.widget, %struct.widget* %arg, i64 0, i32 0, i32 1
    %tmp9 = getelementptr inbounds %struct.snork, %struct.snork* %tmp4, i64 0, i32 11
    invoke void bitcast (void ()* @quux to void (%struct.snork*, %struct.zot*)*)(%struct.snork* %tmp4, %struct.zot* null)
            to label %bb11 unwind label %bb47

  bb11:                                             ; preds = %bb
    %tmp12 = load i32, i32* %tmp7, align 4
    %tmp13 = load i32, i32* %tmp8, align 8
    invoke void bitcast (void ()* @wobble to void (%struct.wombat*, float*, float*, float*, float*)*)(%struct.wombat* undef, float* %tmp, float* %tmp1, float* %tmp2, float* %tmp3)
            to label %bb14 unwind label %bb47

  bb14:                                             ; preds = %bb11
    %tmp15 = load float, float* %tmp, align 4
    %tmp16 = load float, float* %tmp1, align 4
    %tmp17 = load float, float* %tmp2, align 4
    %tmp18 = load float, float* %tmp3, align 4
    %tmp20 = load %struct.spam*, %struct.spam** %tmp9, align 8
    %tmp21 = add nsw i32 %tmp12, -1
    %tmp22 = fptosi float %tmp15 to i32
    %tmp23 = icmp slt i32 %tmp22, 0
    %tmp24 = icmp sgt i32 %tmp12, %tmp22
    %tmp25 = select i1 %tmp24, i32 %tmp22, i32 %tmp21
    %tmp26 = select i1 %tmp23, i32 0, i32 %tmp25
    %tmp27 = getelementptr inbounds %struct.spam, %struct.spam* %tmp20, i64 0, i32 3
    store i32 %tmp26, i32* %tmp27, align 8
    %tmp28 = add nsw i32 %tmp13, -1
    %tmp29 = fptosi float %tmp16 to i32
    %tmp30 = icmp slt i32 %tmp29, 0
    %tmp31 = icmp sgt i32 %tmp13, %tmp29
    %tmp32 = select i1 %tmp31, i32 %tmp29, i32 %tmp28
    %tmp33 = select i1 %tmp30, i32 0, i32 %tmp32
    %tmp34 = getelementptr inbounds %struct.spam, %struct.spam* %tmp20, i64 0, i32 4
    store i32 %tmp33, i32* %tmp34, align 4
    %tmp35 = fptosi float %tmp17 to i32
    %tmp36 = icmp slt i32 %tmp35, 0
    %tmp37 = icmp sgt i32 %tmp12, %tmp35
    %tmp38 = select i1 %tmp37, i32 %tmp35, i32 %tmp21
    %tmp39 = select i1 %tmp36, i32 0, i32 %tmp38
    %tmp40 = getelementptr inbounds %struct.spam, %struct.spam* %tmp20, i64 0, i32 5
    store i32 %tmp39, i32* %tmp40, align 8
    %tmp41 = fptosi float %tmp18 to i32
    %tmp42 = icmp slt i32 %tmp41, 0
    %tmp43 = icmp sgt i32 %tmp13, %tmp41
    %tmp44 = select i1 %tmp43, i32 %tmp41, i32 %tmp28
    %tmp45 = select i1 %tmp42, i32 0, i32 %tmp44
    %tmp46 = getelementptr inbounds %struct.spam, %struct.spam* %tmp20, i64 0, i32 6
    store i32 %tmp45, i32* %tmp46, align 4
    ret void

  bb47:                                             ; preds = %bb11, %bb10
    %tmp48 = landingpad { i8*, i32 }
            cleanup
    ret void
  }

  declare void @quux() unnamed_addr

  declare void @wobble() local_unnamed_addr

Building with `clang -c repro.ll -O3 -o optimized.ll -fno-discard-value-names -S -emit-llvm`, before this patch we had:

    %1 = load <2 x i32>, <2 x i32>* %0, align 8
    %reorder_shuffle = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ...
    %shuffle = shufflevector <2 x i32> %reorder_shuffle, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ...
    %2 = add nsw <2 x i32> %reorder_shuffle, <i32 -1, i32 -1>

And after, we have:

    %1 = load <2 x i32>, <2 x i32>* %0, align 8
  ...
    %shuffle = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
  ...
    %2 = add nsw <2 x i32> %1, <i32 -1, i32 -1>

As with the previous snippet,

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D92668/new/

https://reviews.llvm.org/D92668