[PATCH] D127392: [AggressiveInstCombine] Combine consecutive loads which are being merged to form a wider load.
Biplob Mishra via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 27 05:09:59 PDT 2022
bipmis added a comment.
In D127392#3815730 <https://reviews.llvm.org/D127392#3815730>, @aeubanks wrote:
> it ended up being a function that we always compile with -O3...
>
> anyway, reduced test case
>
> $ cat /tmp/a.ll
> target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> define i64 @eggs(ptr noundef readonly %arg) {
> %tmp3 = load i8, ptr %arg, align 1
> %tmp4 = getelementptr inbounds i8, ptr %arg, i64 1
> %tmp5 = load i8, ptr %tmp4, align 1
> %tmp6 = getelementptr inbounds i8, ptr %arg, i64 2
> %tmp7 = load i8, ptr %tmp6, align 1
> %tmp8 = getelementptr inbounds i8, ptr %arg, i64 3
> %tmp9 = load i8, ptr %tmp8, align 1
> %tmp10 = getelementptr inbounds i8, ptr %arg, i64 4
> %tmp11 = load i8, ptr %tmp10, align 1
> %tmp12 = getelementptr inbounds i8, ptr %arg, i64 5
> %tmp13 = load i8, ptr %tmp12, align 1
> %tmp14 = getelementptr inbounds i8, ptr %arg, i64 6
> %tmp15 = load i8, ptr %tmp14, align 1
> %tmp16 = getelementptr inbounds i8, ptr %arg, i64 7
> %tmp17 = load i8, ptr %tmp16, align 1
> %tmp18 = zext i8 %tmp17 to i64
> %tmp19 = shl nuw i64 %tmp18, 56
> %tmp20 = zext i8 %tmp15 to i64
> %tmp21 = shl nuw nsw i64 %tmp20, 48
> %tmp22 = or i64 %tmp19, %tmp21
> %tmp23 = zext i8 %tmp13 to i64
> %tmp24 = shl nuw nsw i64 %tmp23, 40
> %tmp25 = or i64 %tmp22, %tmp24
> %tmp26 = zext i8 %tmp11 to i64
> %tmp27 = shl nuw nsw i64 %tmp26, 32
> %tmp28 = or i64 %tmp25, %tmp27
> %tmp29 = zext i8 %tmp9 to i64
> %tmp30 = shl nuw nsw i64 %tmp29, 24
> %tmp31 = or i64 %tmp28, %tmp30
> %tmp32 = zext i8 %tmp7 to i64
> %tmp33 = shl nuw nsw i64 %tmp32, 16
> %tmp34 = zext i8 %tmp5 to i64
> %tmp35 = shl nuw nsw i64 %tmp34, 8
> %tmp36 = or i64 %tmp31, %tmp33
> %tmp37 = zext i8 %tmp3 to i64
> %tmp38 = or i64 %tmp36, %tmp35
> %tmp39 = or i64 %tmp38, %tmp37
> ret i64 %tmp39
> }
> $ ./build/rel/bin/opt -passes=aggressive-instcombine -S /tmp/a.ll
> ; ModuleID = '/tmp/b.ll'
> source_filename = "/tmp/a.ll"
> target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> define i64 @eggs(ptr noundef readonly %arg) {
> %tmp3 = load i8, ptr %arg, align 1
> %tmp4 = getelementptr inbounds i8, ptr %arg, i64 1
> %tmp5 = load i32, ptr %tmp4, align 1
> %1 = zext i32 %tmp5 to i64
> %2 = shl i64 %1, 8
> %tmp37 = zext i8 %tmp3 to i64
> %tmp39 = or i64 %2, %tmp37
> ret i64 %tmp39
> }
>
> that does look wrong, it looks like it should be optimized to `load i64` rather than `zext(load i32 (%a + 1)) | zext(load i8 %a)`
Thanks for the test case.
In this patch we have implemented the forward load merge, as the test example shows. However, I also had to handle the child node "or(load, load)", because InstCombine currently reverses its operand order, as seen in the short sketch and the full dumps below; that case needed a corrected check.
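In compact form, the check has to accept both operand orders of the innermost or. A minimal sketch (hypothetical value names, two-byte case only):
define i32 @or_operand_orders(ptr %p) {
entry:
%b0 = load i8, ptr %p, align 1
%z0 = zext i8 %b0 to i32
%p1 = getelementptr inbounds i8, ptr %p, i64 1
%b1 = load i8, ptr %p1, align 1
%z1 = zext i8 %b1 to i32
%s1 = shl i32 %z1, 8
; operand order as emitted by the frontend: or(zext(load), shl(zext(load), 8))
%or.src = or i32 %z0, %s1
; operand order after InstCombine canonicalization: or(shl(zext(load), 8), zext(load))
%or.canon = or i32 %s1, %z0
; both forms describe the same two-byte load, so the match must accept either
ret i32 %or.canon
}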
define i32 @Load32(ptr noundef %ptr) local_unnamed_addr #0 {
entry:
%0 = load i8, ptr %ptr, align 1, !tbaa !5
%conv = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %ptr, i64 1
%1 = load i8, ptr %arrayidx1, align 1, !tbaa !5
%conv2 = zext i8 %1 to i32
%shl = shl i32 %conv2, 8
** %or = or i32 %conv, %shl**
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 2
%2 = load i8, ptr %arrayidx3, align 1, !tbaa !5
%conv4 = zext i8 %2 to i32
%shl5 = shl i32 %conv4, 16
%or6 = or i32 %or, %shl5
%arrayidx7 = getelementptr inbounds i8, ptr %ptr, i64 3
%3 = load i8, ptr %arrayidx7, align 1, !tbaa !5
%conv8 = zext i8 %3 to i32
%shl9 = shl i32 %conv8, 24
%or10 = or i32 %or6, %shl9
ret i32 %or10
}
*** IR Dump After InstCombinePass on Load32 ***
; Function Attrs: nounwind uwtable
define i32 @Load32(ptr noundef %ptr) local_unnamed_addr #0 {
entry:
%0 = load i8, ptr %ptr, align 1, !tbaa !5
%conv = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %ptr, i64 1
%1 = load i8, ptr %arrayidx1, align 1, !tbaa !5
%conv2 = zext i8 %1 to i32
%shl = shl nuw nsw i32 %conv2, 8
** %or = or i32 %shl, %conv**
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 2
%2 = load i8, ptr %arrayidx3, align 1, !tbaa !5
%conv4 = zext i8 %2 to i32
%shl5 = shl nuw nsw i32 %conv4, 16
%or6 = or i32 %or, %shl5
%arrayidx7 = getelementptr inbounds i8, ptr %ptr, i64 3
%3 = load i8, ptr %arrayidx7, align 1, !tbaa !5
%conv8 = zext i8 %3 to i32
%shl9 = shl nuw i32 %conv8, 24
%or10 = or i32 %or6, %shl9
ret i32 %or10
}
The full implementation of the reverse load-merge pattern is planned as a follow-up, so for now only the lowest-node loads are merged, not the others; the fully merged end state we are working towards is sketched below. I am updating the patch; it would be great if you could test it.
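For completeness, once the reverse pattern is also handled, the reduced case above should fold all the way down to a single wide load, roughly like this (a sketch, assuming the end state @aeubanks describes, i.e. one unaligned i64 load):
define i64 @eggs(ptr noundef readonly %arg) {
%wide = load i64, ptr %arg, align 1
ret i64 %wide
}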
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D127392/new/
https://reviews.llvm.org/D127392