[PATCH] D127392: [AggressiveInstCombine] Combine consecutive loads which are being merged to form a wider load.
Biplob Mishra via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 27 05:09:59 PDT 2022
bipmis added a comment.
In D127392#3815730 <https://reviews.llvm.org/D127392#3815730>, @aeubanks wrote:
> it ended up being a function that we always compile with -O3...
>
> anyway, reduced test case
>
> $ cat /tmp/a.ll
> target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> define i64 @eggs(ptr noundef readonly %arg) {
> %tmp3 = load i8, ptr %arg, align 1
> %tmp4 = getelementptr inbounds i8, ptr %arg, i64 1
> %tmp5 = load i8, ptr %tmp4, align 1
> %tmp6 = getelementptr inbounds i8, ptr %arg, i64 2
> %tmp7 = load i8, ptr %tmp6, align 1
> %tmp8 = getelementptr inbounds i8, ptr %arg, i64 3
> %tmp9 = load i8, ptr %tmp8, align 1
> %tmp10 = getelementptr inbounds i8, ptr %arg, i64 4
> %tmp11 = load i8, ptr %tmp10, align 1
> %tmp12 = getelementptr inbounds i8, ptr %arg, i64 5
> %tmp13 = load i8, ptr %tmp12, align 1
> %tmp14 = getelementptr inbounds i8, ptr %arg, i64 6
> %tmp15 = load i8, ptr %tmp14, align 1
> %tmp16 = getelementptr inbounds i8, ptr %arg, i64 7
> %tmp17 = load i8, ptr %tmp16, align 1
> %tmp18 = zext i8 %tmp17 to i64
> %tmp19 = shl nuw i64 %tmp18, 56
> %tmp20 = zext i8 %tmp15 to i64
> %tmp21 = shl nuw nsw i64 %tmp20, 48
> %tmp22 = or i64 %tmp19, %tmp21
> %tmp23 = zext i8 %tmp13 to i64
> %tmp24 = shl nuw nsw i64 %tmp23, 40
> %tmp25 = or i64 %tmp22, %tmp24
> %tmp26 = zext i8 %tmp11 to i64
> %tmp27 = shl nuw nsw i64 %tmp26, 32
> %tmp28 = or i64 %tmp25, %tmp27
> %tmp29 = zext i8 %tmp9 to i64
> %tmp30 = shl nuw nsw i64 %tmp29, 24
> %tmp31 = or i64 %tmp28, %tmp30
> %tmp32 = zext i8 %tmp7 to i64
> %tmp33 = shl nuw nsw i64 %tmp32, 16
> %tmp34 = zext i8 %tmp5 to i64
> %tmp35 = shl nuw nsw i64 %tmp34, 8
> %tmp36 = or i64 %tmp31, %tmp33
> %tmp37 = zext i8 %tmp3 to i64
> %tmp38 = or i64 %tmp36, %tmp35
> %tmp39 = or i64 %tmp38, %tmp37
> ret i64 %tmp39
> }
> $ ./build/rel/bin/opt -passes=aggressive-instcombine -S /tmp/a.ll
> ; ModuleID = '/tmp/b.ll'
> source_filename = "/tmp/a.ll"
> target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-grtev4-linux-gnu"
>
> define i64 @eggs(ptr noundef readonly %arg) {
> %tmp3 = load i8, ptr %arg, align 1
> %tmp4 = getelementptr inbounds i8, ptr %arg, i64 1
> %tmp5 = load i32, ptr %tmp4, align 1
> %1 = zext i32 %tmp5 to i64
> %2 = shl i64 %1, 8
> %tmp37 = zext i8 %tmp3 to i64
> %tmp39 = or i64 %2, %tmp37
> ret i64 %tmp39
> }
>
> that does look wrong, it looks like it should be optimized to `load i64` rather than `zext(load i32 (%a + 1)) | zext(load i8 %a)`
Thanks for the test case.
In this patch we have implemented the forward load merge, as the test example shows. However, I also had to handle the child node "or(load, load)", because InstCombine currently reverses its operand order, as seen in the short sketch and the full dumps below; that case needed a corrected check.
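In compact form, the check has to accept both operand orders of the innermost or. A minimal sketch (hypothetical value names, two-byte case only):
define i32 @or_operand_orders(ptr %p) {
entry:
%b0 = load i8, ptr %p, align 1
%z0 = zext i8 %b0 to i32
%p1 = getelementptr inbounds i8, ptr %p, i64 1
%b1 = load i8, ptr %p1, align 1
%z1 = zext i8 %b1 to i32
%s1 = shl i32 %z1, 8
; operand order as emitted by the frontend: or(zext(load), shl(zext(load), 8))
%or.src = or i32 %z0, %s1
; operand order after InstCombine canonicalization: or(shl(zext(load), 8), zext(load))
%or.canon = or i32 %s1, %z0
; both forms describe the same two-byte load, so the match must accept either
ret i32 %or.canon
}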
define i32 @Load32(ptr noundef %ptr) local_unnamed_addr #0 {
entry:
%0 = load i8, ptr %ptr, align 1, !tbaa !5
%conv = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %ptr, i64 1
%1 = load i8, ptr %arrayidx1, align 1, !tbaa !5
%conv2 = zext i8 %1 to i32
%shl = shl i32 %conv2, 8
** %or = or i32 %conv, %shl**
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 2
%2 = load i8, ptr %arrayidx3, align 1, !tbaa !5
%conv4 = zext i8 %2 to i32
%shl5 = shl i32 %conv4, 16
%or6 = or i32 %or, %shl5
%arrayidx7 = getelementptr inbounds i8, ptr %ptr, i64 3
%3 = load i8, ptr %arrayidx7, align 1, !tbaa !5
%conv8 = zext i8 %3 to i32
%shl9 = shl i32 %conv8, 24
%or10 = or i32 %or6, %shl9
ret i32 %or10
}
*** IR Dump After InstCombinePass on Load32 ***
; Function Attrs: nounwind uwtable
define i32 @Load32(ptr noundef %ptr) local_unnamed_addr #0 {
entry:
%0 = load i8, ptr %ptr, align 1, !tbaa !5
%conv = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %ptr, i64 1
%1 = load i8, ptr %arrayidx1, align 1, !tbaa !5
%conv2 = zext i8 %1 to i32
%shl = shl nuw nsw i32 %conv2, 8
** %or = or i32 %shl, %conv**
%arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 2
%2 = load i8, ptr %arrayidx3, align 1, !tbaa !5
%conv4 = zext i8 %2 to i32
%shl5 = shl nuw nsw i32 %conv4, 16
%or6 = or i32 %or, %shl5
%arrayidx7 = getelementptr inbounds i8, ptr %ptr, i64 3
%3 = load i8, ptr %arrayidx7, align 1, !tbaa !5
%conv8 = zext i8 %3 to i32
%shl9 = shl nuw i32 %conv8, 24
%or10 = or i32 %or6, %shl9
ret i32 %or10
}
The full implementation of the reverse load-merge pattern is planned as a follow-up, so for now only the lowest-node loads are merged, not the others; the fully merged end state we are working towards is sketched below. I am updating the patch; it would be great if you could test it.
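For completeness, once the reverse pattern is also handled, the reduced case above should fold all the way down to a single wide load, roughly like this (a sketch, assuming the end state @aeubanks describes, i.e. one unaligned i64 load):
define i64 @eggs(ptr noundef readonly %arg) {
%wide = load i64, ptr %arg, align 1
ret i64 %wide
}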
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D127392/new/
https://reviews.llvm.org/D127392