[llvm] 754d258 - [CGP] Update MemIntrinsic alignment if possible

Alex Richardson via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 17 03:59:54 PST 2022


Author: Alex Richardson
Date: 2022-11-17T11:59:35Z
New Revision: 754d25844a7e5793b1b6523ce223061dba7da7c1

URL: https://github.com/llvm/llvm-project/commit/754d25844a7e5793b1b6523ce223061dba7da7c1
DIFF: https://github.com/llvm/llvm-project/commit/754d25844a7e5793b1b6523ce223061dba7da7c1.diff

LOG: [CGP] Update MemIntrinsic alignment if possible

Previously this was only done when shouldAlignPointerArgs() returned
true, which is currently the case only for ARM targets.
Updating the argument alignment attributes of memcpy/memset intrinsics
when the underlying object has a larger known alignment can be
beneficial even when CGP did not increase the object's alignment itself
(as can be seen from the test changes), so invert the loop and the if
condition.

Differential Revision: https://reviews.llvm.org/D134281
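
To illustrate the effect, here is a minimal, hypothetical IR example
(the @buf/@copy64 names are made up and are not part of this commit's
tests). Both pointers are known to be 16-byte aligned, but the intrinsic
arguments only claim align 1:

    @buf = internal global [64 x i8] zeroinitializer, align 16

    define void @copy64(ptr align 16 %dst) {
      ; getKnownAlignment() can prove 16-byte alignment for both operands
      ; from the parameter attribute and the global's alignment.
      call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dst, ptr align 1 @buf, i64 64, i1 false)
      ret void
    }

    declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

With this change CodeGenPrepare raises both alignment attributes to 16
("ptr align 16 %dst, ptr align 16 @buf") on every target, not only when
shouldAlignPointerArgs() returns true, which lets the memcpy lowering
use wider aligned loads and stores (as the updated X86 and AMDGPU tests
show).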

Added: 
    

Modified: 
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
    llvm/test/CodeGen/X86/mcu-abi.ll
    llvm/test/CodeGen/X86/memset-2.ll
    llvm/test/CodeGen/X86/memset64-on-x86-32.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index cf2b32c74eb5..355f5a217a9a 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2252,19 +2252,19 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
           DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2)
         GV->setAlignment(PrefAlign);
     }
-    // If this is a memcpy (or similar) then we may be able to improve the
-    // alignment
-    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
-      Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
-      MaybeAlign MIDestAlign = MI->getDestAlign();
-      if (!MIDestAlign || DestAlign > *MIDestAlign)
-        MI->setDestAlignment(DestAlign);
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
-        MaybeAlign MTISrcAlign = MTI->getSourceAlign();
-        Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
-        if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
-          MTI->setSourceAlignment(SrcAlign);
-      }
+  }
+  // If this is a memcpy (or similar) then we may be able to improve the
+  // alignment.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+    Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+    MaybeAlign MIDestAlign = MI->getDestAlign();
+    if (!MIDestAlign || DestAlign > *MIDestAlign)
+      MI->setDestAlignment(DestAlign);
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+      MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+      Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+      if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
+        MTI->setSourceAlignment(SrcAlign);
     }
   }
 

diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 77c2205018f9..ee1bc4376a1c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,16 +19,16 @@ define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    ds_read_u8 v1, v0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
+; GCN-NEXT:    ds_write_b8 v0, v1
 ; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    ds_write_b8 v0, v3
-; GCN-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v1 offset:5
 ; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: @test(

diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
index fbec4dc813c2..53c228943d91 100644
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -64,13 +64,14 @@ entry:
 define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 {
 ; CHECK-LABEL: ret_large_struct:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl $48, %ecx
-; CHECK-NEXT:    calll memcpy
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $12, %ecx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
 ; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
   call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false)

diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
index 72ef36fc8273..a4fbbc570047 100644
--- a/llvm/test/CodeGen/X86/memset-2.ll
+++ b/llvm/test/CodeGen/X86/memset-2.ll
@@ -1,31 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
 
-define fastcc void @t1() nounwind {
+define fastcc void @t1(ptr nocapture %s) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    pushl $188
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 0, i32 188, i1 false)
   unreachable
 }
 
-define fastcc void @t2(i8 signext %c) nounwind {
+define fastcc void @t2(ptr nocapture %s, i8 signext %c) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    pushl $76
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr undef, i8 %c, i32 76, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 %c, i32 76, i1 false)
   unreachable
 }
 

diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
index 7b939dfb4259..c6eecdcdf99c 100644
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -3,55 +3,57 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown   -mattr=ssse3   | FileCheck %s --check-prefix=SLOW_32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3   | FileCheck %s --check-prefix=SLOW_64
 
-define void @bork() nounwind {
+define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
-; FAST-NEXT:    movups %xmm0, 64
-; FAST-NEXT:    movups %xmm0, 48
-; FAST-NEXT:    movups %xmm0, 32
-; FAST-NEXT:    movups %xmm0, 16
-; FAST-NEXT:    movups %xmm0, 0
+; FAST-NEXT:    movups %xmm0, 64(%eax)
+; FAST-NEXT:    movups %xmm0, 48(%eax)
+; FAST-NEXT:    movups %xmm0, 32(%eax)
+; FAST-NEXT:    movups %xmm0, 16(%eax)
+; FAST-NEXT:    movups %xmm0, (%eax)
 ; FAST-NEXT:    retl
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl $0, 4
-; SLOW_32-NEXT:    movl $0, 0
-; SLOW_32-NEXT:    movl $0, 12
-; SLOW_32-NEXT:    movl $0, 8
-; SLOW_32-NEXT:    movl $0, 20
-; SLOW_32-NEXT:    movl $0, 16
-; SLOW_32-NEXT:    movl $0, 28
-; SLOW_32-NEXT:    movl $0, 24
-; SLOW_32-NEXT:    movl $0, 36
-; SLOW_32-NEXT:    movl $0, 32
-; SLOW_32-NEXT:    movl $0, 44
-; SLOW_32-NEXT:    movl $0, 40
-; SLOW_32-NEXT:    movl $0, 52
-; SLOW_32-NEXT:    movl $0, 48
-; SLOW_32-NEXT:    movl $0, 60
-; SLOW_32-NEXT:    movl $0, 56
-; SLOW_32-NEXT:    movl $0, 68
-; SLOW_32-NEXT:    movl $0, 64
-; SLOW_32-NEXT:    movl $0, 76
-; SLOW_32-NEXT:    movl $0, 72
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW_32-NEXT:    movl $0, 4(%eax)
+; SLOW_32-NEXT:    movl $0, (%eax)
+; SLOW_32-NEXT:    movl $0, 12(%eax)
+; SLOW_32-NEXT:    movl $0, 8(%eax)
+; SLOW_32-NEXT:    movl $0, 20(%eax)
+; SLOW_32-NEXT:    movl $0, 16(%eax)
+; SLOW_32-NEXT:    movl $0, 28(%eax)
+; SLOW_32-NEXT:    movl $0, 24(%eax)
+; SLOW_32-NEXT:    movl $0, 36(%eax)
+; SLOW_32-NEXT:    movl $0, 32(%eax)
+; SLOW_32-NEXT:    movl $0, 44(%eax)
+; SLOW_32-NEXT:    movl $0, 40(%eax)
+; SLOW_32-NEXT:    movl $0, 52(%eax)
+; SLOW_32-NEXT:    movl $0, 48(%eax)
+; SLOW_32-NEXT:    movl $0, 60(%eax)
+; SLOW_32-NEXT:    movl $0, 56(%eax)
+; SLOW_32-NEXT:    movl $0, 68(%eax)
+; SLOW_32-NEXT:    movl $0, 64(%eax)
+; SLOW_32-NEXT:    movl $0, 76(%eax)
+; SLOW_32-NEXT:    movl $0, 72(%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
 ; SLOW_64:       # %bb.0:
-; SLOW_64-NEXT:    movq $0, 72
-; SLOW_64-NEXT:    movq $0, 64
-; SLOW_64-NEXT:    movq $0, 56
-; SLOW_64-NEXT:    movq $0, 48
-; SLOW_64-NEXT:    movq $0, 40
-; SLOW_64-NEXT:    movq $0, 32
-; SLOW_64-NEXT:    movq $0, 24
-; SLOW_64-NEXT:    movq $0, 16
-; SLOW_64-NEXT:    movq $0, 8
-; SLOW_64-NEXT:    movq $0, 0
+; SLOW_64-NEXT:    movq $0, 72(%rdi)
+; SLOW_64-NEXT:    movq $0, 64(%rdi)
+; SLOW_64-NEXT:    movq $0, 56(%rdi)
+; SLOW_64-NEXT:    movq $0, 48(%rdi)
+; SLOW_64-NEXT:    movq $0, 40(%rdi)
+; SLOW_64-NEXT:    movq $0, 32(%rdi)
+; SLOW_64-NEXT:    movq $0, 24(%rdi)
+; SLOW_64-NEXT:    movq $0, 16(%rdi)
+; SLOW_64-NEXT:    movq $0, 8(%rdi)
+; SLOW_64-NEXT:    movq $0, (%rdi)
 ; SLOW_64-NEXT:    retq
-  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
   ret void
 }
 

