[llvm] bd87a24 - [CGP] Add generic TargetLowering::shouldAlignPointerArgs() implementation

Alex Richardson via llvm-commits <llvm-commits@lists.llvm.org>
Thu Feb 9 02:14:11 PST 2023


Author: Alex Richardson
Date: 2023-02-09T10:11:40Z
New Revision: bd87a2449da0c82e63cebdf9c131c54a5472e3a7

URL: https://github.com/llvm/llvm-project/commit/bd87a2449da0c82e63cebdf9c131c54a5472e3a7
DIFF: https://github.com/llvm/llvm-project/commit/bd87a2449da0c82e63cebdf9c131c54a5472e3a7.diff

LOG: [CGP] Add generic TargetLowering::shouldAlignPointerArgs() implementation

This function was originally added for ARM targets, but aligning the
global/stack pointer arguments passed to memcpy/memmove/memset can improve
code size and performance for any target where unaligned accesses are
slow. This change adds a generic implementation that raises the alignment
to the pointer size whenever unaligned accesses are slow or unsupported.
Review D134168 suggests that this significantly improves performance on
synthetic benchmarks such as Dhrystone on RV32, since it avoids calls to
memcpy().
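
As an illustration (a minimal C++ snippet mirroring the new CodeGenPrepare
test added below; not part of the patch): given byte-aligned objects such as

  static char dst[45];
  void copy_str() {
    // 31-byte copy; both dst and the string literal start out align 1.
    __builtin_memcpy(dst, "THIS IS A LONG STRING THAT SHOULD BE ALIGNED", 31);
  }

once CodeGenPrepare raises both objects to pointer alignment (4 bytes on
RV32, 8 on RV64), the backend can expand the copy inline with word-sized
loads and stores instead of emitting a call to memcpy().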

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D134282

Added: 
    llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/CodeGenPrepare.cpp
    llvm/lib/CodeGen/TargetLoweringBase.cpp
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/test/CodeGen/RISCV/memcpy-inline.ll
    llvm/test/CodeGen/WebAssembly/bulk-memory.ll
    llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
    llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2046f4b8a0854..567121933da1f 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1932,10 +1932,10 @@ class TargetLoweringBase {
   /// the object whose address is being passed. If so then MinSize is set to the
   /// minimum size the object must be to be aligned and PrefAlign is set to the
   /// preferred alignment.
-  virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/,
-                                      Align & /*PrefAlign*/) const {
-    return false;
-  }
+  virtual bool
+  shouldUpdatePointerArgAlignment(const CallInst *CI, unsigned &MinSize,
+                                  Align &PrefAlign,
+                                  const TargetTransformInfo &TTI) const;
 
   //===--------------------------------------------------------------------===//
   /// \name Helpers for TargetTransformInfo implementations

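With the default now out of line (see the TargetLoweringBase.cpp hunk
below), targets override the hook only to opt out or to tune the
heuristic. A hedged sketch of such an override (MyTargetLowering is
hypothetical; only the hook signature comes from this patch, and the
numbers mirror the ARM override further down):

  #include "llvm/CodeGen/TargetLowering.h"
  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  class MyTargetLowering : public TargetLowering {
  public:
    bool shouldUpdatePointerArgAlignment(
        const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
        const TargetTransformInfo &TTI) const override {
      // Only realign pointers passed to memory intrinsics.
      if (!isa<MemIntrinsic>(CI))
        return false;
      MinSize = 8;          // Skip objects smaller than 8 bytes.
      PrefAlign = Align(8); // Prefer 8-byte alignment for wide accesses.
      return true;
    }
  };
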
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index dd431cc6f4f5f..e2d1461baa378 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2221,10 +2221,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
   }
 
   // Align the pointer arguments to this call if the target thinks it's a good
-  // idea
+  // idea (generally only useful for memcpy/memmove/memset).
   unsigned MinSize;
   Align PrefAlign;
-  if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+  if (TLI->shouldUpdatePointerArgAlignment(CI, MinSize, PrefAlign, *TTI)) {
     for (auto &Arg : CI->args()) {
       // We want to align both objects whose address is used directly and
       // objects whose address is used in casts and GEPs, though it only makes

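The hunk above is cut off mid-loop; schematically, the remainder strips
casts and GEPs to find the underlying object and raises its alignment once
the object is at least MinSize bytes. A simplified sketch (not the
verbatim CodeGenPrepare code, which also checks that a global's alignment
may legally be increased):

  const DataLayout &DL = CI->getModule()->getDataLayout();
  for (auto &Arg : CI->args()) {
    if (!Arg->getType()->isPointerTy())
      continue;
    Value *Obj = getUnderlyingObject(Arg); // strips casts and GEPs
    if (auto *AI = dyn_cast<AllocaInst>(Obj)) {
      // Raise the alignment of stack objects that are big enough.
      if (DL.getTypeAllocSize(AI->getAllocatedType()) >= MinSize)
        AI->setAlignment(std::max(PrefAlign, AI->getAlign()));
    } else if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
      // Likewise for globals defined in this module.
      if (GV->hasInitializer() &&
          DL.getTypeAllocSize(GV->getValueType()) >= MinSize)
        GV->setAlignment(std::max(PrefAlign, GV->getAlign().valueOrOne()));
    }
  }
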
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 01da7b74b3220..397326792a521 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Casting.h"
@@ -948,6 +949,42 @@ bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS,
   return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
 }
 
+bool TargetLoweringBase::shouldUpdatePointerArgAlignment(
+    const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
+    const TargetTransformInfo &TTI) const {
+  // For now, we only adjust alignment for memcpy/memmove/memset calls.
+  auto *MemCI = dyn_cast<MemIntrinsic>(CI);
+  if (!MemCI)
+    return false;
+  auto AddrSpace = MemCI->getDestAddressSpace();
+  // We assume that scalar register sized values can be loaded/stored
+  // efficiently. If this is not the case for a given target it should override
+  // this function.
+  auto PrefSizeBits =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedSize();
+  PrefAlign = Align(PrefSizeBits / 8);
+  // When building with -Oz, we only increase the alignment if the object is
+  // at least 8 bytes in size to avoid increased stack/global padding.
+  // Otherwise, we require at least PrefAlign bytes to be copied.
+  MinSize = PrefAlign.value();
+  if (CI->getFunction()->hasMinSize())
+    MinSize = std::max(MinSize, 8u);
+
+  // XXX: we could determine the MachineMemOperand flags instead of assuming
+  // load+store (but it probably makes no difference for supported targets).
+  unsigned FastUnalignedAccess = 0;
+  if (allowsMisalignedMemoryAccesses(
+          LLT::scalar(PrefSizeBits), AddrSpace, Align(1),
+          MachineMemOperand::MOStore | MachineMemOperand::MOLoad,
+          &FastUnalignedAccess) &&
+      FastUnalignedAccess) {
+    // If unaligned loads&stores are fast, there is no need to adjust
+    // alignment.
+    return false;
+  }
+  return true; // unaligned accesses are not possible or slow.
+}
+
 void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
   // If the command-line option was specified, ignore this request.
   if (!JumpIsExpensiveOverride.getNumOccurrences())

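Concretely: with 32-bit scalar registers (RV32) this computes PrefAlign =
4 and, for functions built with -Oz, MinSize = 8; with 64-bit registers
(RV64) PrefAlign becomes 8. A standalone worked example of the arithmetic
(plain C++, no LLVM APIs; the register width is assumed from the RISC-V
targets):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t PrefSizeBits = 32;      // TTI scalar register width, RV32
    uint64_t PrefAlign = PrefSizeBits / 8; // -> 4-byte preferred alignment
    uint64_t MinSize = PrefAlign;          // default: copy >= PrefAlign bytes
    const bool HasMinSize = true;          // function compiled with -Oz
    if (HasMinSize)                        // avoid padding tiny objects
      MinSize = std::max<uint64_t>(MinSize, 8);
    assert(PrefAlign == 4 && MinSize == 8);
  }
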
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 6370ac39c264e..8e5a6c94c9280 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1920,8 +1920,9 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
 // source/dest is aligned and the copy size is large enough. We therefore want
 // to align such objects passed to memory intrinsics.
-bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
-                                               Align &PrefAlign) const {
+bool ARMTargetLowering::shouldUpdatePointerArgAlignment(
+    const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
+    const TargetTransformInfo &TTI) const {
   if (!isa<MemIntrinsic>(CI))
     return false;
   MinSize = 8;

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 3bc936b6cce20..e063441dff03f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -572,8 +572,9 @@ class VectorType;
     const TargetRegisterClass *
     getRegClassFor(MVT VT, bool isDivergent = false) const override;
 
-    bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
-                                Align &PrefAlign) const override;
+    bool shouldUpdatePointerArgAlignment(
+        const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
+        const TargetTransformInfo &TTI) const override;
 
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.

diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
index 05cb2a83e1e70..e53e45cf7fc6c 100644
--- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll
@@ -295,50 +295,35 @@ entry:
 }
 
 define void @t6() nounwind {
-; RV32ALIGNED-LABEL: t6:
-; RV32ALIGNED:       # %bb.0: # %entry
-; RV32ALIGNED-NEXT:    addi sp, sp, -16
-; RV32ALIGNED-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32ALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV32ALIGNED-NEXT:    addi a0, a0, %lo(spool.splbuf)
-; RV32ALIGNED-NEXT:    lui a1, %hi(.L.str6)
-; RV32ALIGNED-NEXT:    addi a1, a1, %lo(.L.str6)
-; RV32ALIGNED-NEXT:    li a2, 14
-; RV32ALIGNED-NEXT:    call memcpy@plt
-; RV32ALIGNED-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; RV32ALIGNED-NEXT:    addi sp, sp, 16
-; RV32ALIGNED-NEXT:    ret
+; RV32-LABEL: t6:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lui a0, %hi(spool.splbuf)
+; RV32-NEXT:    li a1, 88
+; RV32-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
+; RV32-NEXT:    lui a1, 361862
+; RV32-NEXT:    addi a1, a1, -1960
+; RV32-NEXT:    sw a1, %lo(spool.splbuf+8)(a0)
+; RV32-NEXT:    lui a1, 362199
+; RV32-NEXT:    addi a1, a1, 559
+; RV32-NEXT:    sw a1, %lo(spool.splbuf+4)(a0)
+; RV32-NEXT:    lui a1, 460503
+; RV32-NEXT:    addi a1, a1, 1071
+; RV32-NEXT:    sw a1, %lo(spool.splbuf)(a0)
+; RV32-NEXT:    ret
 ;
 ; RV64ALIGNED-LABEL: t6:
 ; RV64ALIGNED:       # %bb.0: # %entry
-; RV64ALIGNED-NEXT:    addi sp, sp, -16
-; RV64ALIGNED-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV64ALIGNED-NEXT:    addi a0, a0, %lo(spool.splbuf)
-; RV64ALIGNED-NEXT:    lui a1, %hi(.L.str6)
-; RV64ALIGNED-NEXT:    addi a1, a1, %lo(.L.str6)
-; RV64ALIGNED-NEXT:    li a2, 14
-; RV64ALIGNED-NEXT:    call memcpy@plt
-; RV64ALIGNED-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64ALIGNED-NEXT:    addi sp, sp, 16
+; RV64ALIGNED-NEXT:    li a1, 88
+; RV64ALIGNED-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
+; RV64ALIGNED-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV64ALIGNED-NEXT:    ld a1, %lo(.LCPI6_0)(a1)
+; RV64ALIGNED-NEXT:    lui a2, 361862
+; RV64ALIGNED-NEXT:    addiw a2, a2, -1960
+; RV64ALIGNED-NEXT:    sw a2, %lo(spool.splbuf+8)(a0)
+; RV64ALIGNED-NEXT:    sd a1, %lo(spool.splbuf)(a0)
 ; RV64ALIGNED-NEXT:    ret
 ;
-; RV32UNALIGNED-LABEL: t6:
-; RV32UNALIGNED:       # %bb.0: # %entry
-; RV32UNALIGNED-NEXT:    lui a0, %hi(spool.splbuf)
-; RV32UNALIGNED-NEXT:    li a1, 88
-; RV32UNALIGNED-NEXT:    sh a1, %lo(spool.splbuf+12)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 361862
-; RV32UNALIGNED-NEXT:    addi a1, a1, -1960
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf+8)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 362199
-; RV32UNALIGNED-NEXT:    addi a1, a1, 559
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf+4)(a0)
-; RV32UNALIGNED-NEXT:    lui a1, 460503
-; RV32UNALIGNED-NEXT:    addi a1, a1, 1071
-; RV32UNALIGNED-NEXT:    sw a1, %lo(spool.splbuf)(a0)
-; RV32UNALIGNED-NEXT:    ret
-;
 ; RV64UNALIGNED-LABEL: t6:
 ; RV64UNALIGNED:       # %bb.0: # %entry
 ; RV64UNALIGNED-NEXT:    lui a0, %hi(.L.str6)

diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
index dc29dc81c13ec..96f1d50c53a51 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll
@@ -154,7 +154,7 @@ define void @memset_1024(ptr %dest, i8 %val) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
@@ -171,7 +171,7 @@ define void @memcpy_alloca_src(ptr %dst) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
@@ -188,7 +188,7 @@ define void @memcpy_alloca_dst(ptr %src) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]

diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
index 8ee5f6314381c..a900d4bf2c149 100644
--- a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
+++ b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
@@ -157,7 +157,7 @@ define void @memset_1024(ptr %dest, i8 %val) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
@@ -174,7 +174,7 @@ define void @memcpy_alloca_src(ptr %dst) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
@@ -191,7 +191,7 @@ define void @memcpy_alloca_dst(ptr %src) {
 ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
 ; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
 ; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
-; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
+; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
 ; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
 ; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
 ; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]

diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
index 8decd9397c36a..3d9f0f7495d9d 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/x86_64-irtranslator-struct-return.ll
@@ -134,7 +134,7 @@ define i64 @test_return_i2(i64 %i.coerce) {
   ; ALL:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval
   ; ALL:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i
   ; ALL:   G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store (s64) into %ir.0, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 4), (load (s8) from %ir.2, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 8), (load (s8) from %ir.2, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.3, align 4)
   ; ALL:   $rax = COPY [[LOAD]](s64)
   ; ALL:   RET 0, implicit $rax
@@ -166,9 +166,9 @@ define { i64, i32 } @test_return_i3(i64 %i.coerce0, i32 %i.coerce1) {
   ; ALL:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; ALL:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64)
   ; ALL:   G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.1)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 4), (load (s8) from %ir.3, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 4), (load (s8) from %ir.5, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 8), (load (s8) from %ir.3, align 8)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 8), (load (s8) from %ir.5, align 8)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX3]](p0) :: (dereferenceable load (s64) from %ir.tmp)
   ; ALL:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64)
   ; ALL:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s32) from %ir.tmp + 8, align 8)
@@ -210,7 +210,7 @@ define { i64, i64 } @test_return_i4(i64 %i.coerce0, i64 %i.coerce1) {
   ; ALL:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
   ; ALL:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
   ; ALL:   G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.2, align 4)
-  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 4), (load (s8) from %ir.4, align 4)
+  ; ALL:   G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 8), (load (s8) from %ir.4, align 8)
   ; ALL:   [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.5, align 4)
   ; ALL:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
   ; ALL:   [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s64) from %ir.5 + 8, align 4)

diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll
new file mode 100644
index 0000000000000..5133cc5089b53
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/adjust-memintrin-alignment.ll
@@ -0,0 +1,55 @@
+; RUN: opt -mtriple=riscv32 -data-layout="e-m:e-p:32:32" -S -codegenprepare < %s \
+; RUN:   | FileCheck %s '-D#NEW_ALIGNMENT=4'
+; RUN: opt -mtriple=riscv64 -data-layout="e-m:e-p:64:64" -S -codegenprepare < %s \
+; RUN:   | FileCheck %s '-D#NEW_ALIGNMENT=8'
+
+@str = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align 1
+
+
+declare void @use(ptr %arg)
+
+
+; CHECK: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align [[#NEW_ALIGNMENT]]
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DST:%.*]] = alloca [45 x i8], align [[#NEW_ALIGNMENT]]
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i32(ptr align [[#NEW_ALIGNMENT]] [[DST]], ptr align [[#NEW_ALIGNMENT]] dereferenceable(31) @str, i32 31, i1 false)
+; CHECK-NEXT:    ret void
+
+entry:
+  %dst = alloca [45 x i8], align 1
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr align 1 dereferenceable(31) @str, i32 31, i1 false)
+  ret void
+}
+
+; negative test - check that we don't align objects that are too small
+define void @no_align(ptr %src) {
+; CHECK-LABEL: @no_align(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[SRC:%.*]], i32 31, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca [3 x i8], align 1
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr %src, i32 31, i1 false)
+  ret void
+}
+
+; negative test - check that minsize requires at least 8 byte object size
+define void @no_align_minsize(ptr %src) minsize {
+; CHECK-LABEL: @no_align_minsize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DST:%.*]] = alloca [7 x i8], align 1
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[SRC:%.*]], i32 31, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca [7 x i8], align 1
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr %src, i32 31, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1)
