[llvm] [LowerMemIntrinsics] Avoid udiv/urem when type size is a power of 2 (PR #81238)

Fri Feb 9 01:57:51 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

See #64620 - does not fix the issue but improves the generated code a bit.

---

Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81238.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+20-4) 
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+107-62) 


``````````diff

diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index c75de8687879d7..66290b413e7081 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <optional>
 
@@ -155,6 +156,22 @@ void llvm::createMemCpyLoopKnownSize(
          "Bytes copied should match size in the call!");
 }
 
+// \returns \p Len udiv \p OpSize, checking for optimization opportunities.
+static Value* getRuntimeLoopCount(const DataLayout &DL, IRBuilderBase &B, Value* Len, Value* OpSize, unsigned OpSizeVal) {
+  // For powers of 2, we can lshr by log2 instead of using udiv.
+  if(isPowerOf2_32(OpSizeVal))
+    return B.CreateLShr(Len, Log2_32(OpSizeVal));
+  return B.CreateUDiv(Len, OpSize);
+}
+
+// \returns \p Len urem \p OpSize, checking for optimization opportunities.
+static Value* getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B, Value* Len, Value* OpSize, unsigned OpSizeVal) {
+  // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
+  if(isPowerOf2_32(OpSizeVal))
+    return B.CreateAnd(Len, OpSizeVal-1);
+  return B.CreateURem(Len, OpSize);
+}
+
 void llvm::createMemCpyLoopUnknownSize(
     Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
     Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
@@ -195,8 +212,8 @@ void llvm::createMemCpyLoopUnknownSize(
   bool LoopOpIsInt8 = LoopOpType == Int8Type;
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
   Value *RuntimeLoopCount = LoopOpIsInt8 ?
-                            CopyLen :
-                            PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+                            CopyLen : getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+
   BasicBlock *LoopBB =
       BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
   IRBuilder<> LoopBuilder(LoopBB);
@@ -239,8 +256,7 @@ void llvm::createMemCpyLoopUnknownSize(
     assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
            "Store size is expected to match type size");
 
-    // Add in the
-    Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+    Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
     Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
 
     // Loop body for the residual copy.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 8018e0cbf1cc39..f88050533f7183 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -189,8 +189,8 @@ define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1)
 
 define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @variable_memcpy_caller0(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -225,8 +225,8 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad
 
 define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @variable_memcpy_caller1(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -261,8 +261,8 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad
 
 define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
 ; OPT-LABEL: @memcpy_multi_use_one_function(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
@@ -286,8 +286,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
 ; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
 ; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
 ; OPT:       post-loop-memcpy-expansion1:
-; OPT-NEXT:    [[TMP16:%.*]] = udiv i64 [[M:%.*]], 16
-; OPT-NEXT:    [[TMP17:%.*]] = urem i64 [[M]], 16
+; OPT-NEXT:    [[TMP16:%.*]] = lshr i64 [[M:%.*]], 4
+; OPT-NEXT:    [[TMP17:%.*]] = and i64 [[M]], 15
 ; OPT-NEXT:    [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
 ; OPT-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP16]], 0
 ; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -326,8 +326,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
 
 define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -363,8 +363,8 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(
 ; One of the uses in the function should be expanded, the other left alone.
 define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
 ; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
-; MAX1024-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; MAX1024-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; MAX1024-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; MAX1024-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; MAX1024-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -395,8 +395,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
 ; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
-; ALL-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; ALL-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; ALL-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; ALL-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; ALL-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -909,8 +909,8 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
 
 define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -945,8 +945,8 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 2
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 2
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 1
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -981,8 +981,8 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT:    [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N]], 15
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1017,8 +1017,8 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
 
 define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1053,8 +1053,8 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 2
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 2
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 1
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 1
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1089,8 +1089,8 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1125,8 +1125,8 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
 
 define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1161,8 +1161,8 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
 
 define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
 ; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT:    [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT:    [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT:    [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N]], 7
 ; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
 ; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1488,9 +1488,9 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope !0
+; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !0
+; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
 ; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1511,9 +1511,9 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !3
+; ALL-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias !3
+; ALL-NEXT:    store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
 ; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1561,9 +1561,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 ; ALL:       load-store-loop:
 ; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !6
+; ALL-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
 ; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias !6
+; ALL-NEXT:    store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
 ; ALL-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 1
 ; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
 ; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1576,17 +1576,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
 
 define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
 ; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; MAX1024-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; MAX1024-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
 ; MAX1024-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
 ; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
 ; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
 ; MAX1024:       loop-memcpy-expansion:
 ; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
 ; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !0
+; MAX1024-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META0:![0-9]+]]
 ; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !0
+; MAX1024-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META0]]
 ; MAX1024-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 1
 ; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
 ; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1594,9 +1594,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
 ; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
 ; MAX1024-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
 ; MAX1024-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
-; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !0
+; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]]
 ; MAX1024-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !0
+; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]]
 ; MAX1024-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
 ; MAX1024-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
 ; MAX1024-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1607,17 +1607,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
 ; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
 ;
 ; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT:    [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; ALL-NEXT:    [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT:    [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; ALL-NEXT:    [[TMP2:%.*]] = and i32 [[SIZE]], 7
 ; ALL-NEXT:    [[TMP3:%.*]] = sub i32 [[SIZ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/81238