[llvm] [LowerMemIntrinsics] Avoid udiv/urem when type size is a power of 2 (PR #81238)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 01:57:51 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
See #<!-- -->64620. This patch does not fix that issue, but it improves the generated code a bit.
---
Patch is 32.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81238.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+20-4)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+107-62)
``````````diff
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index c75de8687879d7..66290b413e7081 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <optional>
@@ -155,6 +156,22 @@ void llvm::createMemCpyLoopKnownSize(
"Bytes copied should match size in the call!");
}
+// \returns \p Len udiv \p OpSize, checking for optimization opportunities.
+static Value* getRuntimeLoopCount(const DataLayout &DL, IRBuilderBase &B, Value* Len, Value* OpSize, unsigned OpSizeVal) {
+ // For powers of 2, we can lshr by log2 instead of using udiv.
+ if(isPowerOf2_32(OpSizeVal))
+ return B.CreateLShr(Len, Log2_32(OpSizeVal));
+ return B.CreateUDiv(Len, OpSize);
+}
+
+// \returns \p Len urem \p OpSize, checking for optimization opportunities.
+static Value* getRuntimeLoopRemainder(const DataLayout &DL, IRBuilderBase &B, Value* Len, Value* OpSize, unsigned OpSizeVal) {
+ // For powers of 2, we can and by (OpSizeVal - 1) instead of using urem.
+ if(isPowerOf2_32(OpSizeVal))
+ return B.CreateAnd(Len, OpSizeVal-1);
+ return B.CreateURem(Len, OpSize);
+}
+
void llvm::createMemCpyLoopUnknownSize(
Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen,
Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile,
@@ -195,8 +212,8 @@ void llvm::createMemCpyLoopUnknownSize(
bool LoopOpIsInt8 = LoopOpType == Int8Type;
ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
Value *RuntimeLoopCount = LoopOpIsInt8 ?
- CopyLen :
- PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ CopyLen : getRuntimeLoopCount(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
+
BasicBlock *LoopBB =
BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
IRBuilder<> LoopBuilder(LoopBB);
@@ -239,8 +256,7 @@ void llvm::createMemCpyLoopUnknownSize(
assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
"Store size is expected to match type size");
- // Add in the
- Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen, CILoopOpSize, LoopOpSize);
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
// Loop body for the residual copy.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 8018e0cbf1cc39..f88050533f7183 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -189,8 +189,8 @@ define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1)
define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -225,8 +225,8 @@ define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr ad
define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -261,8 +261,8 @@ define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr ad
define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
@@ -286,8 +286,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT: post-loop-memcpy-expansion1:
-; OPT-NEXT: [[TMP16:%.*]] = udiv i64 [[M:%.*]], 16
-; OPT-NEXT: [[TMP17:%.*]] = urem i64 [[M]], 16
+; OPT-NEXT: [[TMP16:%.*]] = lshr i64 [[M:%.*]], 4
+; OPT-NEXT: [[TMP17:%.*]] = and i64 [[M]], 15
; OPT-NEXT: [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
; OPT-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], 0
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -326,8 +326,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0,
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -363,8 +363,8 @@ define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(
; One of the uses in the function should be expanded, the other left alone.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
-; MAX1024-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; MAX1024-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; MAX1024-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; MAX1024-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; MAX1024-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -395,8 +395,8 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspac
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
-; ALL-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; ALL-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; ALL-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; ALL-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; ALL-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -909,8 +909,8 @@ define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspa
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -945,8 +945,8 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 2
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 2
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -981,8 +981,8 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i64 [[N:%.*]], 16
-; OPT-NEXT: [[TMP2:%.*]] = urem i64 [[N]], 16
+; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[N:%.*]], 4
+; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1017,8 +1017,8 @@ define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrs
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1053,8 +1053,8 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 2
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 2
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 1
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1089,8 +1089,8 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1125,8 +1125,8 @@ define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspa
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1161,8 +1161,8 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
-; OPT-NEXT: [[TMP1:%.*]] = udiv i32 [[N:%.*]], 8
-; OPT-NEXT: [[TMP2:%.*]] = urem i32 [[N]], 8
+; OPT-NEXT: [[TMP1:%.*]] = lshr i32 [[N:%.*]], 3
+; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
@@ -1488,9 +1488,9 @@ define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope !0
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias !0
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1511,9 +1511,9 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !3
+; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
-; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias !3
+; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 16
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1561,9 +1561,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope !6
+; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias !6
+; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 32
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@@ -1576,17 +1576,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3)
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; MAX1024-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; MAX1024-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; MAX1024-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP1]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP5:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP5]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
-; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias !0
+; MAX1024-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP1]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
@@ -1594,9 +1594,9 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
-; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope !0
+; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]]
; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
-; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias !0
+; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1607,17 +1607,17 @@ define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
-; ALL-NEXT: [[TMP1:%.*]] = udiv i32 [[SIZE:%.*]], 8
-; ALL-NEXT: [[TMP2:%.*]] = urem i32 [[SIZE]], 8
+; ALL-NEXT: [[TMP1:%.*]] = lshr i32 [[SIZE:%.*]], 3
+; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE]], 7
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/81238
More information about the llvm-commits
mailing list