[llvm] [CodeGen] Improve ExpandMemCmp for more efficient non-register aligned sizes handling (PR #70469)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 27 09:11:26 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-analysis
Author: Igor Kirillov (igogo-x86)
<details>
<summary>Changes</summary>
* Enhance the ExpandMemCmp pass to merge contiguous trailing entries of
LoadSequence into a single load when the combined size is listed in
`AllowedTailExpansions`.
* This aims to reduce the number of basic blocks and to produce better
code when memcmp is called with non-register-aligned sizes.
* Enable this feature for AArch64 for memcmp sizes that are equal to
3, 5, or 6 modulo 8.
Reapplication of https://github.com/llvm/llvm-project/pull/69942 after fixing a bug
---
Patch is 148.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70469.diff
5 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+11)
- (modified) llvm/lib/CodeGen/ExpandMemCmp.cpp (+75-20)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+1)
- (added) llvm/test/CodeGen/AArch64/memcmp.ll (+3005)
- (added) llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll (+881)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..3ec80d99b392b2e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -907,6 +907,17 @@ class TargetTransformInfo {
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
// requires all loads in LoadSizes to be doable in an unaligned way.
bool AllowOverlappingLoads = false;
+
+ // Sometimes, the amount of data that needs to be compared is smaller than
+ // the standard register size, but it cannot be loaded with just one load
+ // instruction. For example, if the size of the memory comparison is 6
+ // bytes, we can handle it more efficiently by loading all 6 bytes in a
+ // single block and generating an 8-byte number, instead of generating two
+ // separate blocks with conditional jumps for 4 and 2 byte loads. This
+ // approach simplifies the process and produces the comparison result as
+ // normal. This array lists the allowed sizes of memcmp tails that can be
+ // merged into one block
+ SmallVector<unsigned, 4> AllowedTailExpansions;
};
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..28e258be226a695 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -117,8 +117,8 @@ class MemCmpExpansion {
Value *Lhs = nullptr;
Value *Rhs = nullptr;
};
- LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
- unsigned OffsetBytes);
+ LoadPair getLoadPair(Type *LoadSizeType, Type *BSwapSizeType,
+ Type *CmpSizeType, unsigned OffsetBytes);
static LoadEntryVector
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -128,6 +128,11 @@ class MemCmpExpansion {
unsigned MaxNumLoads,
unsigned &NumLoadsNonOneByte);
+ static void optimiseLoadSequence(
+ LoadEntryVector &LoadSequence,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ bool IsUsedForZeroCmp);
+
public:
MemCmpExpansion(CallInst *CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
@@ -210,6 +215,37 @@ MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
return LoadSequence;
}
+void MemCmpExpansion::optimiseLoadSequence(
+ LoadEntryVector &LoadSequence,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ bool IsUsedForZeroCmp) {
+ // This part of code attempts to optimize the LoadSequence by merging allowed
+ // subsequences into single loads of allowed sizes from
+ // `MemCmpExpansionOptions::AllowedTailExpansions`. If it is for zero
+ // comparison or if no allowed tail expansions are specified, we exit early.
+ if (IsUsedForZeroCmp || Options.AllowedTailExpansions.empty())
+ return;
+
+ while (LoadSequence.size() >= 2) {
+ auto Last = LoadSequence[LoadSequence.size() - 1];
+ auto PreLast = LoadSequence[LoadSequence.size() - 2];
+
+ // Exit the loop if the two sequences are not contiguous
+ if (PreLast.Offset + PreLast.LoadSize != Last.Offset)
+ break;
+
+ auto LoadSize = Last.LoadSize + PreLast.LoadSize;
+ if (find(Options.AllowedTailExpansions, LoadSize) ==
+ Options.AllowedTailExpansions.end())
+ break;
+
+ // Remove the last two sequences and replace with the combined sequence
+ LoadSequence.pop_back();
+ LoadSequence.pop_back();
+ LoadSequence.emplace_back(PreLast.Offset, LoadSize);
+ }
+}
+
// Initialize the basic block structure required for expansion of memcmp call
// with given maximum load size and memcmp size parameter.
// This structure includes:
@@ -255,6 +291,7 @@ MemCmpExpansion::MemCmpExpansion(
}
}
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
+ optimiseLoadSequence(LoadSequence, Options, IsUsedForZeroCmp);
}
unsigned MemCmpExpansion::getNumBlocks() {
@@ -278,7 +315,7 @@ void MemCmpExpansion::createResultBlock() {
}
MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
- bool NeedsBSwap,
+ Type *BSwapSizeType,
Type *CmpSizeType,
unsigned OffsetBytes) {
// Get the memory source at offset `OffsetBytes`.
@@ -307,16 +344,22 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
if (!Rhs)
Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+ // Zero extend if Byte Swap intrinsic has different type
+ if (BSwapSizeType && LoadSizeType != BSwapSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, BSwapSizeType);
+ Rhs = Builder.CreateZExt(Rhs, BSwapSizeType);
+ }
+
// Swap bytes if required.
- if (NeedsBSwap) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
+ if (BSwapSizeType) {
+ Function *Bswap = Intrinsic::getDeclaration(
+ CI->getModule(), Intrinsic::bswap, BSwapSizeType);
Lhs = Builder.CreateCall(Bswap, Lhs);
Rhs = Builder.CreateCall(Bswap, Rhs);
}
// Zero extend if required.
- if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) {
Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
}
@@ -332,7 +375,7 @@ void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
BasicBlock *BB = LoadCmpBlocks[BlockIndex];
Builder.SetInsertPoint(BB);
const LoadPair Loads =
- getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
+ getLoadPair(Type::getInt8Ty(CI->getContext()), nullptr,
Type::getInt32Ty(CI->getContext()), OffsetBytes);
Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
@@ -385,11 +428,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
IntegerType *const MaxLoadType =
NumLoads == 1 ? nullptr
: IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
const LoadPair Loads = getLoadPair(
- IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
- /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8), nullptr,
+ MaxLoadType, CurLoadEntry.Offset);
if (NumLoads != 1) {
// If we have multiple loads per block, we need to generate a composite
@@ -475,14 +519,20 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
Type *LoadSizeType =
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
- Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Type *BSwapSizeType =
+ DL.isLittleEndian()
+ ? IntegerType::get(CI->getContext(),
+ PowerOf2Ceil(CurLoadEntry.LoadSize * 8))
+ : nullptr;
+ Type *MaxLoadType = IntegerType::get(
+ CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(CurLoadEntry.LoadSize)) * 8);
assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- const LoadPair Loads =
- getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
- CurLoadEntry.Offset);
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
+ CurLoadEntry.Offset);
// Add the loaded values to the phi nodes for calculating memcmp result only
// if result is not used in a zero equality.
@@ -587,19 +637,24 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
/// A memcmp expansion that only has one block of load and compare can bypass
/// the compare, branch, and phi IR that is required in the general case.
Value *MemCmpExpansion::getMemCmpOneBlock() {
- Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ Type *BSwapSizeType =
+ NeedsBSwap ? IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8))
+ : nullptr;
+ Type *MaxLoadType =
+ IntegerType::get(CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(Size)) * 8);
// The i8 and i16 cases don't need compares. We zext the loaded values and
// subtract them to get the suitable negative, zero, or positive i32 result.
- if (Size < 4) {
- const LoadPair Loads =
- getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
- /*Offset*/ 0);
+ if (Size == 1 || Size == 2) {
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType,
+ Builder.getInt32Ty(), /*Offset*/ 0);
return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
}
- const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
+ const LoadPair Loads = getLoadPair(LoadSizeType, BSwapSizeType, MaxLoadType,
/*Offset*/ 0);
// The result of memcmp is negative, zero, or positive, so produce that by
// subtracting 2 extended compare bits: sub (ugt, ult).
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5f2d09f0765aa38..1d9dcfc4e9f446c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2961,6 +2961,7 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// they may wake up the FP unit, which raises the power consumption. Perhaps
// they could be used with no holds barred (-O3).
Options.LoadSizes = {8, 4, 2, 1};
+ Options.AllowedTailExpansions = {3, 5, 6};
return Options;
}
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
new file mode 100644
index 000000000000000..d13a416a28761ca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -0,0 +1,3005 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ ret i32 %m
+ }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length2_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq_nobuiltin_attr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: mov w2, #2 // =0x2
+; CHECK-NEXT: bl memcmp
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #2]
+; CHECK-NEXT: ldrh w11, [x1]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr w9, w11, w10, lsl #16
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #2]
+; CHECK-NEXT: ldrb w11, [x1, #2]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length4_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: movk w9, #13363, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #4]
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #4]
+; CHECK-NEXT: ldr w11, [x1]
+; CHECK-NEXT: orr x8, x9, x8, lsl #32
+; CHECK-NEXT: orr x9, x11, x10, lsl #32
+; CHECK-NEXT: rev x8, x8
+; CHECK-NEXT: rev x9, x9
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #4]
+; CHECK-NEXT: ldrb w11, [x1, #4]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_lt:
+; CHECK: // %bb.0:
+; CHECK-NE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/70469
More information about the llvm-commits
mailing list