[llvm] [CodeGen] Improve ExpandMemCmp for more efficient non-register aligned sizes handling (PR #69942)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 23 09:40:24 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Igor Kirillov (igogo-x86)
<details>
<summary>Changes</summary>
* Enhanced the logic of the ExpandMemCmp pass to merge contiguous subsequences
in LoadSequence, based on the sizes allowed in `AllowedTailExpansions`.
* This enhancement seeks to minimize the number of basic blocks and produce
optimized code when using memcmp with non-register aligned sizes.
* Enable this feature for AArch64 for memcmp calls whose size modulo 8
equals 3, 5, or 6.
---
Patch is 147.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69942.diff
5 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+11)
- (modified) llvm/lib/CodeGen/ExpandMemCmp.cpp (+59-15)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+1)
- (added) llvm/test/CodeGen/AArch64/memcmp.ll (+3002)
- (added) llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll (+877)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..3ec80d99b392b2e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -907,6 +907,17 @@ class TargetTransformInfo {
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
// requires all loads in LoadSizes to be doable in an unaligned way.
bool AllowOverlappingLoads = false;
+
+ // Sometimes, the amount of data that needs to be compared is smaller than
+ // the standard register size, but it cannot be loaded with just one load
+ // instruction. For example, if the size of the memory comparison is 6
+ // bytes, we can handle it more efficiently by loading all 6 bytes in a
+ // single block and generating an 8-byte number, instead of generating two
+ // separate blocks with conditional jumps for 4 and 2 byte loads. This
+ // approach simplifies the process and produces the comparison result as
+ // normal. This array lists the allowed sizes of memcmp tails that can be
+ // merged into one block
+ SmallVector<unsigned, 4> AllowedTailExpansions;
};
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..d9c2c6f5f39ba6d 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -117,8 +117,8 @@ class MemCmpExpansion {
Value *Lhs = nullptr;
Value *Rhs = nullptr;
};
- LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
- unsigned OffsetBytes);
+ LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *BSwapSizeType,
+ Type *CmpSizeType, unsigned OffsetBytes);
static LoadEntryVector
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -255,6 +255,31 @@ MemCmpExpansion::MemCmpExpansion(
}
}
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
+ // This part of code attempts to optimize the LoadSequence by merging allowed
+ // subsequences into single loads of allowed sizes from
+ // `AllowedTailExpansions`. If it is for zero comparison or if no allowed tail
+ // expansions are specified, we exit early.
+ if (IsUsedForZeroCmp || !Options.AllowedTailExpansions.size())
+ return;
+
+ while (LoadSequence.size() >= 2) {
+ auto Last = LoadSequence[LoadSequence.size() - 1];
+ auto PreLast = LoadSequence[LoadSequence.size() - 2];
+
+ // Exit the loop if the two sequences are not contiguous
+ if (PreLast.Offset + PreLast.LoadSize != Last.Offset)
+ break;
+
+ auto LoadSize = Last.LoadSize + PreLast.LoadSize;
+ if (find(Options.AllowedTailExpansions, LoadSize) ==
+ Options.AllowedTailExpansions.end())
+ break;
+
+ // Remove the last two sequences and replace with the combined sequence
+ LoadSequence.pop_back();
+ LoadSequence.pop_back();
+ LoadSequence.emplace_back(PreLast.Offset, LoadSize);
+ }
}
unsigned MemCmpExpansion::getNumBlocks() {
@@ -279,6 +304,7 @@ void MemCmpExpansion::createResultBlock() {
MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
bool NeedsBSwap,
+ Type *BSwapSizeType,
Type *CmpSizeType,
unsigned OffsetBytes) {
// Get the memory source at offset `OffsetBytes`.
@@ -307,16 +333,22 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
if (!Rhs)
Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+ // Zero extend if Byte Swap intrinsic has different type
+ if (NeedsBSwap && LoadSizeType != BSwapSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, BSwapSizeType);
+ Rhs = Builder.CreateZExt(Rhs, BSwapSizeType);
+ }
+
// Swap bytes if required.
if (NeedsBSwap) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
+ Function *Bswap = Intrinsic::getDeclaration(
+ CI->getModule(), Intrinsic::bswap, BSwapSizeType);
Lhs = Builder.CreateCall(Bswap, Lhs);
Rhs = Builder.CreateCall(Bswap, Rhs);
}
// Zero extend if required.
- if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) {
Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
}
@@ -333,7 +365,7 @@ void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
Builder.SetInsertPoint(BB);
const LoadPair Loads =
getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
- Type::getInt32Ty(CI->getContext()), OffsetBytes);
+ nullptr, Type::getInt32Ty(CI->getContext()), OffsetBytes);
Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
PhiRes->addIncoming(Diff, BB);
@@ -385,11 +417,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
IntegerType *const MaxLoadType =
NumLoads == 1 ? nullptr
: IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
const LoadPair Loads = getLoadPair(
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
- /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
+ /*NeedsBSwap=*/false, nullptr, MaxLoadType, CurLoadEntry.Offset);
if (NumLoads != 1) {
// If we have multiple loads per block, we need to generate a composite
@@ -475,14 +508,18 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
Type *LoadSizeType =
IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
- Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Type *BSwapSizeType = IntegerType::get(
+ CI->getContext(), PowerOf2Ceil(CurLoadEntry.LoadSize * 8));
+ Type *MaxLoadType = IntegerType::get(
+ CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(CurLoadEntry.LoadSize)) * 8);
assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
const LoadPair Loads =
- getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
- CurLoadEntry.Offset);
+ getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(),
+ BSwapSizeType, MaxLoadType, CurLoadEntry.Offset);
// Add the loaded values to the phi nodes for calculating memcmp result only
// if result is not used in a zero equality.
@@ -588,19 +625,26 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
/// the compare, branch, and phi IR that is required in the general case.
Value *MemCmpExpansion::getMemCmpOneBlock() {
Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ Type *BSwapSizeType =
+ IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8));
+ Type *MaxLoadType =
+ IntegerType::get(CI->getContext(),
+ std::max(MaxLoadSize, (unsigned)PowerOf2Ceil(Size)) * 8);
+
bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
// The i8 and i16 cases don't need compares. We zext the loaded values and
// subtract them to get the suitable negative, zero, or positive i32 result.
if (Size < 4) {
- const LoadPair Loads =
- getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
- /*Offset*/ 0);
+ const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, BSwapSizeType,
+ Builder.getInt32Ty(),
+ /*Offset*/ 0);
return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
}
- const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
- /*Offset*/ 0);
+ const LoadPair Loads =
+ getLoadPair(LoadSizeType, NeedsBSwap, BSwapSizeType, MaxLoadType,
+ /*Offset*/ 0);
// The result of memcmp is negative, zero, or positive, so produce that by
// subtracting 2 extended compare bits: sub (ugt, ult).
// If a target prefers to use selects to get -1/0/1, they should be able
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8a0e68d7123759..388034dd76ca4cb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2938,6 +2938,7 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// they may wake up the FP unit, which raises the power consumption. Perhaps
// they could be used with no holds barred (-O3).
Options.LoadSizes = {8, 4, 2, 1};
+ Options.AllowedTailExpansions = {3, 5, 6};
return Options;
}
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
new file mode 100644
index 000000000000000..b38acbae1091536
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -0,0 +1,3002 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ ret i32 %m
+ }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length0_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w0, w8, w9, lsr #16
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: add w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: lsr w8, w8, #16
+; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length2_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length2_eq_nobuiltin_attr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: mov w2, #2 // =0x2
+; CHECK-NEXT: bl memcmp
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #2]
+; CHECK-NEXT: ldrh w11, [x1]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr w9, w11, w10, lsl #16
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length3_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: ldrh w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #2]
+; CHECK-NEXT: ldrb w11, [x1, #2]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length4_gt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: rev w8, w8
+; CHECK-NEXT: rev w9, w9
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+ %c = icmp sgt i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: length4_eq_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: mov w9, #12849 // =0x3231
+; CHECK-NEXT: movk w9, #13363, lsl #16
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #4]
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #4]
+; CHECK-NEXT: ldr w11, [x1]
+; CHECK-NEXT: orr x8, x9, x8, lsl #32
+; CHECK-NEXT: orr x9, x11, x10, lsl #32
+; CHECK-NEXT: rev x8, x8
+; CHECK-NEXT: rev x9, x9
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ldr w9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #4]
+; CHECK-NEXT: ldrb w11, [x1, #4]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ %c = icmp ne i32 %m, 0
+ ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length5_lt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrb w8, [x0, #4]
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #4]
+; CHECK-NEXT: ldr w11, [x1]
+; CHECK-NEXT: orr x8, x9, x8, lsl #32
+; CHECK-NEXT: orr x9, x11, x10, lsl #32
+; CHECK-NEXT: rev x8, x8
+; CHECK-NEXT: rev x9, x9
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w8, w8, w9
+; CHECK-NEXT: lsr w0, w8, #31
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+ %c = icmp slt i32 %m, 0
+ ret i1 %c
+}
+
+define i32 @length6(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrh w8, [x0, #4]
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: ldrh w10, [x1, #4]
+; CHECK-NEXT: ldr w11, [x1]
+; CHECK-NEXT: orr x8, x9, x8, lsl #32
+; CHECK-NEXT: orr x9, x11, x10, lsl #32
+; CHECK-NEXT: rev x8, x8
+; CHECK-NEXT: rev x9, x9
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: cset w9, lo
+; CHECK-NEXT: sub w0, w8, w9
+; CHECK-NEXT: ret
+ %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
+ ret i32 %m
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: length7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/69942
More information about the llvm-commits
mailing list