[llvm] [llvm] Optimize misaligned accesses with early profitable splitting (PR #145168)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 21 06:21:27 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Acthinks Yang (Acthinks)
Changes:
Misaligned load/store splitting remains in DAG legalization for correctness, but its naive half-based approach misses optimization opportunities when pointer alignment information is available. In addition, the MachineMemOperand alignment is weakened by commonAlignment(getBaseAlign(), getOffset()).
This change introduces an IR-stage transformation in CodeGenPrepare that splits misaligned accesses only when doing so is profitable. Key aspects:
1. Not a replacement for DAG legalization: DAG splitting is preserved.
2. Uses precise pointer alignment knowledge to generate optimal splits (see the IR sketch below).
3. Strictly profit-driven: applied only when splitting reduces cost.
4. Preserves the maximum known alignment through the pipeline via accurate MMOs.
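As a concrete illustration, here is a condensed before/after sketch of the little-endian @foo-i32 case from the new RISC-V test (misalign-refine.ll) in this patch. The store's pointer is a 4-byte-aligned base plus an offset of 3, so storing one byte brings the pointer to a 4-byte boundary and the remaining 24 bits can be written as a single aligned chunk, rather than two under-aligned halves.

```llvm
; Input: base pointer is 4-byte aligned, accessed at offset 3 with only align 1.
define void @foo-i32(ptr align 4 %p, i32 %v) {
entry:
  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
  store i32 %v, ptr %len, align 1
  ret void
}

; After CodeGenPrepare (little endian, condensed from the test's LE check lines):
; entry:
;   %len = getelementptr inbounds nuw i8, ptr %p, i32 3
;   %0 = trunc i32 %v to i8
;   store i8 %0, ptr %len, align 1    ; one byte reaches a 4-byte boundary
;   %1 = lshr i32 %v, 8
;   %2 = getelementptr i8, ptr %len, i32 1
;   %3 = trunc i32 %1 to i24
;   store i24 %3, ptr %2, align 4     ; remaining 24 bits at full alignment
;   ret void
```

The candidate check only fires when adding the current alignment to the pointer's known bits increases the minimum trailing-zero count, i.e. when the first small chunk genuinely steps the access up to a larger alignment.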
Fixes #143215
---
Patch is 28.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145168.diff
7 Files Affected:
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+138)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+24-46)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+6-16)
- (modified) llvm/test/CodeGen/AVR/calling-conv/c/basic.ll (+8-3)
- (modified) llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll (+20-10)
- (modified) llvm/test/CodeGen/XCore/unaligned_load.ll (+5-2)
- (added) llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll (+286)
``````````diff
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 43574a54c37dd..a6e14dd6b581e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -436,6 +436,8 @@ class CodeGenPrepare {
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
+ bool optimizeStoreMisalign(StoreInst *ST);
+ bool optimizeLoadMisalign(LoadInst *LI);
bool optimizeShiftInst(BinaryOperator *BO);
bool optimizeFunnelShift(IntrinsicInst *Fsh);
bool optimizeSelectInst(SelectInst *SI);
@@ -7353,6 +7355,138 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
return true;
}
+static bool isOptimizeMisalignCandidate(Instruction *I, const DataLayout *DL,
+ const TargetLowering *TLI,
+ const DominatorTree *DT) {
+ if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
+ return false;
+
+ Value *Ptr = I->getOperand(isa<StoreInst>(I) ? 1 : 0);
+ Align Alignment = isa<StoreInst>(I) ? cast<StoreInst>(I)->getAlign()
+ : cast<LoadInst>(I)->getAlign();
+ Type *ValTy = isa<StoreInst>(I) ? I->getOperand(0)->getType() : I->getType();
+
+ if (ValTy->isScalableTy() || !ValTy->isSized())
+ return false;
+
+ unsigned BitWidth = DL->getTypeSizeInBits(ValTy);
+
+ // DAG legalization already handles this case well; skip the split.
+ if (Alignment.value() * 8 >= BitWidth / 2)
+ return false;
+
+ Type *PtrTy = Ptr->getType();
+ EVT ValVT = TLI->getValueType(*DL, ValTy, true);
+ if (!ValVT.isSimple() || ValVT == MVT::Other ||
+ TLI->allowsMisalignedMemoryAccesses(
+ ValVT, PtrTy->getPointerAddressSpace(), Alignment))
+ return false;
+
+ KnownBits Known = computeKnownBits(Ptr, *DL, nullptr, I, DT);
+ if (Known.isUnknown())
+ return false;
+
+ unsigned PtrWidth = DL->getPointerTypeSizeInBits(PtrTy);
+ KnownBits AlignKnown =
+ KnownBits::makeConstant(APInt(PtrWidth, Alignment.value()));
+
+ if (KnownBits::add(Known, AlignKnown).countMinTrailingZeros() <=
+ AlignKnown.countMinTrailingZeros())
+ return false;
+ return true;
+}
+
+bool CodeGenPrepare::optimizeStoreMisalign(StoreInst *SI) {
+ if (!isOptimizeMisalignCandidate(SI, DL, TLI, DT.get()))
+ return false;
+
+ IRBuilder<> Builder(SI);
+ Value *Val = SI->getValueOperand();
+ unsigned BitWidth = DL->getTypeSizeInBits(Val->getType());
+ if (!Val->getType()->isIntegerTy())
+ Val =
+ Builder.CreateBitCast(Val, Type::getIntNTy(SI->getContext(), BitWidth));
+
+ bool IsLE = DL->isLittleEndian();
+ bool IsVolatile = SI->isVolatile();
+ Align Alignment = SI->getAlign();
+ Value *Ptr = SI->getPointerOperand();
+ unsigned RemainingBits = BitWidth;
+ Type *Int8Ty = Type::getInt8Ty(SI->getContext());
+ Type *Int32Ty = Type::getInt32Ty(SI->getContext());
+
+ while (RemainingBits > 0) {
+ unsigned ChunkBits =
+ std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+ Type *ChunkTy = Type::getIntNTy(SI->getContext(), ChunkBits);
+ Value *ChunkVal;
+ if (IsLE) {
+ ChunkVal = Builder.CreateTrunc(Val, ChunkTy);
+ } else {
+ Value *ShiftR = Builder.CreateLShr(Val, BitWidth - ChunkBits);
+ ChunkVal = Builder.CreateTrunc(ShiftR, ChunkTy);
+ }
+ Builder.CreateAlignedStore(ChunkVal, Ptr, Alignment, IsVolatile);
+ RemainingBits -= ChunkBits;
+ if (RemainingBits == 0)
+ break;
+
+ Val = IsLE ? Builder.CreateLShr(Val, ChunkBits)
+ : Builder.CreateShl(Val, ChunkBits);
+ Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+ ConstantInt::get(Int32Ty, ChunkBits / 8));
+ Alignment = getKnownAlignment(Ptr, *DL);
+ }
+
+ SI->eraseFromParent();
+ return true;
+}
+
+bool CodeGenPrepare::optimizeLoadMisalign(LoadInst *LI) {
+ if (!isOptimizeMisalignCandidate(LI, DL, TLI, DT.get()))
+ return false;
+
+ IRBuilder<> Builder(LI);
+ Type *ValTy = LI->getType();
+
+ unsigned BitWidth = DL->getTypeSizeInBits(LI->getType());
+ bool IsLE = DL->isLittleEndian();
+ bool IsVolatile = LI->isVolatile();
+ Align Alignment = LI->getAlign();
+ Value *Ptr = LI->getPointerOperand();
+ unsigned RemainingBits = BitWidth;
+ Type *IntTy = Type::getIntNTy(LI->getContext(), BitWidth);
+ Type *Int8Ty = Type::getInt8Ty(LI->getContext());
+ Type *Int32Ty = Type::getInt32Ty(LI->getContext());
+ Value *Val = ConstantInt::get(IntTy, 0);
+
+ while (RemainingBits > 0) {
+ unsigned ChunkBits =
+ std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+ Type *ChunkTy = Type::getIntNTy(LI->getContext(), ChunkBits);
+ Value *ChunkVal = Builder.CreateZExt(
+ Builder.CreateAlignedLoad(ChunkTy, Ptr, Alignment, IsVolatile), IntTy);
+ if (IsLE) {
+ ChunkVal = Builder.CreateShl(ChunkVal, BitWidth - RemainingBits);
+ } else {
+ ChunkVal = Builder.CreateShl(ChunkVal, RemainingBits - ChunkBits);
+ }
+ Val = Builder.CreateOr(Val, ChunkVal);
+ RemainingBits -= ChunkBits;
+ if (RemainingBits == 0)
+ break;
+ Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+ ConstantInt::get(Int32Ty, ChunkBits / 8));
+ Alignment = getKnownAlignment(Ptr, *DL);
+ }
+
+ if (!ValTy->isIntegerTy())
+ Val = Builder.CreateBitCast(Val, ValTy);
+ LI->replaceAllUsesWith(Val);
+ LI->eraseFromParent();
+ return true;
+}
+
/// Check if V (an operand of a select instruction) is an expensive instruction
/// that is only used once.
static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -8750,6 +8884,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
return true;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (optimizeLoadMisalign(LI))
+ return true;
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
bool Modified = optimizeLoadExt(LI);
unsigned AS = LI->getPointerAddressSpace();
@@ -8760,6 +8896,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (splitMergedValStore(*SI, *DL, *TLI))
return true;
+ if (optimizeStoreMisalign(SI))
+ return true;
SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
unsigned AS = SI->getPointerAddressSpace();
return optimizeMemoryInst(I, SI->getOperand(1),
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index d95f528442efd..a34aacbea5668 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1451,63 +1451,41 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspac
define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
; CI-LABEL: read2_v2i32_align1_odd_offset:
; CI: ; %bb.0: ; %entry
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_u8 v1, v0 offset:70
-; CI-NEXT: ds_read_u8 v2, v0 offset:72
-; CI-NEXT: ds_read_u8 v3, v0 offset:71
-; CI-NEXT: ds_read_u8 v4, v0 offset:69
-; CI-NEXT: ds_read_u8 v5, v0 offset:68
-; CI-NEXT: s_waitcnt lgkmcnt(4)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
-; CI-NEXT: ds_read_u8 v4, v0 offset:66
-; CI-NEXT: ds_read_u8 v6, v0 offset:67
-; CI-NEXT: ds_read_u8 v0, v0 offset:65
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: ds_read_u8 v2, v1 offset:65
+; CI-NEXT: ds_read_u16 v3, v1 offset:66
+; CI-NEXT: ds_read_b32 v0, v1 offset:68
+; CI-NEXT: ds_read_u8 v4, v1 offset:72
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 24
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_or_b32_e32 v0, v0, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
-; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
-; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
-; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
-; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:70
-; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:69
-; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72
-; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 offset:66
+; GFX9-ALIGNED-NEXT: ds_read_b32 v0, v1 offset:68
+; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:65
+; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:72
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v7
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5
-; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v2
+; GFX9-ALIGNED-NEXT: v_lshlrev_b64 v[2:3], 24, v[0:1]
+; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v0, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-ALIGNED-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 24, v5
+; GFX9-ALIGNED-NEXT: v_or_b32_e32 v3, v3, v0
+; GFX9-ALIGNED-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 41e3d5f10f6dd..31c6739ce5559 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1009,15 +1009,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v1, v0 offset:65
-; CI-NEXT: v_mov_b32_e32 v0, 1
-; CI-NEXT: ds_write_b8 v1, v0 offset:70
-; CI-NEXT: v_mov_b32_e32 v0, 0xc8
-; CI-NEXT: ds_write_b8 v1, v0 offset:69
-; CI-NEXT: ds_write_b8 v1, v1 offset:68
-; CI-NEXT: ds_write_b8 v1, v1 offset:67
-; CI-NEXT: ds_write_b8 v1, v1 offset:66
+; CI-NEXT: ds_write_b16 v1, v1 offset:66
+; CI-NEXT: v_mov_b32_e32 v0, 0x1c800
+; CI-NEXT: ds_write_b32 v1, v0 offset:68
; CI-NEXT: ds_write_b8 v1, v1 offset:72
-; CI-NEXT: ds_write_b8 v1, v1 offset:71
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
@@ -1025,15 +1020,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66
+; GFX9-ALIGNED-NEXT: ds_write_b16 v1, v1 offset:66
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x1c800
+; GFX9-ALIGNED-NEXT: ds_write_b32 v1, v0 offset:68
; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72
-; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
index 3d783d143192d..278df39938342 100644
--- a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
+++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
@@ -12,10 +12,15 @@ define void @ret_void_args_i8_i32(i8 %a, i32 %b) {
; CHECK: sts 4, r24
store volatile i8 %a, ptr inttoptr (i64 4 to ptr)
- ; CHECK-NEXT: sts 8, r23
- ; CHECK-NEXT: sts 7, r22
- ; CHECK-NEXT: sts 6, r21
; CHECK-NEXT: sts 5, r20
+
+ ; redundant instructions, should be deleted
+ ; CHECK-NEXT: mov r24, r21
+ ; CHECK-NEXT: mov r25, r22
+
+ ; CHECK-NEXT: sts 7, r25
+ ; CHECK-NEXT: sts 6, r24
+ ; CHECK-NEXT: sts 8, r23
store volatile i32 %b, ptr inttoptr (i64 5 to ptr)
ret void
}
diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
index 07d58841dd802..cd1817cd245be 100644
--- a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
+++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
@@ -7,10 +7,15 @@ start:
%0 = extractvalue { i8, i32 } %a, 0
store volatile i8 %0, ptr inttoptr (i64 4 to ptr)
- ; CHECK-NEXT: sts 8, r24
- ; CHECK-NEXT: sts 7, r23
- ; CHECK-NEXT: sts 6, r22
; CHECK-NEXT: sts 5, r21
+
+ ; redundant instructions, should be deleted
+ ; CHECK-NEXT: mov r18, r22
+ ; CHECK-NEXT: mov r19, r23
+
+ ; CHECK-NEXT: sts 7, r19
+ ; CHECK-NEXT: sts 6, r18
+ ; CHECK-NEXT: sts 8, r24
%1 = extractvalue { i8, i32 } %a, 1
store volatile i32 %1, ptr inttoptr (i64 5 to ptr)
ret void
@@ -62,17 +67,22 @@ start:
%0 = extractvalue { i8, i32 } %a, 0
store volatile i8 %0, ptr inttoptr (i64 4 to ptr)
- ; CHECK-NEXT: sts 8, r24
- ; CHECK-NEXT: sts 7, r23
- ; CHECK-NEXT: sts 6, r22
; CHECK-NEXT: sts 5, r21
+
+ ; redundant instructions, should be deleted
+ ; CHECK-NEXT: mov r20, r22
+ ; CHECK-NEXT: mov r21, r23
+
+ ; CHECK-NEXT: sts 7, r21
+ ; CHECK-NEXT: sts 6, r20
+ ; CHECK-NEXT: sts 8, r24
%1 = extractvalue { i8, i32 } %a, 1
store volatile i32 %1, ptr inttoptr (i64 5 to ptr)
- ; CHECK-NEXT: sts 9, r17
- ; CHECK-NEXT: sts 8, r16
- ; CHECK-NEXT: sts 7, r15
- ; CHECK-NEXT: sts 6, r14
+ ; CHECK-NEXT: sts 9, r17
+ ; CHECK-NEXT: sts 8, r16
+ ; CHECK-NEXT: sts 7, r15
+ ; CHECK-NEXT: sts 6, r14
%2 = extractvalue { i32, i8 } %b, 0
store volatile i32 %2, ptr inttoptr (i64 6 to ptr)
diff --git a/llvm/test/CodeGen/XCore/unaligned_load.ll b/llvm/test/CodeGen/XCore/unaligned_load.ll
index ee9aea4689503..ce27c1ac49801 100644
--- a/llvm/test/CodeGen/XCore/unaligned_load.ll
+++ b/llvm/test/CodeGen/XCore/unaligned_load.ll
@@ -24,8 +24,11 @@ entry:
; Constant offset from word aligned base.
; CHECK-LABEL: align3:
-; CHECK: ldw {{r[0-9]+}}, dp
-; CHECK: ldw {{r[0-9]+}}, dp
+; CHECK: ldaw {{r[0-9]+}}, dp
+; CHECK: ld8u
+; CHECK: ld16s
+; CHECK: or
+; CHECK: ld8u
; CHECK: or
define i32 @align3() nounwind {
entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
new file mode 100644
index 0000000000000..a4d28aac256fc
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 %s \
+; RUN: | FileCheck --check-prefixes=LE %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 -data-layout="E-m:e-p:32:32-i64:64-n32-S128" %s \
+; RUN: | FileCheck --check-prefixes=BE %s
+
+
+define void @foo-i32(ptr align 4 %p, i32 %v) {
+; LE-LABEL: @foo-i32(
+; LE-NEXT: entry:
+; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT: [[TMP0:%.*]] = trunc i32 [[V:%.*]] to i8
+; LE-NEXT: store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT: [[TMP1:%.*]] = lshr i32 [[V]], 8
+; LE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP1]] to i24
+; LE-NEXT: store i24 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @foo-i32(
+; BE-NEXT: entry:
+; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT: [[TMP0:%.*]] = lshr i32 [[V:%.*]], 24
+; BE-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+; BE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT: [[TMP2:%.*]] = shl i32 [[V]], 8
+; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP2]], 8
+; BE-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
+; BE-NEXT: store i24 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+define void @foo-i64(ptr align 4 %p, i64 %v) {
+; LE-LABEL: @foo-i64(
+; LE-NEXT: entry:
+; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT: [[TMP0:%.*]] = trunc i64 [[V:%.*]] to i8
+; LE-NEXT: store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT: [[TMP1:%.*]] = lshr i64 [[V]], 8
+; LE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP1]], 32
+; LE-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 4
+; LE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP4]] to i24
+; LE-NEXT: store i24 [[TMP6]], ptr [[TMP5]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @foo-i64(
+; BE-NEXT: entry:
+; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT: [[TMP0:%.*]] = lshr i64 [[V:%.*]], 56
+; BE-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8
+; BE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT: [[TMP2:%.*]] = shl i64 [[V]], 8
+; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
+; BE-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; BE-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP2]], 32
+; BE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP3]], i32 4
+; BE-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP6]], 40
+; BE-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i24
+; BE-NEXT: store i24 [[TMP9]], ptr [[TMP7]], align 4
+; BE-NEXT: ret void
+;
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i64 %v, ptr %len, align 1
+ ret void
+}
+
+define void @foo-float(ptr align 4 %p, float %v) {
+; LE-LABEL: @foo-float(
+; LE-NEXT: entry:
+; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT: [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32
+; LE-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+; LE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1
+; LE-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 8
+; LE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP2]] to i24
+; LE-NEXT: store i24 [[TMP4]], ptr [[TMP3]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @foo-float(
+; BE-NEXT: entry:
+; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT: [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32
+; BE-NEXT: [[TMP1:%.*]...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/145168
More information about the llvm-commits mailing list