[llvm] [llvm] Optimize misaligned accesses with early profitable splitting (PR #145168)

via llvm-commits llvm-commits at lists.llvm.org
Sat Jun 21 06:21:28 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Acthinks Yang (Acthinks)

Changes:

Misaligned load/store splitting remains in DAG legalization for correctness, but its naive split-in-half approach misses optimization opportunities when precise pointer alignment information is available. In addition, the MachineMemOperand alignment is weakened to commonAlignment(getBaseAlign(), getOffset()), discarding the stronger alignment of the base pointer.
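
For instance (a minimal standalone sketch using the LLVMSupport alignment helpers, not part of the patch), a 4-byte-aligned base with a constant offset of 3 collapses to alignment 1 for the whole access, even though the chunk that starts at offset 4 is still 4-byte aligned:

```cpp
// Illustrative sketch only: how commonAlignment() combines the base
// alignment with a constant offset.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

int main() {
  llvm::Align BaseAlign(4); // pointer is known to be 4-byte aligned
  uint64_t Offset = 3;      // access starts 3 bytes past the base

  // The whole access is treated as align 1: commonAlignment(4, 3) == 1.
  llvm::Align Whole = llvm::commonAlignment(BaseAlign, Offset);

  // But a chunk starting one byte later (offset 4) is still align 4.
  llvm::Align Tail = llvm::commonAlignment(BaseAlign, Offset + 1);

  llvm::outs() << Whole.value() << " vs " << Tail.value() << "\n"; // "1 vs 4"
  return 0;
}
```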

This change introduces a new IR-stage transformation in CodeGenPrepare that splits misaligned accesses only when doing so is profitable. Key aspects (an IR example follows the list):
1. Not a replacement for DAG legalization: DAG splitting is preserved for correctness.
2. Uses precise pointer alignment knowledge to generate optimal splits.
3. Strictly profit-driven: applied only when splitting reduces cost.
4. Preserves the maximum known alignment through the pipeline via accurate MMOs.
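
To make the split concrete, here is a before/after sketch for the little-endian case, adapted from the foo-i32 case in the new RISC-V test (function names are illustrative): a misaligned i32 store at offset 3 from an align-4 pointer becomes an align-1 i8 store of the low byte followed by an align-4 i24 store of the remaining bytes.

```llvm
; Before: a single i32 store, only 1-byte aligned because of the +3 offset.
define void @store_misaligned(ptr align 4 %p, i32 %v) {
entry:
  %addr = getelementptr inbounds nuw i8, ptr %p, i32 3
  store i32 %v, ptr %addr, align 1
  ret void
}

; After CodeGenPrepare (little-endian): an align-1 i8 store of the leading
; byte, then an align-4 i24 store of the remaining 3 bytes at offset 4.
define void @store_split(ptr align 4 %p, i32 %v) {
entry:
  %addr = getelementptr inbounds nuw i8, ptr %p, i32 3
  %lo = trunc i32 %v to i8
  store i8 %lo, ptr %addr, align 1
  %rest = lshr i32 %v, 8
  %addr4 = getelementptr i8, ptr %addr, i32 1
  %rest.t = trunc i32 %rest to i24
  store i24 %rest.t, ptr %addr4, align 4
  ret void
}
```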

Fixes #143215

---

Patch is 28.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145168.diff


7 Files Affected:

- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+138) 
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+24-46) 
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+6-16) 
- (modified) llvm/test/CodeGen/AVR/calling-conv/c/basic.ll (+8-3) 
- (modified) llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll (+20-10) 
- (modified) llvm/test/CodeGen/XCore/unaligned_load.ll (+5-2) 
- (added) llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll (+286) 


``````````diff
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 43574a54c37dd..a6e14dd6b581e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -436,6 +436,8 @@ class CodeGenPrepare {
   bool optimizeExt(Instruction *&I);
   bool optimizeExtUses(Instruction *I);
   bool optimizeLoadExt(LoadInst *Load);
+  bool optimizeStoreMisalign(StoreInst *SI);
+  bool optimizeLoadMisalign(LoadInst *LI);
   bool optimizeShiftInst(BinaryOperator *BO);
   bool optimizeFunnelShift(IntrinsicInst *Fsh);
   bool optimizeSelectInst(SelectInst *SI);
@@ -7353,6 +7355,138 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   return true;
 }
 
+static bool isOptimizeMisalignCandidate(Instruction *I, const DataLayout *DL,
+                                        const TargetLowering *TLI,
+                                        const DominatorTree *DT) {
+  if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
+    return false;
+
+  Value *Ptr = I->getOperand(isa<StoreInst>(I) ? 1 : 0);
+  Align Alignment = isa<StoreInst>(I) ? cast<StoreInst>(I)->getAlign()
+                                      : cast<LoadInst>(I)->getAlign();
+  Type *ValTy = isa<StoreInst>(I) ? I->getOperand(0)->getType() : I->getType();
+
+  if (ValTy->isScalableTy() || !ValTy->isSized())
+    return false;
+
+  unsigned BitWidth = DL->getTypeSizeInBits(ValTy);
+
+  // DAG legalization can handle this situation well
+  if (Alignment.value() * 8 >= BitWidth / 2)
+    return false;
+
+  Type *PtrTy = Ptr->getType();
+  EVT ValVT = TLI->getValueType(*DL, ValTy, true);
+  if (!ValVT.isSimple() || ValVT == MVT::Other ||
+      TLI->allowsMisalignedMemoryAccesses(
+          ValVT, PtrTy->getPointerAddressSpace(), Alignment))
+    return false;
+
+  KnownBits Known = computeKnownBits(Ptr, *DL, nullptr, I, DT);
+  if (Known.isUnknown())
+    return false;
+
+  unsigned PtrWidth = DL->getPointerTypeSizeInBits(PtrTy);
+  KnownBits AlignKnown =
+      KnownBits::makeConstant(APInt(PtrWidth, Alignment.value()));
+
+  if (KnownBits::add(Known, AlignKnown).countMinTrailingZeros() <=
+      AlignKnown.countMinTrailingZeros())
+    return false;
+  return true;
+}
+
+bool CodeGenPrepare::optimizeStoreMisalign(StoreInst *SI) {
+  if (!isOptimizeMisalignCandidate(SI, DL, TLI, DT.get()))
+    return false;
+
+  IRBuilder<> Builder(SI);
+  Value *Val = SI->getValueOperand();
+  unsigned BitWidth = DL->getTypeSizeInBits(Val->getType());
+  if (!Val->getType()->isIntegerTy())
+    Val =
+        Builder.CreateBitCast(Val, Type::getIntNTy(SI->getContext(), BitWidth));
+
+  bool IsLE = DL->isLittleEndian();
+  bool IsVolatile = SI->isVolatile();
+  Align Alignment = SI->getAlign();
+  Value *Ptr = SI->getPointerOperand();
+  unsigned RemainingBits = BitWidth;
+  Type *Int8Ty = Type::getInt8Ty(SI->getContext());
+  Type *Int32Ty = Type::getInt32Ty(SI->getContext());
+
+  while (RemainingBits > 0) {
+    unsigned ChunkBits =
+        std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+    Type *ChunkTy = Type::getIntNTy(SI->getContext(), ChunkBits);
+    Value *ChunkVal;
+    if (IsLE) {
+      ChunkVal = Builder.CreateTrunc(Val, ChunkTy);
+    } else {
+      Value *ShiftR = Builder.CreateLShr(Val, BitWidth - ChunkBits);
+      ChunkVal = Builder.CreateTrunc(ShiftR, ChunkTy);
+    }
+    Builder.CreateAlignedStore(ChunkVal, Ptr, Alignment, IsVolatile);
+    RemainingBits -= ChunkBits;
+    if (RemainingBits == 0)
+      break;
+
+    Val = IsLE ? Builder.CreateLShr(Val, ChunkBits)
+               : Builder.CreateShl(Val, ChunkBits);
+    Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+                            ConstantInt::get(Int32Ty, ChunkBits / 8));
+    Alignment = getKnownAlignment(Ptr, *DL);
+  }
+
+  SI->eraseFromParent();
+  return true;
+}
+
+bool CodeGenPrepare::optimizeLoadMisalign(LoadInst *LI) {
+  if (!isOptimizeMisalignCandidate(LI, DL, TLI, DT.get()))
+    return false;
+
+  IRBuilder<> Builder(LI);
+  Type *ValTy = LI->getType();
+
+  unsigned BitWidth = DL->getTypeSizeInBits(LI->getType());
+  bool IsLE = DL->isLittleEndian();
+  bool IsVolatile = LI->isVolatile();
+  Align Alignment = LI->getAlign();
+  Value *Ptr = LI->getPointerOperand();
+  unsigned RemainingBits = BitWidth;
+  Type *IntTy = Type::getIntNTy(LI->getContext(), BitWidth);
+  Type *Int8Ty = Type::getInt8Ty(LI->getContext());
+  Type *Int32Ty = Type::getInt32Ty(LI->getContext());
+  Value *Val = ConstantInt::get(IntTy, 0);
+
+  while (RemainingBits > 0) {
+    unsigned ChunkBits =
+        std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+    Type *ChunkTy = Type::getIntNTy(LI->getContext(), ChunkBits);
+    Value *ChunkVal = Builder.CreateZExt(
+        Builder.CreateAlignedLoad(ChunkTy, Ptr, Alignment, IsVolatile), IntTy);
+    if (IsLE) {
+      ChunkVal = Builder.CreateShl(ChunkVal, BitWidth - RemainingBits);
+    } else {
+      ChunkVal = Builder.CreateShl(ChunkVal, RemainingBits - ChunkBits);
+    }
+    Val = Builder.CreateOr(Val, ChunkVal);
+    RemainingBits -= ChunkBits;
+    if (RemainingBits == 0)
+      break;
+    Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+                            ConstantInt::get(Int32Ty, ChunkBits / 8));
+    Alignment = getKnownAlignment(Ptr, *DL);
+  }
+
+  if (!ValTy->isIntegerTy())
+    Val = Builder.CreateBitCast(Val, ValTy);
+  LI->replaceAllUsesWith(Val);
+  LI->eraseFromParent();
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -8750,6 +8884,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (optimizeLoadMisalign(LI))
+      return true;
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
     unsigned AS = LI->getPointerAddressSpace();
@@ -8760,6 +8896,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
     if (splitMergedValStore(*SI, *DL, *TLI))
       return true;
+    if (optimizeStoreMisalign(SI))
+      return true;
     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     unsigned AS = SI->getPointerAddressSpace();
     return optimizeMemoryInst(I, SI->getOperand(1),
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index d95f528442efd..a34aacbea5668 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1451,63 +1451,41 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspac
 define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
 ; CI-LABEL: read2_v2i32_align1_odd_offset:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_read_u8 v1, v0 offset:70
-; CI-NEXT:    ds_read_u8 v2, v0 offset:72
-; CI-NEXT:    ds_read_u8 v3, v0 offset:71
-; CI-NEXT:    ds_read_u8 v4, v0 offset:69
-; CI-NEXT:    ds_read_u8 v5, v0 offset:68
-; CI-NEXT:    s_waitcnt lgkmcnt(4)
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT:    s_waitcnt lgkmcnt(3)
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT:    s_waitcnt lgkmcnt(2)
-; CI-NEXT:    v_or_b32_e32 v2, v2, v3
-; CI-NEXT:    s_waitcnt lgkmcnt(1)
-; CI-NEXT:    v_or_b32_e32 v1, v1, v4
-; CI-NEXT:    ds_read_u8 v4, v0 offset:66
-; CI-NEXT:    ds_read_u8 v6, v0 offset:67
-; CI-NEXT:    ds_read_u8 v0, v0 offset:65
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    ds_read_u8 v2, v1 offset:65
+; CI-NEXT:    ds_read_u16 v3, v1 offset:66
+; CI-NEXT:    ds_read_b32 v0, v1 offset:68
+; CI-NEXT:    ds_read_u8 v4, v1 offset:72
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT:    v_or_b32_e32 v1, v2, v1
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
-; CI-NEXT:    v_or_b32_e32 v0, v2, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; CI-NEXT:    v_or_b32_e32 v2, v2, v6
-; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 24
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
+; CI-NEXT:    v_or_b32_e32 v0, v0, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_or_b32_e32 v0, v2, v0
+; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-ALIGNED:       ; %bb.0: ; %entry
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:65
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:66
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:67
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:68
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v2 offset:70
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v2 offset:69
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v2 offset:72
-; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v2 offset:71
+; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-ALIGNED-NEXT:    ds_read_u16 v2, v1 offset:66
+; GFX9-ALIGNED-NEXT:    ds_read_b32 v0, v1 offset:68
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v1 offset:65
+; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v1 offset:72
 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v7
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
-; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v6, v1
-; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v2
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b64 v[2:3], 24, v[0:1]
+; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v0, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v2, v0, v2
+; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 24, v5
+; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v3, v3, v0
+; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
 ; GFX9-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 41e3d5f10f6dd..31c6739ce5559 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1009,15 +1009,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_write_b8 v1, v0 offset:65
-; CI-NEXT:    v_mov_b32_e32 v0, 1
-; CI-NEXT:    ds_write_b8 v1, v0 offset:70
-; CI-NEXT:    v_mov_b32_e32 v0, 0xc8
-; CI-NEXT:    ds_write_b8 v1, v0 offset:69
-; CI-NEXT:    ds_write_b8 v1, v1 offset:68
-; CI-NEXT:    ds_write_b8 v1, v1 offset:67
-; CI-NEXT:    ds_write_b8 v1, v1 offset:66
+; CI-NEXT:    ds_write_b16 v1, v1 offset:66
+; CI-NEXT:    v_mov_b32_e32 v0, 0x1c800
+; CI-NEXT:    ds_write_b32 v1, v0 offset:68
 ; CI-NEXT:    ds_write_b8 v1, v1 offset:72
-; CI-NEXT:    ds_write_b8 v1, v1 offset:71
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset:
@@ -1025,15 +1020,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:65
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 1
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:70
-; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0xc8
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v0 offset:69
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:68
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:67
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:66
+; GFX9-ALIGNED-NEXT:    ds_write_b16 v1, v1 offset:66
+; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x1c800
+; GFX9-ALIGNED-NEXT:    ds_write_b32 v1, v0 offset:68
 ; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:72
-; GFX9-ALIGNED-NEXT:    ds_write_b8 v1, v1 offset:71
 ; GFX9-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
index 3d783d143192d..278df39938342 100644
--- a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
+++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll
@@ -12,10 +12,15 @@ define void @ret_void_args_i8_i32(i8 %a, i32 %b) {
   ; CHECK:      sts     4, r24
   store volatile i8 %a, ptr inttoptr (i64 4 to ptr)
 
-  ; CHECK-NEXT: sts     8, r23
-  ; CHECK-NEXT: sts     7, r22
-  ; CHECK-NEXT: sts     6, r21
   ; CHECK-NEXT: sts     5, r20
+
+  ; redundant instructions, should be deleted
+  ; CHECK-NEXT: mov     r24, r21
+  ; CHECK-NEXT: mov     r25, r22
+
+  ; CHECK-NEXT: sts     7, r25
+  ; CHECK-NEXT: sts     6, r24
+  ; CHECK-NEXT: sts     8, r23
   store volatile i32 %b, ptr inttoptr (i64 5 to ptr)
   ret void
 }
diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
index 07d58841dd802..cd1817cd245be 100644
--- a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
+++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll
@@ -7,10 +7,15 @@ start:
   %0 = extractvalue { i8, i32 } %a, 0
   store volatile i8 %0, ptr inttoptr (i64 4 to ptr)
 
-  ; CHECK-NEXT: sts     8, r24
-  ; CHECK-NEXT: sts     7, r23
-  ; CHECK-NEXT: sts     6, r22
   ; CHECK-NEXT: sts     5, r21
+
+  ; redundant instructions, should be deleted
+  ; CHECK-NEXT:	mov     r18, r22
+  ; CHECK-NEXT:	mov     r19, r23
+
+  ; CHECK-NEXT: sts     7, r19
+  ; CHECK-NEXT: sts     6, r18
+  ; CHECK-NEXT: sts     8, r24
   %1 = extractvalue { i8, i32 } %a, 1
   store volatile i32 %1, ptr inttoptr (i64 5 to ptr)
   ret void
@@ -62,17 +67,22 @@ start:
   %0 = extractvalue { i8, i32 } %a, 0
   store volatile i8 %0, ptr inttoptr (i64 4 to ptr)
 
-  ; CHECK-NEXT: sts     8, r24
-  ; CHECK-NEXT: sts     7, r23
-  ; CHECK-NEXT: sts     6, r22
   ; CHECK-NEXT: sts     5, r21
+
+  ; redundant instructions, should be deleted
+  ; CHECK-NEXT:	mov     r20, r22
+  ; CHECK-NEXT:	mov     r21, r23
+
+  ; CHECK-NEXT: sts     7, r21
+  ; CHECK-NEXT: sts     6, r20
+  ; CHECK-NEXT: sts     8, r24
   %1 = extractvalue { i8, i32 } %a, 1
   store volatile i32 %1, ptr inttoptr (i64 5 to ptr)
 
-  ; CHECK-NEXT:      sts     9, r17
-  ; CHECK-NEXT:      sts     8, r16
-  ; CHECK-NEXT:      sts     7, r15
-  ; CHECK-NEXT:      sts     6, r14
+  ; CHECK-NEXT: sts     9, r17
+  ; CHECK-NEXT: sts     8, r16
+  ; CHECK-NEXT: sts     7, r15
+  ; CHECK-NEXT: sts     6, r14
   %2 = extractvalue { i32, i8 } %b, 0
   store volatile i32 %2, ptr inttoptr (i64 6 to ptr)
 
diff --git a/llvm/test/CodeGen/XCore/unaligned_load.ll b/llvm/test/CodeGen/XCore/unaligned_load.ll
index ee9aea4689503..ce27c1ac49801 100644
--- a/llvm/test/CodeGen/XCore/unaligned_load.ll
+++ b/llvm/test/CodeGen/XCore/unaligned_load.ll
@@ -24,8 +24,11 @@ entry:
 
 ; Constant offset from word aligned base.
 ; CHECK-LABEL: align3:
-; CHECK: ldw {{r[0-9]+}}, dp
-; CHECK: ldw {{r[0-9]+}}, dp
+; CHECK: ldaw {{r[0-9]+}}, dp
+; CHECK: ld8u
+; CHECK: ld16s
+; CHECK: or
+; CHECK: ld8u
 ; CHECK: or
 define i32 @align3() nounwind {
 entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
new file mode 100644
index 0000000000000..a4d28aac256fc
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 %s \
+; RUN:   | FileCheck --check-prefixes=LE %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 -data-layout="E-m:e-p:32:32-i64:64-n32-S128" %s \
+; RUN:   | FileCheck --check-prefixes=BE %s
+
+
+define void @foo-i32(ptr align 4 %p, i32 %v) {
+; LE-LABEL: @foo-i32(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT:    [[TMP0:%.*]] = trunc i32 [[V:%.*]] to i8
+; LE-NEXT:    store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = lshr i32 [[V]], 8
+; LE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP1]] to i24
+; LE-NEXT:    store i24 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @foo-i32(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT:    [[TMP0:%.*]] = lshr i32 [[V:%.*]], 24
+; BE-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+; BE-NEXT:    store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT:    [[TMP2:%.*]] = shl i32 [[V]], 8
+; BE-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP2]], 8
+; BE-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
+; BE-NEXT:    store i24 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT:    ret void
+;
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+  store i32 %v, ptr %len, align 1
+  ret void
+}
+define void @foo-i64(ptr align 4 %p, i64 %v) {
+; LE-LABEL: @foo-i64(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT:    [[TMP0:%.*]] = trunc i64 [[V:%.*]] to i8
+; LE-NEXT:    store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT:    [[TMP1:%.*]] = lshr i64 [[V]], 8
+; LE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT:    store i32 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP1]], 32
+; LE-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 4
+; LE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP4]] to i24
+; LE-NEXT:    store i24 [[TMP6]], ptr [[TMP5]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @foo-i64(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT:    [[TMP0:%.*]] = lshr i64 [[V:%.*]], 56
+; BE-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8
+; BE-NEXT:    store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT:    [[TMP2:%.*]] = shl i64 [[V]], 8
+; BE-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
+; BE-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; BE-NEXT:    store i32 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP2]], 32
+; BE-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP3]], i32 4
+; BE-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP6]], 40
+; BE-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i24
+; BE-NEXT:    store i24 [[TMP9]], ptr [[TMP7]], align 4
+; BE-NEXT:    ret void
+;
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+  store i64 %v, ptr %len, align 1
+  ret void
+}
+
+define void @foo-float(ptr align 4 %p, float %v) {
+; LE-LABEL: @foo-float(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT:    [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32
+; LE-NEXT:    [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+; LE-NEXT:    store i8 [[TMP1]], ptr [[LEN]], align 1
+; LE-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP0]], 8
+; LE-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP2]] to i24
+; LE-NEXT:    store i24 [[TMP4]], ptr [[TMP3]], align 4
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @foo-float(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT:    [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32
+; BE-NEXT:    [[TMP1:%.*]...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/145168

