[llvm] 031f33c - [RISCV] Add tests for legalization of <N x i128> and <N x i256> shuffles
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 10:48:53 PST 2025
Author: Philip Reames
Date: 2025-01-09T10:48:45-08:00
New Revision: 031f33cca3c953dd09ac439fdb503fb3cb36af5e
URL: https://github.com/llvm/llvm-project/commit/031f33cca3c953dd09ac439fdb503fb3cb36af5e
DIFF: https://github.com/llvm/llvm-project/commit/031f33cca3c953dd09ac439fdb503fb3cb36af5e.diff
LOG: [RISCV] Add tests for legalization of <N x i128> and <N x i256> shuffles
Added:
Modified:
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
Removed:
################################################################################
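For context: the RVV V extension only supports element widths up to 64 bits (SEW of 8/16/32/64), so shuffles of <4 x i128> and <4 x i256> cannot stay in vector registers and are legalized by scalarizing into element-wise loads and stores, which is what the RV32/RV64 check lines below capture. The exact-vlen file additionally pins vscale_range(2,2), and the scalarized output is unchanged. The RV32/RV64 prefixes correspond to llc RUN lines of roughly this shape (a sketch; the exact flags in the test files may differ):

    llc -mtriple=riscv32 -mattr=+v < fixed-vectors-int-shuffles.ll   # RV32 check lines
    llc -mtriple=riscv64 -mattr=+v < fixed-vectors-int-shuffles.ll   # RV64 check lines
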
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 0bd8466669dc80..8915603471ec7f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1141,3 +1141,237 @@ define <16 x i32> @shuffle_disjoint_lanes_one_splat(i32 %v, <16 x i32> %w) {
%out = shufflevector <16 x i32> %splat, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
ret <16 x i32> %out
}
+
+define <4 x i128> @shuffle_i128(<4 x i128> %a) {
+; RV32-LABEL: shuffle_i128:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lw a3, 4(a1)
+; RV32-NEXT: lw a4, 8(a1)
+; RV32-NEXT: lw a5, 12(a1)
+; RV32-NEXT: lw a6, 48(a1)
+; RV32-NEXT: lw a7, 52(a1)
+; RV32-NEXT: lw t0, 56(a1)
+; RV32-NEXT: lw t1, 60(a1)
+; RV32-NEXT: lw t2, 32(a1)
+; RV32-NEXT: lw t3, 36(a1)
+; RV32-NEXT: lw t4, 40(a1)
+; RV32-NEXT: lw a1, 44(a1)
+; RV32-NEXT: sw t2, 48(a0)
+; RV32-NEXT: sw t3, 52(a0)
+; RV32-NEXT: sw t4, 56(a0)
+; RV32-NEXT: sw a1, 60(a0)
+; RV32-NEXT: sw a6, 32(a0)
+; RV32-NEXT: sw a7, 36(a0)
+; RV32-NEXT: sw t0, 40(a0)
+; RV32-NEXT: sw t1, 44(a0)
+; RV32-NEXT: sw a2, 16(a0)
+; RV32-NEXT: sw a3, 20(a0)
+; RV32-NEXT: sw a4, 24(a0)
+; RV32-NEXT: sw a5, 28(a0)
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a3, 4(a0)
+; RV32-NEXT: sw a4, 8(a0)
+; RV32-NEXT: sw a5, 12(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a2, 48(a1)
+; RV64-NEXT: ld a3, 56(a1)
+; RV64-NEXT: ld a4, 0(a1)
+; RV64-NEXT: ld a5, 8(a1)
+; RV64-NEXT: ld a6, 32(a1)
+; RV64-NEXT: ld a1, 40(a1)
+; RV64-NEXT: sd a2, 32(a0)
+; RV64-NEXT: sd a3, 40(a0)
+; RV64-NEXT: sd a6, 48(a0)
+; RV64-NEXT: sd a1, 56(a0)
+; RV64-NEXT: sd a4, 0(a0)
+; RV64-NEXT: sd a5, 8(a0)
+; RV64-NEXT: sd a4, 16(a0)
+; RV64-NEXT: sd a5, 24(a0)
+; RV64-NEXT: ret
+ %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ ret <4 x i128> %res
+}
+
+define void @shuffle_i128_ldst(ptr %p) {
+; RV32-LABEL: shuffle_i128_ldst:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a1, 48(a0)
+; RV32-NEXT: lw a2, 52(a0)
+; RV32-NEXT: lw a3, 56(a0)
+; RV32-NEXT: lw a4, 60(a0)
+; RV32-NEXT: lw a5, 0(a0)
+; RV32-NEXT: lw a6, 4(a0)
+; RV32-NEXT: lw a7, 8(a0)
+; RV32-NEXT: lw t0, 12(a0)
+; RV32-NEXT: lw t1, 32(a0)
+; RV32-NEXT: lw t2, 36(a0)
+; RV32-NEXT: lw t3, 40(a0)
+; RV32-NEXT: lw t4, 44(a0)
+; RV32-NEXT: sw t1, 48(a0)
+; RV32-NEXT: sw t2, 52(a0)
+; RV32-NEXT: sw t3, 56(a0)
+; RV32-NEXT: sw t4, 60(a0)
+; RV32-NEXT: sw a5, 16(a0)
+; RV32-NEXT: sw a6, 20(a0)
+; RV32-NEXT: sw a7, 24(a0)
+; RV32-NEXT: sw t0, 28(a0)
+; RV32-NEXT: sw a1, 32(a0)
+; RV32-NEXT: sw a2, 36(a0)
+; RV32-NEXT: sw a3, 40(a0)
+; RV32-NEXT: sw a4, 44(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i128_ldst:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a1, 0(a0)
+; RV64-NEXT: ld a2, 8(a0)
+; RV64-NEXT: ld a3, 32(a0)
+; RV64-NEXT: ld a4, 40(a0)
+; RV64-NEXT: ld a5, 48(a0)
+; RV64-NEXT: ld a6, 56(a0)
+; RV64-NEXT: sd a3, 48(a0)
+; RV64-NEXT: sd a4, 56(a0)
+; RV64-NEXT: sd a1, 16(a0)
+; RV64-NEXT: sd a2, 24(a0)
+; RV64-NEXT: sd a5, 32(a0)
+; RV64-NEXT: sd a6, 40(a0)
+; RV64-NEXT: ret
+ %a = load <4 x i128>, ptr %p
+ %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i128> %res, ptr %p
+ ret void
+}
+
+define void @shuffle_i256_ldst(ptr %p) {
+; RV32-LABEL: shuffle_i256_ldst:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
+; RV32-NEXT: .cfi_offset s4, -20
+; RV32-NEXT: .cfi_offset s5, -24
+; RV32-NEXT: .cfi_offset s6, -28
+; RV32-NEXT: .cfi_offset s7, -32
+; RV32-NEXT: .cfi_offset s8, -36
+; RV32-NEXT: .cfi_offset s9, -40
+; RV32-NEXT: lw a1, 0(a0)
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: lw a5, 16(a0)
+; RV32-NEXT: lw a6, 20(a0)
+; RV32-NEXT: lw a7, 24(a0)
+; RV32-NEXT: lw t0, 28(a0)
+; RV32-NEXT: lw t1, 96(a0)
+; RV32-NEXT: lw t2, 100(a0)
+; RV32-NEXT: lw t3, 104(a0)
+; RV32-NEXT: lw t4, 108(a0)
+; RV32-NEXT: lw t5, 112(a0)
+; RV32-NEXT: lw t6, 116(a0)
+; RV32-NEXT: lw s0, 120(a0)
+; RV32-NEXT: lw s1, 124(a0)
+; RV32-NEXT: lw s2, 64(a0)
+; RV32-NEXT: lw s3, 68(a0)
+; RV32-NEXT: lw s4, 72(a0)
+; RV32-NEXT: lw s5, 76(a0)
+; RV32-NEXT: lw s6, 80(a0)
+; RV32-NEXT: lw s7, 84(a0)
+; RV32-NEXT: lw s8, 88(a0)
+; RV32-NEXT: lw s9, 92(a0)
+; RV32-NEXT: sw s6, 112(a0)
+; RV32-NEXT: sw s7, 116(a0)
+; RV32-NEXT: sw s8, 120(a0)
+; RV32-NEXT: sw s9, 124(a0)
+; RV32-NEXT: sw s2, 96(a0)
+; RV32-NEXT: sw s3, 100(a0)
+; RV32-NEXT: sw s4, 104(a0)
+; RV32-NEXT: sw s5, 108(a0)
+; RV32-NEXT: sw t5, 80(a0)
+; RV32-NEXT: sw t6, 84(a0)
+; RV32-NEXT: sw s0, 88(a0)
+; RV32-NEXT: sw s1, 92(a0)
+; RV32-NEXT: sw t1, 64(a0)
+; RV32-NEXT: sw t2, 68(a0)
+; RV32-NEXT: sw t3, 72(a0)
+; RV32-NEXT: sw t4, 76(a0)
+; RV32-NEXT: sw a5, 48(a0)
+; RV32-NEXT: sw a6, 52(a0)
+; RV32-NEXT: sw a7, 56(a0)
+; RV32-NEXT: sw t0, 60(a0)
+; RV32-NEXT: sw a1, 32(a0)
+; RV32-NEXT: sw a2, 36(a0)
+; RV32-NEXT: sw a3, 40(a0)
+; RV32-NEXT: sw a4, 44(a0)
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: .cfi_restore s1
+; RV32-NEXT: .cfi_restore s2
+; RV32-NEXT: .cfi_restore s3
+; RV32-NEXT: .cfi_restore s4
+; RV32-NEXT: .cfi_restore s5
+; RV32-NEXT: .cfi_restore s6
+; RV32-NEXT: .cfi_restore s7
+; RV32-NEXT: .cfi_restore s8
+; RV32-NEXT: .cfi_restore s9
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i256_ldst:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a1, 96(a0)
+; RV64-NEXT: ld a2, 104(a0)
+; RV64-NEXT: ld a3, 112(a0)
+; RV64-NEXT: ld a4, 120(a0)
+; RV64-NEXT: ld a5, 0(a0)
+; RV64-NEXT: ld a6, 8(a0)
+; RV64-NEXT: ld a7, 16(a0)
+; RV64-NEXT: ld t0, 24(a0)
+; RV64-NEXT: ld t1, 64(a0)
+; RV64-NEXT: ld t2, 72(a0)
+; RV64-NEXT: ld t3, 80(a0)
+; RV64-NEXT: ld t4, 88(a0)
+; RV64-NEXT: sd t1, 96(a0)
+; RV64-NEXT: sd t2, 104(a0)
+; RV64-NEXT: sd t3, 112(a0)
+; RV64-NEXT: sd t4, 120(a0)
+; RV64-NEXT: sd a5, 32(a0)
+; RV64-NEXT: sd a6, 40(a0)
+; RV64-NEXT: sd a7, 48(a0)
+; RV64-NEXT: sd t0, 56(a0)
+; RV64-NEXT: sd a1, 64(a0)
+; RV64-NEXT: sd a2, 72(a0)
+; RV64-NEXT: sd a3, 80(a0)
+; RV64-NEXT: sd a4, 88(a0)
+; RV64-NEXT: ret
+ %a = load <4 x i256>, ptr %p
+ %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i256> %res, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
index bb05eb5368ae92..4603c0d24f5d79 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll
@@ -400,3 +400,183 @@ entry:
%conv199 = sext i32 %4 to i64
ret i64 %conv199
}
+
+define void @shuffle_i128_ldst(ptr %p) vscale_range(2,2) {
+; RV32-LABEL: shuffle_i128_ldst:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a1, 48(a0)
+; RV32-NEXT: lw a2, 52(a0)
+; RV32-NEXT: lw a3, 56(a0)
+; RV32-NEXT: lw a4, 60(a0)
+; RV32-NEXT: lw a5, 0(a0)
+; RV32-NEXT: lw a6, 4(a0)
+; RV32-NEXT: lw a7, 8(a0)
+; RV32-NEXT: lw t0, 12(a0)
+; RV32-NEXT: lw t1, 32(a0)
+; RV32-NEXT: lw t2, 36(a0)
+; RV32-NEXT: lw t3, 40(a0)
+; RV32-NEXT: lw t4, 44(a0)
+; RV32-NEXT: sw t1, 48(a0)
+; RV32-NEXT: sw t2, 52(a0)
+; RV32-NEXT: sw t3, 56(a0)
+; RV32-NEXT: sw t4, 60(a0)
+; RV32-NEXT: sw a5, 16(a0)
+; RV32-NEXT: sw a6, 20(a0)
+; RV32-NEXT: sw a7, 24(a0)
+; RV32-NEXT: sw t0, 28(a0)
+; RV32-NEXT: sw a1, 32(a0)
+; RV32-NEXT: sw a2, 36(a0)
+; RV32-NEXT: sw a3, 40(a0)
+; RV32-NEXT: sw a4, 44(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i128_ldst:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a1, 0(a0)
+; RV64-NEXT: ld a2, 8(a0)
+; RV64-NEXT: ld a3, 32(a0)
+; RV64-NEXT: ld a4, 40(a0)
+; RV64-NEXT: ld a5, 48(a0)
+; RV64-NEXT: ld a6, 56(a0)
+; RV64-NEXT: sd a3, 48(a0)
+; RV64-NEXT: sd a4, 56(a0)
+; RV64-NEXT: sd a1, 16(a0)
+; RV64-NEXT: sd a2, 24(a0)
+; RV64-NEXT: sd a5, 32(a0)
+; RV64-NEXT: sd a6, 40(a0)
+; RV64-NEXT: ret
+ %a = load <4 x i128>, ptr %p
+ %res = shufflevector <4 x i128> %a, <4 x i128> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i128> %res, ptr %p
+ ret void
+}
+
+define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) {
+; RV32-LABEL: shuffle_i256_ldst:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
+; RV32-NEXT: .cfi_offset s3, -16
+; RV32-NEXT: .cfi_offset s4, -20
+; RV32-NEXT: .cfi_offset s5, -24
+; RV32-NEXT: .cfi_offset s6, -28
+; RV32-NEXT: .cfi_offset s7, -32
+; RV32-NEXT: .cfi_offset s8, -36
+; RV32-NEXT: .cfi_offset s9, -40
+; RV32-NEXT: lw a1, 0(a0)
+; RV32-NEXT: lw a2, 4(a0)
+; RV32-NEXT: lw a3, 8(a0)
+; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: lw a5, 16(a0)
+; RV32-NEXT: lw a6, 20(a0)
+; RV32-NEXT: lw a7, 24(a0)
+; RV32-NEXT: lw t0, 28(a0)
+; RV32-NEXT: lw t1, 96(a0)
+; RV32-NEXT: lw t2, 100(a0)
+; RV32-NEXT: lw t3, 104(a0)
+; RV32-NEXT: lw t4, 108(a0)
+; RV32-NEXT: lw t5, 112(a0)
+; RV32-NEXT: lw t6, 116(a0)
+; RV32-NEXT: lw s0, 120(a0)
+; RV32-NEXT: lw s1, 124(a0)
+; RV32-NEXT: lw s2, 64(a0)
+; RV32-NEXT: lw s3, 68(a0)
+; RV32-NEXT: lw s4, 72(a0)
+; RV32-NEXT: lw s5, 76(a0)
+; RV32-NEXT: lw s6, 80(a0)
+; RV32-NEXT: lw s7, 84(a0)
+; RV32-NEXT: lw s8, 88(a0)
+; RV32-NEXT: lw s9, 92(a0)
+; RV32-NEXT: sw s6, 112(a0)
+; RV32-NEXT: sw s7, 116(a0)
+; RV32-NEXT: sw s8, 120(a0)
+; RV32-NEXT: sw s9, 124(a0)
+; RV32-NEXT: sw s2, 96(a0)
+; RV32-NEXT: sw s3, 100(a0)
+; RV32-NEXT: sw s4, 104(a0)
+; RV32-NEXT: sw s5, 108(a0)
+; RV32-NEXT: sw t5, 80(a0)
+; RV32-NEXT: sw t6, 84(a0)
+; RV32-NEXT: sw s0, 88(a0)
+; RV32-NEXT: sw s1, 92(a0)
+; RV32-NEXT: sw t1, 64(a0)
+; RV32-NEXT: sw t2, 68(a0)
+; RV32-NEXT: sw t3, 72(a0)
+; RV32-NEXT: sw t4, 76(a0)
+; RV32-NEXT: sw a5, 48(a0)
+; RV32-NEXT: sw a6, 52(a0)
+; RV32-NEXT: sw a7, 56(a0)
+; RV32-NEXT: sw t0, 60(a0)
+; RV32-NEXT: sw a1, 32(a0)
+; RV32-NEXT: sw a2, 36(a0)
+; RV32-NEXT: sw a3, 40(a0)
+; RV32-NEXT: sw a4, 44(a0)
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: .cfi_restore s1
+; RV32-NEXT: .cfi_restore s2
+; RV32-NEXT: .cfi_restore s3
+; RV32-NEXT: .cfi_restore s4
+; RV32-NEXT: .cfi_restore s5
+; RV32-NEXT: .cfi_restore s6
+; RV32-NEXT: .cfi_restore s7
+; RV32-NEXT: .cfi_restore s8
+; RV32-NEXT: .cfi_restore s9
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shuffle_i256_ldst:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a1, 96(a0)
+; RV64-NEXT: ld a2, 104(a0)
+; RV64-NEXT: ld a3, 112(a0)
+; RV64-NEXT: ld a4, 120(a0)
+; RV64-NEXT: ld a5, 0(a0)
+; RV64-NEXT: ld a6, 8(a0)
+; RV64-NEXT: ld a7, 16(a0)
+; RV64-NEXT: ld t0, 24(a0)
+; RV64-NEXT: ld t1, 64(a0)
+; RV64-NEXT: ld t2, 72(a0)
+; RV64-NEXT: ld t3, 80(a0)
+; RV64-NEXT: ld t4, 88(a0)
+; RV64-NEXT: sd t1, 96(a0)
+; RV64-NEXT: sd t2, 104(a0)
+; RV64-NEXT: sd t3, 112(a0)
+; RV64-NEXT: sd t4, 120(a0)
+; RV64-NEXT: sd a5, 32(a0)
+; RV64-NEXT: sd a6, 40(a0)
+; RV64-NEXT: sd a7, 48(a0)
+; RV64-NEXT: sd t0, 56(a0)
+; RV64-NEXT: sd a1, 64(a0)
+; RV64-NEXT: sd a2, 72(a0)
+; RV64-NEXT: sd a3, 80(a0)
+; RV64-NEXT: sd a4, 88(a0)
+; RV64-NEXT: ret
+ %a = load <4 x i256>, ptr %p
+ %res = shufflevector <4 x i256> %a, <4 x i256> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
+ store <4 x i256> %res, ptr %p
+ ret void
+}
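
These check lines are autogenerated, so the usual way to refresh them after a lowering change is update_llc_test_checks.py followed by a lit run, along these lines (a sketch; the build/ directory name is an assumption about the local setup):

    # Regenerate the CHECK lines in both modified tests.
    llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll \
        llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll

    # Re-run the tests through lit.
    build/bin/llvm-lit -v \
        llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll \
        llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll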