[llvm] [IA][RISCV] Recognize deinterleaved loads that could lower to strided segmented loads (PR #151612)
Min-Yih Hsu via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 4 13:11:49 PDT 2025
https://github.com/mshockwave updated https://github.com/llvm/llvm-project/pull/151612
>From c7220147cee093b95e138c377b4da1c2d724e485 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 31 Jul 2025 15:04:32 -0700
Subject: [PATCH 1/3] Pre-commit test
---
.../rvv/fixed-vectors-interleaved-access.ll | 680 +++++++++++++++++-
1 file changed, 644 insertions(+), 36 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 6eb0b693b5546..2df26b2f78d5b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -332,6 +332,174 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr
ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
+ ; mask = 1111, skip the last field.
+; RV32-LABEL: vpload_factor3_skip_fields:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 1755
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 73
+; RV32-NEXT: vmv.v.i v10, 8
+; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0), v0.t
+; RV32-NEXT: li a0, 36
+; RV32-NEXT: vmv.s.x v11, a1
+; RV32-NEXT: lui a1, %hi(.LCPI17_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vcompress.vm v8, v12, v11
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v12, 8
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.i v0, 2
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vslidedown.vi v14, v12, 1
+; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t
+; RV32-NEXT: vle16.v v9, (a1)
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT: vrgatherei16.vv v10, v12, v9
+; RV32-NEXT: vmv1r.v v9, v14
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpload_factor3_skip_fields:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 1755
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 73
+; RV64-NEXT: vmv.v.i v10, 8
+; RV64-NEXT: vmv.s.x v11, a1
+; RV64-NEXT: li a1, 36
+; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0), v0.t
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: addi a0, a0, 5
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v8, v12, v11
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v12, 8
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.i v0, 2
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vslidedown.vi v14, v12, 1
+; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v12, v9
+; RV64-NEXT: vmv1r.v v9, v14
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, i32 12)
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
+ ; mask = 0101, skip the last field.
+; RV32-LABEL: vpload_factor3_mask_skip_fields:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 1560
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 73
+; RV32-NEXT: vmv.v.i v10, 8
+; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0), v0.t
+; RV32-NEXT: li a0, 36
+; RV32-NEXT: vmv.s.x v11, a1
+; RV32-NEXT: lui a1, %hi(.LCPI18_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vcompress.vm v8, v12, v11
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v12, 8
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.i v0, 2
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vslidedown.vi v14, v12, 1
+; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t
+; RV32-NEXT: vle16.v v9, (a1)
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT: vrgatherei16.vv v10, v12, v9
+; RV32-NEXT: vmv1r.v v9, v14
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vpload_factor3_mask_skip_fields:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 1560
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 73
+; RV64-NEXT: vmv.v.i v10, 8
+; RV64-NEXT: vmv.s.x v11, a1
+; RV64-NEXT: li a1, 36
+; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0), v0.t
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: addi a0, a0, 5
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v8, v12, v11
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v12, 8
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vmv.v.i v0, 2
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vslidedown.vi v14, v12, 1
+; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v12, v9
+; RV64-NEXT: vmv1r.v v9, v14
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, i32 12)
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
; CHECK-LABEL: vpload_factor4:
; CHECK: # %bb.0:
@@ -479,8 +647,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: li a2, 32
; RV32-NEXT: lui a3, 12
; RV32-NEXT: lui a6, 12291
-; RV32-NEXT: lui a7, %hi(.LCPI23_0)
-; RV32-NEXT: addi a7, a7, %lo(.LCPI23_0)
+; RV32-NEXT: lui a7, %hi(.LCPI25_0)
+; RV32-NEXT: addi a7, a7, %lo(.LCPI25_0)
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vle32.v v24, (a5)
; RV32-NEXT: vmv.s.x v0, a3
@@ -565,12 +733,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: addi a1, a1, 16
; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
; RV32-NEXT: lui a7, 49164
-; RV32-NEXT: lui a1, %hi(.LCPI23_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_1)
+; RV32-NEXT: lui a1, %hi(.LCPI25_1)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_1)
; RV32-NEXT: lui t2, 3
; RV32-NEXT: lui t1, 196656
-; RV32-NEXT: lui a4, %hi(.LCPI23_3)
-; RV32-NEXT: addi a4, a4, %lo(.LCPI23_3)
+; RV32-NEXT: lui a4, %hi(.LCPI25_3)
+; RV32-NEXT: addi a4, a4, %lo(.LCPI25_3)
; RV32-NEXT: lui t0, 786624
; RV32-NEXT: li a5, 48
; RV32-NEXT: lui a6, 768
@@ -749,8 +917,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v24, v8, v2
-; RV32-NEXT: lui a1, %hi(.LCPI23_2)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_2)
+; RV32-NEXT: lui a1, %hi(.LCPI25_2)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_2)
; RV32-NEXT: lui a3, 3073
; RV32-NEXT: addi a3, a3, -1024
; RV32-NEXT: vmv.s.x v0, a3
@@ -814,16 +982,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vrgatherei16.vv v28, v8, v3
; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma
; RV32-NEXT: vmv.v.v v28, v24
-; RV32-NEXT: lui a1, %hi(.LCPI23_4)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_4)
-; RV32-NEXT: lui a2, %hi(.LCPI23_5)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI23_5)
+; RV32-NEXT: lui a1, %hi(.LCPI25_4)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_4)
+; RV32-NEXT: lui a2, %hi(.LCPI25_5)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI25_5)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v24, (a2)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v8, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI23_7)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_7)
+; RV32-NEXT: lui a1, %hi(.LCPI25_7)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_7)
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle16.v v10, (a1)
; RV32-NEXT: csrr a1, vlenb
@@ -851,14 +1019,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v16, v0, v10
-; RV32-NEXT: lui a1, %hi(.LCPI23_6)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_6)
-; RV32-NEXT: lui a2, %hi(.LCPI23_8)
-; RV32-NEXT: addi a2, a2, %lo(.LCPI23_8)
+; RV32-NEXT: lui a1, %hi(.LCPI25_6)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_6)
+; RV32-NEXT: lui a2, %hi(.LCPI25_8)
+; RV32-NEXT: addi a2, a2, %lo(.LCPI25_8)
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vle16.v v4, (a1)
-; RV32-NEXT: lui a1, %hi(.LCPI23_9)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI23_9)
+; RV32-NEXT: lui a1, %hi(.LCPI25_9)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI25_9)
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT: vle16.v v6, (a1)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
@@ -945,8 +1113,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: li a4, 128
; RV64-NEXT: lui a1, 1
; RV64-NEXT: vle64.v v8, (a3)
-; RV64-NEXT: lui a3, %hi(.LCPI23_0)
-; RV64-NEXT: addi a3, a3, %lo(.LCPI23_0)
+; RV64-NEXT: lui a3, %hi(.LCPI25_0)
+; RV64-NEXT: addi a3, a3, %lo(.LCPI25_0)
; RV64-NEXT: vmv.s.x v0, a4
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: li a5, 61
@@ -1134,8 +1302,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t
-; RV64-NEXT: lui a2, %hi(.LCPI23_1)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI23_1)
+; RV64-NEXT: lui a2, %hi(.LCPI25_1)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI25_1)
; RV64-NEXT: li a3, 192
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v6, (a2)
@@ -1169,8 +1337,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vrgatherei16.vv v24, v16, v6
; RV64-NEXT: addi a2, sp, 16
; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
-; RV64-NEXT: lui a2, %hi(.LCPI23_2)
-; RV64-NEXT: addi a2, a2, %lo(.LCPI23_2)
+; RV64-NEXT: lui a2, %hi(.LCPI25_2)
+; RV64-NEXT: addi a2, a2, %lo(.LCPI25_2)
; RV64-NEXT: li a3, 1040
; RV64-NEXT: vmv.s.x v0, a3
; RV64-NEXT: addi a1, a1, -2016
@@ -1254,12 +1422,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 16
; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill
-; RV64-NEXT: lui a1, %hi(.LCPI23_3)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI23_3)
+; RV64-NEXT: lui a1, %hi(.LCPI25_3)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI25_3)
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v20, (a1)
-; RV64-NEXT: lui a1, %hi(.LCPI23_4)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI23_4)
+; RV64-NEXT: lui a1, %hi(.LCPI25_4)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI25_4)
; RV64-NEXT: vle16.v v8, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 77
@@ -1310,8 +1478,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v0, v16, v8
-; RV64-NEXT: lui a1, %hi(.LCPI23_5)
-; RV64-NEXT: addi a1, a1, %lo(.LCPI23_5)
+; RV64-NEXT: lui a1, %hi(.LCPI25_5)
+; RV64-NEXT: addi a1, a1, %lo(.LCPI25_5)
; RV64-NEXT: vle16.v v20, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a2, 61
@@ -1928,8 +2096,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
; RV32-NEXT: vle32.v v12, (a0), v0.t
; RV32-NEXT: li a0, 36
; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI59_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0)
+; RV32-NEXT: lui a1, %hi(.LCPI61_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI61_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v21, (a1)
; RV32-NEXT: vcompress.vm v8, v12, v11
@@ -2004,8 +2172,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: li a0, 146
; RV32-NEXT: vmv.s.x v11, a0
-; RV32-NEXT: lui a0, %hi(.LCPI60_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI60_0)
+; RV32-NEXT: lui a0, %hi(.LCPI62_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI62_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle16.v v20, (a0)
; RV32-NEXT: li a0, 36
@@ -2094,3 +2262,443 @@ define void @maskedstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
tail call void @llvm.masked.store(<8 x i32> %interleaved.vec, ptr %ptr, i32 4, <8 x i1> splat (i1 true))
ret void
}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) {
+; CHECK-LABEL: maskedload_factor3_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 5
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 1,i1 1,i1 0,i1 0,i1 0,i1 1,i1 1,i1 1,i1 0,i1 0,i1 0>, <12 x i32> poison)
+ ; mask = 1010
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) {
+; RV32-LABEL: maskedload_factor3_skip_field:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 1755
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 73
+; RV32-NEXT: vmv.v.i v10, 8
+; RV32-NEXT: vmv.s.x v11, a1
+; RV32-NEXT: li a1, 146
+; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0), v0.t
+; RV32-NEXT: li a0, 36
+; RV32-NEXT: vmv.s.x v20, a1
+; RV32-NEXT: lui a1, %hi(.LCPI66_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vle16.v v21, (a1)
+; RV32-NEXT: vcompress.vm v8, v12, v11
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v12, 8
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT: vcompress.vm v14, v12, v20
+; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT: vrgatherei16.vv v10, v12, v21
+; RV32-NEXT: vmv1r.v v9, v14
+; RV32-NEXT: ret
+;
+; RV64-LABEL: maskedload_factor3_skip_field:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 1755
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 73
+; RV64-NEXT: vmv.v.i v10, 8
+; RV64-NEXT: vmv.s.x v11, a1
+; RV64-NEXT: li a1, 146
+; RV64-NEXT: vmv.s.x v20, a1
+; RV64-NEXT: li a1, 36
+; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0), v0.t
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: addi a0, a0, 5
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v8, v12, v11
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v12, 8
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT: vcompress.vm v14, v12, v20
+; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v12, v9
+; RV64-NEXT: vmv1r.v v9, v14
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0>, <12 x i32> poison)
+ ; mask = 1111, skip last field
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
+; RV32-LABEL: maskedload_factor3_mask_skip_field:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 195
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 73
+; RV32-NEXT: vmv.v.i v10, 8
+; RV32-NEXT: vmv.s.x v11, a1
+; RV32-NEXT: li a1, 146
+; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0), v0.t
+; RV32-NEXT: li a0, 36
+; RV32-NEXT: vmv.s.x v20, a1
+; RV32-NEXT: lui a1, %hi(.LCPI67_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vle16.v v21, (a1)
+; RV32-NEXT: vcompress.vm v8, v12, v11
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v12, 8
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT: vcompress.vm v14, v12, v20
+; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT: vrgatherei16.vv v10, v12, v21
+; RV32-NEXT: vmv1r.v v9, v14
+; RV32-NEXT: ret
+;
+; RV64-LABEL: maskedload_factor3_mask_skip_field:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 195
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 73
+; RV64-NEXT: vmv.v.i v10, 8
+; RV64-NEXT: vmv.s.x v11, a1
+; RV64-NEXT: li a1, 146
+; RV64-NEXT: vmv.s.x v20, a1
+; RV64-NEXT: li a1, 36
+; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0), v0.t
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: addi a0, a0, 5
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v8, v12, v11
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v12, 8
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT: vcompress.vm v14, v12, v20
+; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v12, v9
+; RV64-NEXT: vmv1r.v v9, v14
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 1,i1 0,i1 0,i1 0,i1 0,i1 1,i1 1,i1 0,i1 0,i1 0,i1 0>, <12 x i32> poison)
+ ; mask = 1010, skip the last field
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+; We can only skip trailing fields for now.
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(ptr %ptr) {
+; RV32-LABEL: maskedload_factor3_invalid_skip_field:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 73
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.s.x v11, a1
+; RV32-NEXT: lui a1, 1
+; RV32-NEXT: vmv.v.i v10, 8
+; RV32-NEXT: addi a1, a1, -1171
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 146
+; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV32-NEXT: vle32.v v12, (a0), v0.t
+; RV32-NEXT: li a0, 36
+; RV32-NEXT: vmv.s.x v20, a1
+; RV32-NEXT: lui a1, %hi(.LCPI68_0)
+; RV32-NEXT: addi a1, a1, %lo(.LCPI68_0)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vle16.v v21, (a1)
+; RV32-NEXT: vcompress.vm v8, v12, v11
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v16, v12, 8
+; RV32-NEXT: vmv1r.v v0, v10
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV32-NEXT: vcompress.vm v14, v12, v20
+; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV32-NEXT: vmv.s.x v0, a0
+; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV32-NEXT: vrgatherei16.vv v10, v12, v21
+; RV32-NEXT: vmv1r.v v9, v14
+; RV32-NEXT: ret
+;
+; RV64-LABEL: maskedload_factor3_invalid_skip_field:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 73
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.s.x v11, a1
+; RV64-NEXT: li a1, 146
+; RV64-NEXT: vmv.s.x v20, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmv.v.i v10, 8
+; RV64-NEXT: addi a1, a1, -1171
+; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 36
+; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
+; RV64-NEXT: vle32.v v12, (a0), v0.t
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: addi a0, a0, 5
+; RV64-NEXT: slli a0, a0, 16
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vcompress.vm v8, v12, v11
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v16, v12, 8
+; RV64-NEXT: vmv1r.v v0, v10
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
+; RV64-NEXT: vcompress.vm v14, v12, v20
+; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v9, a0
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v12, v9
+; RV64-NEXT: vmv1r.v v9, v14
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1>, <12 x i32> poison)
+ %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+}
+
+define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) {
+ ; mask = 1111, skip the last two fields.
+; RV32-LABEL: maskedload_factor5_skip_fields:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -256
+; RV32-NEXT: .cfi_def_cfa_offset 256
+; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: addi s0, sp, 256
+; RV32-NEXT: .cfi_def_cfa s0, 0
+; RV32-NEXT: andi sp, sp, -128
+; RV32-NEXT: lui a1, 58
+; RV32-NEXT: addi a1, a1, -793
+; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: li a1, 33
+; RV32-NEXT: vle32.v v16, (a0), v0.t
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: mv a2, sp
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v16, 8
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v12, v16, 6
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v13, v16, 1
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vse32.v v16, (a2)
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV32-NEXT: vslidedown.vi v10, v16, 7
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v11, v16, 2
+; RV32-NEXT: vslidedown.vi v18, v16, 3
+; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT: vslidedown.vi v14, v16, 4
+; RV32-NEXT: vmv.x.s a0, v12
+; RV32-NEXT: vmv.x.s a1, v13
+; RV32-NEXT: vmv.x.s a2, v11
+; RV32-NEXT: vmv.x.s a3, v18
+; RV32-NEXT: vmv.x.s a4, v14
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v11, a1
+; RV32-NEXT: vmv.v.x v12, a2
+; RV32-NEXT: vmv.v.x v13, a3
+; RV32-NEXT: vmv.v.x v14, a4
+; RV32-NEXT: lw a1, 32(sp)
+; RV32-NEXT: lw a2, 36(sp)
+; RV32-NEXT: lw a3, 44(sp)
+; RV32-NEXT: lw a4, 48(sp)
+; RV32-NEXT: vslide1down.vx v11, v11, a0
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vslide1down.vx v10, v12, a0
+; RV32-NEXT: vslide1down.vx v11, v11, a3
+; RV32-NEXT: vslide1down.vx v10, v10, a4
+; RV32-NEXT: vslide1down.vx v12, v13, a1
+; RV32-NEXT: lw a0, 64(sp)
+; RV32-NEXT: lw a1, 52(sp)
+; RV32-NEXT: lw a3, 56(sp)
+; RV32-NEXT: lw a4, 68(sp)
+; RV32-NEXT: vslide1down.vx v14, v14, a2
+; RV32-NEXT: vslide1down.vx v13, v11, a0
+; RV32-NEXT: vmv.v.i v0, 10
+; RV32-NEXT: vslide1down.vx v10, v10, a4
+; RV32-NEXT: vslide1down.vx v11, v12, a1
+; RV32-NEXT: lw a0, 72(sp)
+; RV32-NEXT: lw a1, 76(sp)
+; RV32-NEXT: vslide1down.vx v12, v14, a3
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vslide1down.vx v11, v11, a0
+; RV32-NEXT: vslide1down.vx v12, v12, a1
+; RV32-NEXT: vmv1r.v v9, v13
+; RV32-NEXT: addi sp, s0, -256
+; RV32-NEXT: .cfi_def_cfa sp, 256
+; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: addi sp, sp, 256
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: maskedload_factor5_skip_fields:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -256
+; RV64-NEXT: .cfi_def_cfa_offset 256
+; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 256
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: andi sp, sp, -128
+; RV64-NEXT: lui a1, 58
+; RV64-NEXT: addi a1, a1, -793
+; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: li a1, 33
+; RV64-NEXT: vle32.v v16, (a0), v0.t
+; RV64-NEXT: li a0, 32
+; RV64-NEXT: mv a2, sp
+; RV64-NEXT: vmv.s.x v0, a1
+; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v16, 8
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v12, v16, 6
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v13, v16, 1
+; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT: vse32.v v16, (a2)
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
+; RV64-NEXT: vslidedown.vi v10, v16, 7
+; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v11, v16, 2
+; RV64-NEXT: vslidedown.vi v18, v16, 3
+; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v14, v16, 4
+; RV64-NEXT: vmv.x.s a0, v12
+; RV64-NEXT: vmv.x.s a1, v13
+; RV64-NEXT: vmv.x.s a2, v11
+; RV64-NEXT: vmv.x.s a3, v18
+; RV64-NEXT: vmv.x.s a4, v14
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v11, a1
+; RV64-NEXT: vmv.v.x v12, a2
+; RV64-NEXT: vmv.v.x v13, a3
+; RV64-NEXT: vmv.v.x v14, a4
+; RV64-NEXT: lw a1, 32(sp)
+; RV64-NEXT: lw a2, 36(sp)
+; RV64-NEXT: lw a3, 44(sp)
+; RV64-NEXT: lw a4, 48(sp)
+; RV64-NEXT: vslide1down.vx v11, v11, a0
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: vslide1down.vx v10, v12, a0
+; RV64-NEXT: vslide1down.vx v11, v11, a3
+; RV64-NEXT: vslide1down.vx v10, v10, a4
+; RV64-NEXT: vslide1down.vx v12, v13, a1
+; RV64-NEXT: lw a0, 64(sp)
+; RV64-NEXT: lw a1, 52(sp)
+; RV64-NEXT: lw a3, 56(sp)
+; RV64-NEXT: lw a4, 68(sp)
+; RV64-NEXT: vslide1down.vx v14, v14, a2
+; RV64-NEXT: vslide1down.vx v13, v11, a0
+; RV64-NEXT: vmv.v.i v0, 10
+; RV64-NEXT: vslide1down.vx v10, v10, a4
+; RV64-NEXT: vslide1down.vx v11, v12, a1
+; RV64-NEXT: lw a0, 72(sp)
+; RV64-NEXT: lw a1, 76(sp)
+; RV64-NEXT: vslide1down.vx v12, v14, a3
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
+; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vslide1down.vx v11, v11, a0
+; RV64-NEXT: vslide1down.vx v12, v12, a1
+; RV64-NEXT: vmv1r.v v9, v13
+; RV64-NEXT: addi sp, s0, -256
+; RV64-NEXT: .cfi_def_cfa sp, 256
+; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: addi sp, sp, 256
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <20 x i32> poison)
+ %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+ %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
+ %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
+ %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
+ %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
+ %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
+ ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+}
+
>From 9d6ef18b57878c74ce673875683ade639f1eb14e Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Thu, 31 Jul 2025 15:56:04 -0700
Subject: [PATCH 2/3] [IA][RISCV] Recognize deinterleaved loads that could
lower to strided segmented loads
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 +-
llvm/lib/CodeGen/InterleavedAccessPass.cpp | 81 ++-
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +-
llvm/lib/Target/ARM/ARMISelLowering.h | 4 +-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +-
.../Target/RISCV/RISCVInterleavedAccess.cpp | 41 +-
llvm/lib/Target/X86/X86ISelLowering.h | 4 +-
llvm/lib/Target/X86/X86InterleavedAccess.cpp | 2 +-
.../rvv/fixed-vectors-interleaved-access.ll | 475 ++----------------
11 files changed, 139 insertions(+), 486 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbdc1b6031680..3239b35031e36 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,10 +3209,12 @@ class LLVM_ABI TargetLoweringBase {
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
+ /// \p MaskFactor is the effective interleave factor once fields that the
+ /// mask skips entirely are excluded; it is never larger than \p Factor.
virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ unsigned MaskFactor) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5e508989ef2da..e6c4de23c055e 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,19 @@ static Value *getMaskOperand(IntrinsicInst *II) {
}
}
-// Return the corresponded deinterleaved mask, or nullptr if there is no valid
-// mask.
-static Value *getMask(Value *WideMask, unsigned Factor,
- ElementCount LeafValueEC);
-
-static Value *getMask(Value *WideMask, unsigned Factor,
- VectorType *LeafValueTy) {
+// Return a pair of
+// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
+// mask.
+// (2) The factor that remains once fields the mask skips entirely are
+// dropped. Note that currently we only support skipping trailing fields.
+// So if the "nominal" factor is 5, you cannot skip only fields 1 and 2,
+// but you can skip fields 3 and 4.
+// If no field is skipped, this is the nominal factor.
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+ ElementCount LeafValueEC);
+
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+ VectorType *LeafValueTy) {
return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}
@@ -379,22 +385,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
Value *Mask = nullptr;
+ unsigned MaskFactor = Factor;
if (LI) {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor, VecTy);
+ std::tie(Mask, MaskFactor) = getMask(getMaskOperand(II), Factor, VecTy);
if (!Mask)
return false;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
<< *Load << "\n");
+ LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
+ << " and mask factor " << MaskFactor << "\n");
}
// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
- Indices, Factor))
+ Indices, Factor, MaskFactor))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
@@ -536,8 +545,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
- Mask = getMask(getMaskOperand(II), Factor,
- ElementCount::getFixed(LaneMaskLen));
+ std::tie(Mask, std::ignore) = getMask(getMaskOperand(II), Factor,
+ ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;
@@ -556,34 +565,57 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
return true;
}
-static Value *getMask(Value *WideMask, unsigned Factor,
- ElementCount LeafValueEC) {
+static std::pair<Value *, unsigned> getMask(Value *WideMask, unsigned Factor,
+ ElementCount LeafValueEC) {
if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
F && F == Factor && llvm::all_equal(IMI->args())) {
- return IMI->getArgOperand(0);
+ return {IMI->getArgOperand(0), Factor};
}
}
if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
if (auto *Splat = ConstMask->getSplatValue())
// All-ones or all-zeros mask.
- return ConstantVector::getSplat(LeafValueEC, Splat);
+ return {ConstantVector::getSplat(LeafValueEC, Splat), Factor};
if (LeafValueEC.isFixed()) {
unsigned LeafMaskLen = LeafValueEC.getFixedValue();
+ // First, check if the mask completely skips some of the factors / fields.
+ APInt FactorMask(Factor, 0);
+ FactorMask.setAllBits();
+ for (unsigned F = 0U; F < Factor; ++F) {
+ unsigned Idx;
+ for (Idx = 0U; Idx < LeafMaskLen; ++Idx) {
+ Constant *C = ConstMask->getAggregateElement(F + Idx * Factor);
+ if (!C->isZeroValue())
+ break;
+ }
+ // All mask bits on this field are zero, skipping it.
+ if (Idx >= LeafMaskLen)
+ FactorMask.clearBit(F);
+ }
+ // We currently only support skipping "trailing" factors / fields. So
+ // with an original factor of 4, we can skip fields 2 and 3, but we
+ // cannot skip only fields 1 and 2. If FactorMask does not match such a
+ // pattern, reset it so that no field is skipped.
+ if (!FactorMask.isMask())
+ FactorMask.setAllBits();
+
SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
// If this is a fixed-length constant mask, each lane / leaf has to
// use the same mask. This is done by checking if every group with Factor
// number of elements in the interleaved mask has homogeneous values.
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
+ if (!FactorMask[Idx % Factor])
+ continue;
Constant *C = ConstMask->getAggregateElement(Idx);
if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
- return nullptr;
+ return {nullptr, Factor};
LeafMask[Idx / Factor] = C;
}
- return ConstantVector::get(LeafMask);
+ return {ConstantVector::get(LeafMask), FactorMask.popcount()};
}
}
@@ -603,12 +635,13 @@ static Value *getMask(Value *WideMask, unsigned Factor,
auto *LeafMaskTy =
VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
IRBuilder<> Builder(SVI);
- return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
- uint64_t(0));
+ return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
+ uint64_t(0)),
+ Factor};
}
}
- return nullptr;
+ return {nullptr, Factor};
}
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
@@ -639,7 +672,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
+ std::tie(Mask, std::ignore) =
+ getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;
@@ -680,8 +714,9 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
II->getIntrinsicID() != Intrinsic::vp_store)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
- Mask = getMask(getMaskOperand(II), Factor,
- cast<VectorType>(InterleaveValues[0]->getType()));
+ std::tie(Mask, std::ignore) =
+ getMask(getMaskOperand(II), Factor,
+ cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b6ea86ee1af5..e681d846f9e1c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, unsigned MaskFactor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ea63edd86210e..d0d6512d39015 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -220,8 +220,8 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ unsigned MaskFactor) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 936625606e315..a5750def66b7d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, unsigned MaskFactor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 825145d813fb1..670bbb62fe0f6 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -683,8 +683,8 @@ class VectorType;
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ unsigned MaskFactor) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index fa50e2105a708..4155f613f7f04 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -431,8 +431,8 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ unsigned MaskFactor) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 726920e4015cf..d4e6351ea6a51 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
Intrinsic::riscv_seg8_load_mask};
+static const Intrinsic::ID FixedVlssegIntrIds[] = {
+ Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask,
+ Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask,
+ Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
+ Intrinsic::riscv_sseg8_load_mask};
+
static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
@@ -197,9 +203,13 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, unsigned MaskFactor) const {
assert(Indices.size() == Shuffles.size());
+ assert(MaskFactor <= Factor);
+ // TODO: Lower to strided load when MaskFactor = 1.
+ if (MaskFactor < 2)
+ return false;
IRBuilder<> Builder(Load);
const DataLayout &DL = Load->getDataLayout();
@@ -208,20 +218,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Ptr, *VL;
Align Alignment;
- if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
+ if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
- if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+ if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;
- CallInst *VlsegN = Builder.CreateIntrinsic(
- FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ CallInst *SegLoad = nullptr;
+ if (MaskFactor < Factor) {
+ // Lower to strided segmented load.
+ unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+ SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
+ {VTy, PtrTy, XLenTy, XLenTy},
+ {Ptr, Stride, Mask, VL});
+ } else {
+ // Lower to normal segmented load.
+ SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+ {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
+ }
for (unsigned i = 0; i < Shuffles.size(); i++) {
- Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
- Shuffles[i]->replaceAllUsesWith(SubVec);
+ unsigned FactorIdx = Indices[i];
+ if (FactorIdx >= MaskFactor) {
+ // Replace masked-off factors (that are still extracted) with poison.
+ Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy));
+ } else {
+ Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx);
+ Shuffles[i]->replaceAllUsesWith(SubVec);
+ }
}
return true;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 547b2210fdbf0..242d24b5faf60 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1663,8 +1663,8 @@ namespace llvm {
/// instructions/intrinsics.
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices,
- unsigned Factor) const override;
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ unsigned MaskFactor) const override;
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 636b072837441..6929c869b1a31 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor) const {
+ ArrayRef<unsigned> Indices, unsigned Factor, unsigned MaskFactor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 2df26b2f78d5b..497b39fb6f044 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -334,78 +334,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr
define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
; mask = 1111, skip the last field.
-; RV32-LABEL: vpload_factor3_skip_fields:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1755
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: li a1, 73
-; RV32-NEXT: vmv.v.i v10, 8
-; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV32-NEXT: vle32.v v12, (a0), v0.t
-; RV32-NEXT: li a0, 36
-; RV32-NEXT: vmv.s.x v11, a1
-; RV32-NEXT: lui a1, %hi(.LCPI17_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI17_0)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vcompress.vm v8, v12, v11
-; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v16, v12, 8
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 2
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vslidedown.vi v14, v12, 1
-; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t
-; RV32-NEXT: vle16.v v9, (a1)
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV32-NEXT: vrgatherei16.vv v10, v12, v9
-; RV32-NEXT: vmv1r.v v9, v14
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vpload_factor3_skip_fields:
-; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1755
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: li a1, 73
-; RV64-NEXT: vmv.v.i v10, 8
-; RV64-NEXT: vmv.s.x v11, a1
-; RV64-NEXT: li a1, 36
-; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV64-NEXT: vle32.v v12, (a0), v0.t
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: addi a0, a0, 5
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vcompress.vm v8, v12, v11
-; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v12, 8
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 2
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vslidedown.vi v14, v12, 1
-; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vrgatherei16.vv v10, v12, v9
-; RV64-NEXT: vmv1r.v v9, v14
-; RV64-NEXT: ret
+; CHECK-LABEL: vpload_factor3_skip_fields:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 12
+; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
+; CHECK-NEXT: ret
%interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, i32 12)
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
@@ -418,78 +352,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
; mask = 0101, skip the last field.
-; RV32-LABEL: vpload_factor3_mask_skip_fields:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1560
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: li a1, 73
-; RV32-NEXT: vmv.v.i v10, 8
-; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV32-NEXT: vle32.v v12, (a0), v0.t
-; RV32-NEXT: li a0, 36
-; RV32-NEXT: vmv.s.x v11, a1
-; RV32-NEXT: lui a1, %hi(.LCPI18_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI18_0)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vcompress.vm v8, v12, v11
-; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v16, v12, 8
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 2
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vslidedown.vi v14, v12, 1
-; RV32-NEXT: vslidedown.vi v14, v12, 3, v0.t
-; RV32-NEXT: vle16.v v9, (a1)
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV32-NEXT: vrgatherei16.vv v10, v12, v9
-; RV32-NEXT: vmv1r.v v9, v14
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vpload_factor3_mask_skip_fields:
-; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1560
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: li a1, 73
-; RV64-NEXT: vmv.v.i v10, 8
-; RV64-NEXT: vmv.s.x v11, a1
-; RV64-NEXT: li a1, 36
-; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV64-NEXT: vle32.v v12, (a0), v0.t
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: addi a0, a0, 5
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vcompress.vm v8, v12, v11
-; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v12, 8
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 2
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vslidedown.vi v14, v12, 1
-; RV64-NEXT: vslidedown.vi v14, v12, 3, v0.t
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vrgatherei16.vv v10, v12, v9
-; RV64-NEXT: vmv1r.v v9, v14
-; RV64-NEXT: ret
+; CHECK-LABEL: vpload_factor3_mask_skip_fields:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 6, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 10
+; CHECK-NEXT: li a1, 12
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
%interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, i32 12)
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
@@ -2282,72 +2151,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) {
}
define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) {
-; RV32-LABEL: maskedload_factor3_skip_field:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 1755
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: li a1, 73
-; RV32-NEXT: vmv.v.i v10, 8
-; RV32-NEXT: vmv.s.x v11, a1
-; RV32-NEXT: li a1, 146
-; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV32-NEXT: vle32.v v12, (a0), v0.t
-; RV32-NEXT: li a0, 36
-; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI66_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI66_0)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle16.v v21, (a1)
-; RV32-NEXT: vcompress.vm v8, v12, v11
-; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v16, v12, 8
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV32-NEXT: vcompress.vm v14, v12, v20
-; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV32-NEXT: vrgatherei16.vv v10, v12, v21
-; RV32-NEXT: vmv1r.v v9, v14
-; RV32-NEXT: ret
-;
-; RV64-LABEL: maskedload_factor3_skip_field:
-; RV64: # %bb.0:
-; RV64-NEXT: li a1, 1755
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: li a1, 73
-; RV64-NEXT: vmv.v.i v10, 8
-; RV64-NEXT: vmv.s.x v11, a1
-; RV64-NEXT: li a1, 146
-; RV64-NEXT: vmv.s.x v20, a1
-; RV64-NEXT: li a1, 36
-; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV64-NEXT: vle32.v v12, (a0), v0.t
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: addi a0, a0, 5
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vcompress.vm v8, v12, v11
-; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v12, 8
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV64-NEXT: vcompress.vm v14, v12, v20
-; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vrgatherei16.vv v10, v12, v9
-; RV64-NEXT: vmv1r.v v9, v14
-; RV64-NEXT: ret
+; CHECK-LABEL: maskedload_factor3_skip_field:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 12
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1
+; CHECK-NEXT: ret
%interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0,i1 1,i1 1,i1 0>, <12 x i32> poison)
; mask = 1111, skip last field
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -2360,72 +2169,13 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr
}
define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
-; RV32-LABEL: maskedload_factor3_mask_skip_field:
-; RV32: # %bb.0:
-; RV32-NEXT: li a1, 195
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: li a1, 73
-; RV32-NEXT: vmv.v.i v10, 8
-; RV32-NEXT: vmv.s.x v11, a1
-; RV32-NEXT: li a1, 146
-; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV32-NEXT: vle32.v v12, (a0), v0.t
-; RV32-NEXT: li a0, 36
-; RV32-NEXT: vmv.s.x v20, a1
-; RV32-NEXT: lui a1, %hi(.LCPI67_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI67_0)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle16.v v21, (a1)
-; RV32-NEXT: vcompress.vm v8, v12, v11
-; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v16, v12, 8
-; RV32-NEXT: vmv1r.v v0, v10
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV32-NEXT: vcompress.vm v14, v12, v20
-; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV32-NEXT: vrgatherei16.vv v10, v12, v21
-; RV32-NEXT: vmv1r.v v9, v14
-; RV32-NEXT: ret
-;
-; RV64-LABEL: maskedload_factor3_mask_skip_field:
-; RV64: # %bb.0:
-; RV64-NEXT: li a1, 195
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: li a1, 73
-; RV64-NEXT: vmv.v.i v10, 8
-; RV64-NEXT: vmv.s.x v11, a1
-; RV64-NEXT: li a1, 146
-; RV64-NEXT: vmv.s.x v20, a1
-; RV64-NEXT: li a1, 36
-; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma
-; RV64-NEXT: vle32.v v12, (a0), v0.t
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: addi a0, a0, 5
-; RV64-NEXT: slli a0, a0, 16
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vcompress.vm v8, v12, v11
-; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v16, v12, 8
-; RV64-NEXT: vmv1r.v v0, v10
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t
-; RV64-NEXT: vcompress.vm v14, v12, v20
-; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: addi a0, a0, 2
-; RV64-NEXT: vmerge.vvm v12, v16, v12, v0
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v9, a0
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vrgatherei16.vv v10, v12, v9
-; RV64-NEXT: vmv1r.v v9, v14
-; RV64-NEXT: ret
+; CHECK-LABEL: maskedload_factor3_mask_skip_field:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 5
+; CHECK-NEXT: li a1, 12
+; CHECK-NEXT: vlsseg2e32.v v8, (a0), a1, v0.t
+; CHECK-NEXT: ret
%interleaved.vec = tail call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %ptr, i32 4, <12 x i1> <i1 1,i1 1,i1 0,i1 0,i1 0,i1 0,i1 1,i1 1,i1 0,i1 0,i1 0,i1 0>, <12 x i32> poison)
; mask = 1010, skip the last field
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -2521,173 +2271,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) {
; mask = 1111, skip the last two fields.
-; RV32-LABEL: maskedload_factor5_skip_fields:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -256
-; RV32-NEXT: .cfi_def_cfa_offset 256
-; RV32-NEXT: sw ra, 252(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 248(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 256
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -128
-; RV32-NEXT: lui a1, 58
-; RV32-NEXT: addi a1, a1, -793
-; RV32-NEXT: vsetivli zero, 20, e32, m8, ta, ma
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: li a1, 33
-; RV32-NEXT: vle32.v v16, (a0), v0.t
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: mv a2, sp
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v16, 8
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v16, 6
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v13, v16, 1
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vse32.v v16, (a2)
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV32-NEXT: vslidedown.vi v10, v16, 7
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v11, v16, 2
-; RV32-NEXT: vslidedown.vi v18, v16, 3
-; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vi v14, v16, 4
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vmv.x.s a1, v13
-; RV32-NEXT: vmv.x.s a2, v11
-; RV32-NEXT: vmv.x.s a3, v18
-; RV32-NEXT: vmv.x.s a4, v14
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v11, a1
-; RV32-NEXT: vmv.v.x v12, a2
-; RV32-NEXT: vmv.v.x v13, a3
-; RV32-NEXT: vmv.v.x v14, a4
-; RV32-NEXT: lw a1, 32(sp)
-; RV32-NEXT: lw a2, 36(sp)
-; RV32-NEXT: lw a3, 44(sp)
-; RV32-NEXT: lw a4, 48(sp)
-; RV32-NEXT: vslide1down.vx v11, v11, a0
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vslide1down.vx v10, v12, a0
-; RV32-NEXT: vslide1down.vx v11, v11, a3
-; RV32-NEXT: vslide1down.vx v10, v10, a4
-; RV32-NEXT: vslide1down.vx v12, v13, a1
-; RV32-NEXT: lw a0, 64(sp)
-; RV32-NEXT: lw a1, 52(sp)
-; RV32-NEXT: lw a3, 56(sp)
-; RV32-NEXT: lw a4, 68(sp)
-; RV32-NEXT: vslide1down.vx v14, v14, a2
-; RV32-NEXT: vslide1down.vx v13, v11, a0
-; RV32-NEXT: vmv.v.i v0, 10
-; RV32-NEXT: vslide1down.vx v10, v10, a4
-; RV32-NEXT: vslide1down.vx v11, v12, a1
-; RV32-NEXT: lw a0, 72(sp)
-; RV32-NEXT: lw a1, 76(sp)
-; RV32-NEXT: vslide1down.vx v12, v14, a3
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV32-NEXT: vslidedown.vi v8, v8, 4, v0.t
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v11, v11, a0
-; RV32-NEXT: vslide1down.vx v12, v12, a1
-; RV32-NEXT: vmv1r.v v9, v13
-; RV32-NEXT: addi sp, s0, -256
-; RV32-NEXT: .cfi_def_cfa sp, 256
-; RV32-NEXT: lw ra, 252(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 248(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 256
-; RV32-NEXT: .cfi_def_cfa_offset 0
-; RV32-NEXT: ret
-;
-; RV64-LABEL: maskedload_factor5_skip_fields:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -256
-; RV64-NEXT: .cfi_def_cfa_offset 256
-; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 256
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -128
-; RV64-NEXT: lui a1, 58
-; RV64-NEXT: addi a1, a1, -793
-; RV64-NEXT: vsetivli zero, 20, e32, m8, ta, ma
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: li a1, 33
-; RV64-NEXT: vle32.v v16, (a0), v0.t
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: mv a2, sp
-; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v16, 8
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v16, 6
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v13, v16, 1
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vse32.v v16, (a2)
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
-; RV64-NEXT: vslidedown.vi v10, v16, 7
-; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v16, 2
-; RV64-NEXT: vslidedown.vi v18, v16, 3
-; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vi v14, v16, 4
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vmv.x.s a1, v13
-; RV64-NEXT: vmv.x.s a2, v11
-; RV64-NEXT: vmv.x.s a3, v18
-; RV64-NEXT: vmv.x.s a4, v14
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.v.x v11, a1
-; RV64-NEXT: vmv.v.x v12, a2
-; RV64-NEXT: vmv.v.x v13, a3
-; RV64-NEXT: vmv.v.x v14, a4
-; RV64-NEXT: lw a1, 32(sp)
-; RV64-NEXT: lw a2, 36(sp)
-; RV64-NEXT: lw a3, 44(sp)
-; RV64-NEXT: lw a4, 48(sp)
-; RV64-NEXT: vslide1down.vx v11, v11, a0
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: vslide1down.vx v10, v12, a0
-; RV64-NEXT: vslide1down.vx v11, v11, a3
-; RV64-NEXT: vslide1down.vx v10, v10, a4
-; RV64-NEXT: vslide1down.vx v12, v13, a1
-; RV64-NEXT: lw a0, 64(sp)
-; RV64-NEXT: lw a1, 52(sp)
-; RV64-NEXT: lw a3, 56(sp)
-; RV64-NEXT: lw a4, 68(sp)
-; RV64-NEXT: vslide1down.vx v14, v14, a2
-; RV64-NEXT: vslide1down.vx v13, v11, a0
-; RV64-NEXT: vmv.v.i v0, 10
-; RV64-NEXT: vslide1down.vx v10, v10, a4
-; RV64-NEXT: vslide1down.vx v11, v12, a1
-; RV64-NEXT: lw a0, 72(sp)
-; RV64-NEXT: lw a1, 76(sp)
-; RV64-NEXT: vslide1down.vx v12, v14, a3
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; RV64-NEXT: vslidedown.vi v8, v8, 4, v0.t
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vslide1down.vx v11, v11, a0
-; RV64-NEXT: vslide1down.vx v12, v12, a1
-; RV64-NEXT: vmv1r.v v9, v13
-; RV64-NEXT: addi sp, s0, -256
-; RV64-NEXT: .cfi_def_cfa sp, 256
-; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: addi sp, sp, 256
-; RV64-NEXT: .cfi_def_cfa_offset 0
-; RV64-NEXT: ret
+; CHECK-LABEL: maskedload_factor5_skip_fields:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 20
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vlsseg3e32.v v8, (a0), a1
+; CHECK-NEXT: ret
%interleaved.vec = tail call <20 x i32> @llvm.masked.load(ptr %ptr, i32 4, <20 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <20 x i32> poison)
%v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
%v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
>From 95f772e818601ddf5f54e76ec518715f3929eeb2 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Mon, 4 Aug 2025 11:16:21 -0700
Subject: [PATCH 3/3] fixup! Clean up the tests
---
.../rvv/fixed-vectors-interleaved-access.ll | 48 ++++++++-----------
1 file changed, 21 insertions(+), 27 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 497b39fb6f044..a61a1b7cf9703 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -332,7 +332,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr
ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
-define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
+define {<4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
; mask = 1111, skip the last field.
; CHECK-LABEL: vpload_factor3_skip_fields:
; CHECK: # %bb.0:
@@ -344,13 +344,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_skip_fields(ptr %ptr) {
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
%v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
- %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
- %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
- %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
- ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
}
-define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
+define {<4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %ptr) {
; mask = 0101, skip the last field.
; CHECK-LABEL: vpload_factor3_mask_skip_fields:
; CHECK: # %bb.0:
@@ -363,10 +362,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask_skip_fields(ptr %p
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 poison, i32 10>
%v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
- %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
- %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
- %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
- ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
}
define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) {
@@ -2150,7 +2148,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask(ptr %ptr) {
ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
-define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) {
+define {<4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr) {
; CHECK-LABEL: maskedload_factor3_skip_field:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 12
@@ -2162,13 +2160,12 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_skip_field(ptr %ptr
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
%v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
- %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
- %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
- %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
- ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
}
-define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
+define {<4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr %ptr) {
; CHECK-LABEL: maskedload_factor3_mask_skip_field:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -2181,10 +2178,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_mask_skip_field(ptr
%v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
%v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
- %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
- %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
- %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
- ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
+ %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ ret {<4 x i32>, <4 x i32>} %res1
}
; We can only skip the last field for now.
@@ -2269,7 +2265,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor3_invalid_skip_field(
ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
-define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) {
+define {<4 x i32>, <4 x i32>, <4 x i32>} @maskedload_factor5_skip_fields(ptr %ptr) {
; mask = 1111, skip the last two fields.
; CHECK-LABEL: maskedload_factor5_skip_fields:
; CHECK: # %bb.0:
@@ -2283,11 +2279,9 @@ define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @maskedload_facto
%v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
%v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
%v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
- %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
- %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
- %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
- %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3
- %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4
- ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4
+ %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0
+ %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1
+ %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2
+ ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2
}
More information about the llvm-commits mailing list