[llvm] [LoongArch] Optimize inserting extracted elements (PR #146018)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 2 03:26:18 PDT 2025
https://github.com/zhaoqi5 updated https://github.com/llvm/llvm-project/pull/146018
>From 460ef366bd701412666af896ca73baf02f34f437 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Fri, 27 Jun 2025 11:22:37 +0800
Subject: [PATCH 1/5] [LoongArch] Optimize inserting extracted elements
---
llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 13 ++++++++-----
llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 5 ++++-
.../LoongArch/lasx/ir-instruction/fix-xvshuf.ll | 12 ++----------
.../lasx/ir-instruction/insert-extract-element.ll | 4 ----
.../lsx/ir-instruction/insert-extract-element.ll | 6 ++----
5 files changed, 16 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ff7b0f2ae3f25..915dc803bdbd7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1593,11 +1593,14 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
(XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
(XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
- (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
- (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+ (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+ (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+ (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+ (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
// scalar_to_vector
def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d78083ddcd..34c6ffc6727f1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1791,7 +1791,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
(VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
(VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (f32 (vector_extract v4f32:$vj, uimm2:$imm1)), uimm2:$imm2),
+ (VINSGR2VR_W $vd, (VPICKVE2GR_W v4f32:$vj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (vector_extract v2f64:$vj, uimm1:$imm1)), uimm1:$imm2),
+ (VINSGR2VR_D $vd, (VPICKVE2GR_D v2f64:$vj, uimm1:$imm1), uimm1:$imm2)>;
def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
(VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f3bec11810e9b..f154dd3b8eb3c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: movgr2fr.d $fa2, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT: movgr2fr.d $fa3, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa2
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa3
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa0, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3
; CHECK-NEXT: xvori.b $xr0, $xr2, 0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index 3fdc439e68679..fb942defe446a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -61,8 +61,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT: movgr2fr.w $fa1, $a0
-; CHECK-NEXT: movfr2gr.s $a0, $fa1
; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
; CHECK-NEXT: ret
entry:
@@ -87,8 +85,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
index c7dd1454c7e33..605e886253790 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
@@ -40,8 +40,7 @@ entry:
define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vreplvei.w $vr1, $vr0, 3
-; CHECK-NEXT: movfr2gr.s $a0, $fa1
+; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -65,8 +64,7 @@ entry:
define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
+; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
>From 9dfa943e19efa256cfe46f2dd0aa00b93f07f14a Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Tue, 1 Jul 2025 11:04:58 +0800
Subject: [PATCH 2/5] use vextrins instruction
---
.../Target/LoongArch/LoongArchLSXInstrInfo.td | 51 +++++++++++++++++--
.../ir-instruction/insert-extract-element.ll | 18 +++----
2 files changed, 53 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 34c6ffc6727f1..9dd6006e3a9dc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst,
(Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>;
}
+multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_W $vd, $vj, Imm)>;
+ }
+ }
+}
+
+multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert vecty:$vd,
+ (elemty (vector_extract vecty:$vj, imm1)), imm2),
+ (VEXTRINS_D $vd, $vj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLSX] in {
// VADD_{B/H/W/D}
@@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">;
defm : PatCCVrVrF<SETO, "VFCMP_COR">;
defm : PatCCVrVrF<SETUO, "VFCMP_CUN">;
+// Insert element extracted from vector into vector.
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v16i8:$vd,
+ (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2),
+ (VEXTRINS_B $vd, $vj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert v8i16:$vd,
+ (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2),
+ (VEXTRINS_H $vd, $vj, Imm)>;
+ }
+}
+
+defm : InsertExtractPatV4<v4i32, GRLenVT>;
+defm : InsertExtractPatV4<v4f32, f32>;
+defm : InsertExtractPatV2<v2i64, GRLenVT>;
+defm : InsertExtractPatV2<v2f64, f64>;
+
// VINSGR2VR_{B/H/W/D}
def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm),
(VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>;
@@ -1791,10 +1838,6 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
(VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
(VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-def : Pat<(vector_insert v4f32:$vd, (f32 (vector_extract v4f32:$vj, uimm2:$imm1)), uimm2:$imm2),
- (VINSGR2VR_W $vd, (VPICKVE2GR_W v4f32:$vj, uimm2:$imm1), uimm2:$imm2)>;
-def : Pat<(vector_insert v2f64:$vd, (f64 (vector_extract v2f64:$vj, uimm1:$imm1)), uimm1:$imm2),
- (VINSGR2VR_D $vd, (VPICKVE2GR_D v2f64:$vj, uimm1:$imm1), uimm1:$imm2)>;
def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
(VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
index 605e886253790..e9a0c8a110452 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
@@ -4,8 +4,7 @@
define <16 x i8> @insert_extract_v16i8(<16 x i8> %a) nounwind {
; CHECK-LABEL: insert_extract_v16i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15
-; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1
+; CHECK-NEXT: vextrins.b $vr0, $vr0, 31
; CHECK-NEXT: ret
entry:
%b = extractelement <16 x i8> %a, i32 15
@@ -16,8 +15,7 @@ entry:
define <8 x i16> @insert_extract_v8i16(<8 x i16> %a) nounwind {
; CHECK-LABEL: insert_extract_v8i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1
+; CHECK-NEXT: vextrins.h $vr0, $vr0, 23
; CHECK-NEXT: ret
entry:
%b = extractelement <8 x i16> %a, i32 7
@@ -28,8 +26,7 @@ entry:
define <4 x i32> @insert_extract_v4i32(<4 x i32> %a) nounwind {
; CHECK-LABEL: insert_extract_v4i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1
+; CHECK-NEXT: vextrins.w $vr0, $vr0, 19
; CHECK-NEXT: ret
entry:
%b = extractelement <4 x i32> %a, i32 3
@@ -40,8 +37,7 @@ entry:
define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
+; CHECK-NEXT: vextrins.w $vr0, $vr0, 3
; CHECK-NEXT: ret
entry:
%b = extractelement <4 x float> %a, i32 3
@@ -52,8 +48,7 @@ entry:
define <2 x i64> @insert_extract_v2i64(<2 x i64> %a) nounwind {
; CHECK-LABEL: insert_extract_v2i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vextrins.d $vr0, $vr0, 1
; CHECK-NEXT: ret
entry:
%b = extractelement <2 x i64> %a, i32 1
@@ -64,8 +59,7 @@ entry:
define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: vextrins.d $vr0, $vr0, 1
; CHECK-NEXT: ret
entry:
%b = extractelement <2 x double> %a, i32 1
>From b028fc3483a1199d4534205209a3008c9fc1de85 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Wed, 2 Jul 2025 17:43:53 +0800
Subject: [PATCH 3/5] update tests
---
.../insert-extract-pair-elements.ll | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
index 88c3e4367ffa7..adf78e79c7d09 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
@@ -79,13 +79,9 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v8f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.w $fa1, $a0
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT: movgr2fr.w $fa2, $a0
-; CHECK-NEXT: movfr2gr.s $a0, $fa1
+; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
-; CHECK-NEXT: movfr2gr.s $a0, $fa2
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5
+; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <8 x float> %a, i32 3
@@ -115,13 +111,9 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: movgr2fr.d $fa1, $a0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: movgr2fr.d $fa2, $a0
-; CHECK-NEXT: movfr2gr.d $a0, $fa1
+; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa2
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <4 x double> %a, i32 1
>From 8aa5b1a945f67b8e8d37988002ad938531c74bce Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Wed, 2 Jul 2025 17:58:12 +0800
Subject: [PATCH 4/5] optimize extracting i8/i16 element from hi128
---
.../LoongArch/LoongArchISelLowering.cpp | 5 +---
.../LoongArch/LoongArchLASXInstrInfo.td | 12 ++++++++
.../ir-instruction/insert-extract-element.ll | 26 +++-------------
.../insert-extract-pair-elements.ll | 30 ++++---------------
4 files changed, 23 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 7dae4d30d31be..da12c520d196a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2525,12 +2525,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
EVT VecTy = Op->getOperand(0)->getValueType(0);
SDValue Idx = Op->getOperand(1);
- EVT EltTy = VecTy.getVectorElementType();
unsigned NumElts = VecTy.getVectorNumElements();
- if (isa<ConstantSDNode>(Idx) &&
- (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 ||
- EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2))
+ if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts)
return Op;
return SDValue();
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 915dc803bdbd7..992a6e9386bb4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1794,6 +1794,18 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in {
}
// Vector extraction with constant index.
+foreach imm = 16...31 in {
+ defvar Imm = !and(imm, 15);
+ def : Pat<(i64 (vector_extract v32i8:$xj, imm)),
+ (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128),
+ Imm)>;
+}
+foreach imm = 8...15 in {
+ defvar Imm = !and(imm, 7);
+ def : Pat<(i64 (vector_extract v16i16:$xj, imm)),
+ (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128),
+ Imm)>;
+}
def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)),
(VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>;
def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index fb942defe446a..271e3eca31dbe 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -4,18 +4,9 @@
define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: insert_extract_v32i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -64
-; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 64
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: ld.b $a0, $sp, 31
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vpickve2gr.b $a0, $vr1, 15
; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1
-; CHECK-NEXT: addi.d $sp, $fp, -64
-; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
entry:
%b = extractelement <32 x i8> %a, i32 31
@@ -26,18 +17,9 @@ entry:
define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: insert_extract_v16i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -64
-; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 64
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: ld.h $a0, $sp, 30
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
+; CHECK-NEXT: vpickve2gr.h $a0, $vr1, 7
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1
-; CHECK-NEXT: addi.d $sp, $fp, -64
-; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
entry:
%b = extractelement <16 x i16> %a, i32 15
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
index adf78e79c7d09..0a044850ae619 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
@@ -4,23 +4,14 @@
define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: insert_extract_v32i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -64
-; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 64
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15
-; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: ld.b $a1, $sp, 31
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1
+; CHECK-NEXT: vpickve2gr.b $a0, $vr1, 15
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1
+; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 1
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: addi.d $sp, $fp, -64
-; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <32 x i8> %a, i32 15
@@ -33,23 +24,14 @@ entry:
define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: insert_extract_v16i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi.d $sp, $sp, -64
-; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT: addi.d $fp, $sp, 64
-; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7
-; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: ld.h $a1, $sp, 30
+; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1
+; CHECK-NEXT: vpickve2gr.h $a0, $vr1, 7
; CHECK-NEXT: xvori.b $xr1, $xr0, 0
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
+; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT: addi.d $sp, $fp, -64
-; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT: addi.d $sp, $sp, 64
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <16 x i16> %a, i32 7
>From 00a051225b3dd8f2c82a892f31f239f7a919b859 Mon Sep 17 00:00:00 2001
From: Qi Zhao <zhaoqi01 at loongson.cn>
Date: Wed, 2 Jul 2025 15:43:05 +0800
Subject: [PATCH 5/5] optimize extracting two elements when lasx supported
---
.../LoongArch/LoongArchLASXInstrInfo.td | 58 +++++++++++++++++++
.../insert-extract-pair-elements.ll | 38 ++----------
2 files changed, 64 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 992a6e9386bb4..89da2aa0a8fb0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> {
(!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>;
}
+multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...3 in {
+ foreach imm2 = 0...3 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 4))),
+ !add(imm2, 4)),
+ (XVEXTRINS_W $xd, $xj, Imm)>;
+ }
+ }
+}
+
+multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> {
+ foreach imm1 = 0...1 in {
+ foreach imm2 = 0...1 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert vecty:$xd,
+ (elemty (vector_extract vecty:$xj, imm1)), imm2),
+ (elemty (vector_extract vecty:$xj, !add(imm1, 2))),
+ !add(imm2, 2)),
+ (XVEXTRINS_D $xd, $xj, Imm)>;
+ }
+ }
+}
+
let Predicates = [HasExtLASX] in {
// XVADD_{B/H/W/D}
@@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">;
defm : PatCCXrXrF<SETO, "XVFCMP_COR">;
defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">;
+// Insert two elements extracted from vector into vector. (The two elements
+// must be at the same position within the low and high 128 bits of both the
+// source vector and the destination vector.)
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D}
+// XVPERMI_D + 2*VPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{B/H}
+foreach imm1 = 0...15 in {
+ foreach imm2 = 0...15 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v32i8:$xd,
+ (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))),
+ !add(imm2, 16)),
+ (XVEXTRINS_B $xd, $xj, Imm)>;
+ }
+}
+
+foreach imm1 = 0...7 in {
+ foreach imm2 = 0...7 in {
+ defvar Imm = !or(!shl(imm2, 4), imm1);
+ def : Pat<(vector_insert (vector_insert v16i16:$xd,
+ (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2),
+ (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))),
+ !add(imm2, 8)),
+ (XVEXTRINS_H $xd, $xj, Imm)>;
+ }
+}
+
+defm : PairInsertExtractPatV8<v8i32, GRLenVT>;
+defm : PairInsertExtractPatV8<v8f32, f32>;
+defm : PairInsertExtractPatV4<v4i64, GRLenVT>;
+defm : PairInsertExtractPatV4<v4f64, f64>;
+
// PseudoXVINSGR2VR_{B/H}
def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm),
(PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
index 0a044850ae619..4e173c4feadba 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
@@ -4,14 +4,7 @@
define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: insert_extract_v32i8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15
-; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
-; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1
-; CHECK-NEXT: vpickve2gr.b $a0, $vr1, 15
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 1
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvextrins.b $xr0, $xr0, 31
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <32 x i8> %a, i32 15
@@ -24,14 +17,7 @@ entry:
define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: insert_extract_v16i16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7
-; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14
-; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1
-; CHECK-NEXT: vpickve2gr.h $a0, $vr1, 7
-; CHECK-NEXT: xvori.b $xr1, $xr0, 0
-; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1
-; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2
+; CHECK-NEXT: xvextrins.h $xr0, $xr0, 23
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <16 x i16> %a, i32 7
@@ -44,10 +30,7 @@ entry:
define <8 x i32> @insert_extract_v8i32(<8 x i32> %a) nounwind {
; CHECK-LABEL: insert_extract_v8i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5
+; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <8 x i32> %a, i32 3
@@ -60,10 +43,7 @@ entry:
define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
; CHECK-LABEL: insert_extract_v8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5
+; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <8 x float> %a, i32 3
@@ -76,10 +56,7 @@ entry:
define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind {
; CHECK-LABEL: insert_extract_v4i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
+; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <4 x i64> %a, i32 1
@@ -92,10 +69,7 @@ entry:
define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
; CHECK-LABEL: insert_extract_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
-; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
+; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1
; CHECK-NEXT: ret
entry:
%b_lo = extractelement <4 x double> %a, i32 1
More information about the llvm-commits
mailing list