[llvm] [LoongArch] Pre-commit for broadcast load (PR #136070)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 17 18:45:44 PDT 2025
https://github.com/tangaac updated https://github.com/llvm/llvm-project/pull/136070
>From 031a3127a5e9ca805e3cded53a012b17d6ccd38d Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Wed, 16 Apr 2025 10:43:15 +0800
Subject: [PATCH 1/2] Pre-commit for broadcast load
---
.../CodeGen/LoongArch/lasx/broadcast-load.ll | 172 ++++++++++++++++++
.../CodeGen/LoongArch/lsx/broadcast-load.ll | 170 +++++++++++++++++
2 files changed, 342 insertions(+)
create mode 100644 llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
create mode 100644 llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
new file mode 100644
index 0000000000000..7fec52a340768
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to xvldrepl
+
+; A load that has more than one user shouldn't be lowered to xvldrepl
+define <32 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ store i8 %tmp, ptr %dst
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i8, ptr %ptr, i64 33
+ %tmp = load i8, ptr %p
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+
+define <16 x i16> @xvldrepl_h(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i16, ptr %ptr
+ %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp2
+}
+
+define <16 x i16> @xvldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i16, ptr %ptr, i64 33
+ %tmp = load i16, ptr %p
+ %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i32, ptr %ptr
+ %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+ ret <8 x i32> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i64 33
+ %tmp = load i32, ptr %p
+ %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+ ret <8 x i32> %tmp2
+}
+
+
+define <4 x i64> @xvldrepl_d(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i64, ptr %ptr
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
+}
+
+define <4 x i64> @xvldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 264
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
+; CHECK-NEXT: ret
+ %tmp = load float, ptr %ptr
+ %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+ ret <8 x float> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 264
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load float, ptr %p
+ %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+ ret <8 x float> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
+; CHECK-NEXT: ret
+ %tmp = load double, ptr %ptr
+ %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+ ret <4 x double> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 264
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load double, ptr %p
+ %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+ ret <4 x double> %tmp2
+}
+
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
new file mode 100644
index 0000000000000..09edb33a49ed9
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to vldrepl
+
+; A load that has more than one user shouldn't be lowered to vldrepl
+define <16 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst){
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ store i8 %tmp, ptr %dst
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i8, ptr %ptr, i64 33
+ %tmp = load i8, ptr %p
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+
+define <8 x i16> @vldrepl_h(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i16, ptr %ptr
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @vldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i16, ptr %ptr, i64 33
+ %tmp = load i16, ptr %p
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vldrepl_w(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i32, ptr %ptr
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i64 33
+ %tmp = load i32, ptr %p
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vldrepl_d(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i64, ptr %ptr
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @vldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 264
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %tmp = load float, ptr %ptr
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 264
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load float, ptr %p
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %tmp = load double, ptr %ptr
+ %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+ ret <2 x double> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 264
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load double, ptr %p
+ %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+ ret <2 x double> %tmp2
+}
>From e39d1e504584860b9b2f97eb4b3e5f9fcdf855f9 Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01 at loongson.cn>
Date: Fri, 18 Apr 2025 09:38:17 +0800
Subject: [PATCH 2/2] add unaligned offset tests
---
.../CodeGen/LoongArch/lasx/broadcast-load.ll | 31 +++++++++++++------
.../CodeGen/LoongArch/lsx/broadcast-load.ll | 31 +++++++++++++------
2 files changed, 44 insertions(+), 18 deletions(-)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
index 7fec52a340768..4fcf016376d09 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -4,18 +4,31 @@
; TODO: Loading an element and splatting it to a vector could be lowered to xvldrepl
; A load that has more than one user shouldn't be lowered to xvldrepl
-define <32 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst) {
+define <4 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst) {
; CHECK-LABEL: should_not_be_optimized:
; CHECK: # %bb.0:
-; CHECK-NEXT: ld.b $a0, $a0, 0
-; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
-; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: st.d $a0, $a1, 0
; CHECK-NEXT: ret
- %tmp = load i8, ptr %ptr
- store i8 %tmp, ptr %dst
- %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
- %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
- ret <32 x i8> %tmp2
+ %tmp = load i64, ptr %ptr
+ store i64 %tmp, ptr %dst
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
+}
+
+define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d_unaligned_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 4
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i32 1
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
}
define <32 x i8> @xvldrepl_b(ptr %ptr) {
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
index 09edb33a49ed9..02b68725687dd 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -4,18 +4,31 @@
; TODO: Loading an element and splatting it to a vector could be lowered to vldrepl
; A load that has more than one user shouldn't be lowered to vldrepl
-define <16 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst){
+define <2 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst){
; CHECK-LABEL: should_not_be_optimized:
; CHECK: # %bb.0:
-; CHECK-NEXT: ld.b $a0, $a0, 0
-; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
-; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: st.d $a0, $a1, 0
; CHECK-NEXT: ret
- %tmp = load i8, ptr %ptr
- store i8 %tmp, ptr %dst
- %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
- %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
- ret <16 x i8> %tmp2
+ %tmp = load i64, ptr %ptr
+ store i64 %tmp, ptr %dst
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @vldrepl_d_unaligned_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_unaligned_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 4
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i32 1
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
}
define <16 x i8> @vldrepl_b(ptr %ptr) {
More information about the llvm-commits
mailing list