[llvm] 8a2729f - [WebAssembly] Improve codegen for loading scalars from memory to v128

Thomas Lively via llvm-commits <llvm-commits at lists.llvm.org>
Wed Sep 21 21:05:53 PDT 2022


Author: Fanchen Kong
Date: 2022-09-21T21:05:44-07:00
New Revision: 8a2729fea719c7ef1353135a0e9cb5866d5178b7

URL: https://github.com/llvm/llvm-project/commit/8a2729fea719c7ef1353135a0e9cb5866d5178b7
DIFF: https://github.com/llvm/llvm-project/commit/8a2729fea719c7ef1353135a0e9cb5866d5178b7.diff

LOG: [WebAssembly] Improve codegen for loading scalars from memory to v128

Use load32_zero instead of load32_splat to load the low 32 bits from memory
into a v128. Test cases are added to cover this change.

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D134257
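
For context: v128.load32_splat loads 32 bits from memory and replicates
them into all four i32 lanes, whereas v128.load32_zero loads them into
lane 0 and zeroes the remaining lanes. When only lane 0 of the result is
used, load32_zero avoids materializing the redundant splatted lanes. A
minimal IR sketch of the affected shape (hypothetical function name; an
insertelement into lane 0 of undef becomes scalar_to_vector during
instruction selection):

    define <4 x i32> @load_i32_to_lane0(i32* %p) {
      ; Previously this selected v128.load32_splat; with the new pattern
      ; the same IR selects v128.load32_zero.
      %x = load i32, i32* %p
      %v = insertelement <4 x i32> undef, i32 %x, i32 0
      ret <4 x i32> %v
    }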

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
    llvm/test/CodeGen/WebAssembly/simd-offset.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 36393aea293c..995fb1d74ce5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -285,14 +285,16 @@ defm "" : SIMDLoadZero<I32x4, 0x5c>;
 defm "" : SIMDLoadZero<I64x2, 0x5d>;
 
 // Use load_zero to load scalars into vectors as well where possible.
-// TODO: i32, i16, and i8 scalars
-def load_scalar :
-  PatFrag<(ops node:$addr), (scalar_to_vector (i64 (load $addr)))>;
-defm : LoadPatNoOffset<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
-defm : LoadPatImmOff<v2i64, load_scalar, regPlusImm, "LOAD_ZERO_I64x2">;
-defm : LoadPatImmOff<v2i64, load_scalar, or_is_add, "LOAD_ZERO_I64x2">;
-defm : LoadPatOffsetOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
-defm : LoadPatGlobalAddrOffOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
+// TODO: i16 and i8 scalars
+foreach vec = [I32x4, I64x2] in {
+  defvar inst = "LOAD_ZERO_"#vec;
+  defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
+  defm : LoadPatNoOffset<vec.vt, pat, inst>;
+  defm : LoadPatImmOff<vec.vt, pat, regPlusImm, inst>;
+  defm : LoadPatImmOff<vec.vt, pat, or_is_add, inst>;
+  defm : LoadPatOffsetOnly<vec.vt, pat, inst>;
+  defm : LoadPatGlobalAddrOffOnly<vec.vt, pat, inst>;
+}
 
 // TODO: f32x4 and f64x2 as well
 foreach vec = [I32x4, I64x2] in {

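The foreach above generalizes the previous v2i64-only PatFrag: vec.lane_vt
supplies the lane type (i32 or i64), so the same five LoadPat helpers are
now instantiated for both LOAD_ZERO_I32x4 and LOAD_ZERO_I64x2. For
comparison, a sketch of the i64 shape that was already covered before this
change (hypothetical function name):

    define <2 x i64> @load_i64_to_lane0(i64* %p) {
      ; Selects v128.load64_zero both before and after this patch.
      %x = load i64, i64* %p
      %v = insertelement <2 x i64> undef, i64 %x, i32 0
      ret <2 x i64> %v
    }
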
diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
index 0800893a0007..95f3587caed8 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
@@ -1160,9 +1160,9 @@ define <4 x i32> @load_splat_v4i32(i32* %addr) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32:
-; CHECK:         .functype load_sext_v4i32 (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32:
+; CHECK:         .functype load_sext_v4i16_to_v4i32 (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_s 0
@@ -1172,9 +1172,9 @@ define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32:
-; CHECK:         .functype load_zext_v4i32 (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32:
+; CHECK:         .functype load_zext_v4i16_to_v4i32 (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_u 0
@@ -1184,6 +1184,39 @@ define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32:
+; CHECK:         .functype load_sext_v4i8_to_v4i32 (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %v = load <4 x i8>, <4 x i8>* %p
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32:
+; CHECK:         .functype load_zext_v4i8_to_v4i32 (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %v = load <4 x i8>, <4 x i8>* %p
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32:
 ; CHECK:         .functype load_ext_v4i32 (i32) -> (v128)
@@ -1225,9 +1258,9 @@ define <4 x i32> @load_splat_v4i32_with_folded_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32_with_folded_offset:
-; CHECK:         .functype load_sext_v4i32_with_folded_offset (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_folded_offset:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_with_folded_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_s 16
@@ -1240,9 +1273,9 @@ define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32_with_folded_offset:
-; CHECK:         .functype load_zext_v4i32_with_folded_offset (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_folded_offset:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_with_folded_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_u 16
@@ -1255,6 +1288,45 @@ define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_offset:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 16
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i8>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_offset:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 16
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i8>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_with_folded_offset(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32_with_folded_offset:
 ; CHECK:         .functype load_ext_v4i32_with_folded_offset (i32) -> (v128)
@@ -1295,9 +1367,9 @@ define <4 x i32> @load_splat_v4i32_with_folded_gep_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32_with_folded_gep_offset:
-; CHECK:         .functype load_sext_v4i32_with_folded_gep_offset (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_s 8
@@ -1308,9 +1380,9 @@ define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32_with_folded_gep_offset:
-; CHECK:         .functype load_zext_v4i32_with_folded_gep_offset (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32x4.load16x4_u 8
@@ -1321,6 +1393,41 @@ define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 4
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_gep_offset:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 4
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32_with_folded_gep_offset:
 ; CHECK:         .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128)
@@ -1363,9 +1470,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_negative_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_negative_offset:
-; CHECK:         .functype load_sext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const -8
@@ -1378,9 +1485,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_negative_offset:
-; CHECK:         .functype load_zext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const -8
@@ -1393,6 +1500,45 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_negative_offset:
 ; CHECK:         .functype load_ext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
@@ -1441,9 +1587,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_offset:
-; CHECK:         .functype load_sext_v4i32_with_unfolded_offset (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const 16
@@ -1458,9 +1604,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_offset:
-; CHECK:         .functype load_zext_v4i32_with_unfolded_offset (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const 16
@@ -1475,6 +1621,49 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i8>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_offset:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i8>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32_with_unfolded_offset:
 ; CHECK:         .functype load_ext_v4i32_with_unfolded_offset (i32) -> (v128)
@@ -1521,9 +1710,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_offset(i32* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_offset:
-; CHECK:         .functype load_sext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const 8
@@ -1536,9 +1725,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
-; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_offset:
-; CHECK:         .functype load_zext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    i32.const 8
@@ -1551,6 +1740,45 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 4
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
 ; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_offset:
 ; CHECK:         .functype load_ext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
@@ -1591,9 +1819,9 @@ define <4 x i32> @load_splat_v4i32_from_numeric_address() {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_sext_v4i32_from_numeric_address() {
-; CHECK-LABEL: load_sext_v4i32_from_numeric_address:
-; CHECK:         .functype load_sext_v4i32_from_numeric_address () -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_from_numeric_address:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_from_numeric_address () -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    i32.const 0
 ; CHECK-NEXT:    i32x4.load16x4_s 32
@@ -1604,9 +1832,9 @@ define <4 x i32> @load_sext_v4i32_from_numeric_address() {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_from_numeric_address() {
-; CHECK-LABEL: load_zext_v4i32_from_numeric_address:
-; CHECK:         .functype load_zext_v4i32_from_numeric_address () -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_from_numeric_address:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_from_numeric_address () -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    i32.const 0
 ; CHECK-NEXT:    i32x4.load16x4_u 32
@@ -1617,6 +1845,41 @@ define <4 x i32> @load_zext_v4i32_from_numeric_address() {
   ret <4 x i32> %v2
 }
 
+define <4 x i32> @load_sext_v4i8_to_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_numeric_address:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero 32
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_from_numeric_address() {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_numeric_address:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero 32
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <4 x i8>*
+  %v = load <4 x i8>, <4 x i8>* %s
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_from_numeric_address() {
 ; CHECK-LABEL: load_ext_v4i32_from_numeric_address:
 ; CHECK:         .functype load_ext_v4i32_from_numeric_address () -> (v128)
@@ -1656,9 +1919,9 @@ define <4 x i32> @load_splat_v4i32_from_global_address() {
 }
 
 @gv_v4i16 = global <4 x i16> <i16 42, i16 42, i16 42, i16 42>
-define <4 x i32> @load_sext_v4i32_from_global_address() {
-; CHECK-LABEL: load_sext_v4i32_from_global_address:
-; CHECK:         .functype load_sext_v4i32_from_global_address () -> (v128)
+define <4 x i32> @load_sext_v4i16_to_v4i32_from_global_address() {
+; CHECK-LABEL: load_sext_v4i16_to_v4i32_from_global_address:
+; CHECK:         .functype load_sext_v4i16_to_v4i32_from_global_address () -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    i32.const 0
 ; CHECK-NEXT:    i32x4.load16x4_s gv_v4i16
@@ -1668,9 +1931,9 @@ define <4 x i32> @load_sext_v4i32_from_global_address() {
   ret <4 x i32> %v2
 }
 
-define <4 x i32> @load_zext_v4i32_from_global_address() {
-; CHECK-LABEL: load_zext_v4i32_from_global_address:
-; CHECK:         .functype load_zext_v4i32_from_global_address () -> (v128)
+define <4 x i32> @load_zext_v4i16_to_v4i32_from_global_address() {
+; CHECK-LABEL: load_zext_v4i16_to_v4i32_from_global_address:
+; CHECK:         .functype load_zext_v4i16_to_v4i32_from_global_address () -> (v128)
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    i32.const 0
 ; CHECK-NEXT:    i32x4.load16x4_u gv_v4i16
@@ -1680,6 +1943,40 @@ define <4 x i32> @load_zext_v4i32_from_global_address() {
   ret <4 x i32> %v2
 }
 
+@gv_v4i8 = global <4 x i8> <i8 42, i8 42, i8 42, i8 42>
+define <4 x i32> @load_sext_v4i8_to_v4i32_from_global_address() {
+; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_global_address:
+; CHECK:         .functype load_sext_v4i8_to_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero gv_v4i8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shl
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32x4.shr_s
+; CHECK-NEXT:    # fallthrough-return
+  %v = load <4 x i8>, <4 x i8>* @gv_v4i8
+  %v2 = sext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
+define <4 x i32> @load_zext_v4i8_to_v4i32_from_global_address() {
+; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_global_address:
+; CHECK:         .functype load_zext_v4i8_to_v4i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero gv_v4i8
+; CHECK-NEXT:    i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT:    # fallthrough-return
+  %v = load <4 x i8>, <4 x i8>* @gv_v4i8
+  %v2 = zext <4 x i8> %v to <4 x i32>
+  ret <4 x i32> %v2
+}
+
 define <4 x i16> @load_ext_v4i32_from_global_address() {
 ; CHECK-LABEL: load_ext_v4i32_from_global_address:
 ; CHECK:         .functype load_ext_v4i32_from_global_address () -> (v128)
