[llvm] 3e76d01 - [AArch64][CodeGen] Allow vectors larger than hardware support to use SVE's load zero/sign-extend for fixed vectors.

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 19 07:18:37 PDT 2023


Author: Dinar Temirbulatov
Date: 2023-04-19T14:17:32Z
New Revision: 3e76d012d174a2e33564a6d8476c7f88c7d1b4b8

URL: https://github.com/llvm/llvm-project/commit/3e76d012d174a2e33564a6d8476c7f88c7d1b4b8
DIFF: https://github.com/llvm/llvm-project/commit/3e76d012d174a2e33564a6d8476c7f88c7d1b4b8.diff

LOG: [AArch64][CodeGen] Allow vectors larger than hardware support to use SVE's load zero/sign-extend for fixed vectors.

Prefer to fold LOAD + SIGN/ZERO_EXTEND into SVE's load-and-extend instructions
for fixed-length vectors even if the vector type is not representable in hardware.

Differential Revision: https://reviews.llvm.org/D147533

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 514712c737fe..3939f4de416b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5228,9 +5228,7 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
 
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   return ExtVal.getValueType().isScalableVector() ||
-         useSVEForFixedLengthVectorVT(
-             ExtVal.getValueType(),
-             /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
+         Subtarget->useSVEForFixedLengthVectors();
 }
 
 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index 2aec28e783a3..d7bd08628ff5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -73,13 +73,10 @@ define <32 x i32> @load_zext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
 define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov x9, #32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -145,13 +142,10 @@ define <32 x i32> @load_sext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
 define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT:    mov x9, #32
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT:    ld1sh { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_1024-NEXT:    st1w { z1.s }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -170,17 +164,11 @@ define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z1.h, z0.b
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_1024-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT:    uunpklo z1.s, z1.h
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1b { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT:    ld1b { z1.d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -199,17 +187,11 @@ define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    sunpklo z1.h, z0.b
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_1024-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sb { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT:    ld1sb { z1.d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -228,15 +210,10 @@ define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_1024-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -255,15 +232,10 @@ define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    sunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_1024-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sh { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT:    ld1sh { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -282,13 +254,10 @@ define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    uunpklo z1.d, z0.s
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret
@@ -307,13 +276,10 @@ define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT:    mov x9, #16
-; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT:    sunpklo z1.d, z0.s
-; VBITS_GE_1024-NEXT:    ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT:    ld1sw { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_1024-NEXT:    st1d { z1.d }, p0, [x8]
 ; VBITS_GE_1024-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 94b4aad294f0..cefe83639e14 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -63,16 +63,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 {
 define <16 x i32> @load_sext_v16i8i32(ptr %ap)  #0 {
 ; CHECK-LABEL: load_sext_v16i8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z4.h, z1.b
-; CHECK-NEXT:    sunpklo z0.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z3.h
-; CHECK-NEXT:    sunpklo z3.s, z4.h
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    mov w9, #8 // =0x8
+; CHECK-NEXT:    mov w10, #12 // =0xc
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0, x8]
+; CHECK-NEXT:    ld1sb { z2.s }, p0/z, [x0, x9]
+; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x0, x10]
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
@@ -86,10 +84,10 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  #0 {
 define <8 x i32> @load_sext_v8i16i32(ptr %ap)  #0 {
 ; CHECK-LABEL: load_sext_v8i16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    sunpklo z0.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    mov x8, #4 // =0x4
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
@@ -165,25 +163,22 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) #0 {
 define <16 x i64> @load_zext_v16i16i64(ptr %ap)  #0 {
 ; CHECK-LABEL: load_zext_v16i16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z7.s, z1.h
-; CHECK-NEXT:    uunpklo z0.d, z3.s
-; CHECK-NEXT:    uunpklo z5.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z16.s, z2.h
-; CHECK-NEXT:    uunpklo z4.d, z5.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z2.d, z7.s
-; CHECK-NEXT:    uunpklo z6.d, z16.s
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z3.s
-; CHECK-NEXT:    uunpklo z5.d, z5.s
-; CHECK-NEXT:    uunpklo z3.d, z7.s
-; CHECK-NEXT:    uunpklo z7.d, z16.s
+; CHECK-NEXT:    mov x8, #2 // =0x2
+; CHECK-NEXT:    mov x9, #4 // =0x4
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x10, #6 // =0x6
+; CHECK-NEXT:    mov x11, #8 // =0x8
+; CHECK-NEXT:    mov x12, #10 // =0xa
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #12 // =0xc
+; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    mov x9, #14 // =0xe
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x11, lsl #1]
+; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT:    ld1h { z6.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z7.d }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2


        


More information about the llvm-commits mailing list