[llvm-branch-commits] [AArch64][GlobalISel] Avoid splitting loads of large vector types into individual element loads (PR #85039)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Mar 13 00:49:17 PDT 2024


llvmbot wrote:


@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: Dhruv Chawla (work) (dc03-work)

Changes:

This patch adds custom legalization for G_LOAD that splits loads of
fixed-width vector types larger than 128 bits into loads of 128-bit
vectors with the same element type.

This improves on the previous behavior, where such loads were split
into individual loads for each element of the vector.
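
For illustration, here is a minimal standalone C++ sketch (not LLVM API code) of the chunking arithmetic the new rule performs, applied to a 384-bit load of 64-bit elements, mirroring the `<6 x p0>` case in the MIR test below (p0 is 64-bit on AArch64). The names `NewEltCount`, `NumVecs`, and `ExtraElems` follow the patch; everything else is hypothetical.

```cpp
#include <cstdio>

int main() {
  // Hypothetical input: a load of 6 elements of 64 bits each (384 bits total).
  unsigned NumElts = 6, EltSize = 64;

  // The custom rule only handles element sizes that divide 128 evenly.
  if (128 % EltSize != 0)
    return 1;

  unsigned NewEltCount = 128 / EltSize;        // elements per 128-bit chunk -> 2
  unsigned NumVecs = NumElts / NewEltCount;    // full 128-bit loads         -> 3
  unsigned ExtraElems = NumElts % NewEltCount; // trailing partial chunk     -> 0

  std::printf("%u full <%u x s%u> loads, %u leftover elements\n",
              NumVecs, NewEltCount, EltSize, ExtraElems);
  return 0;
}
```

Each full chunk is loaded as a 128-bit vector and unmerged into scalar components; any leftover elements get one smaller final load, and the components are rebuilt into the original vector with G_BUILD_VECTOR.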


---

Patch is 77.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/85039.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+70) 
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir (+16-25) 
- (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+204-1272) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 36adada2796531..fc1063b6bd4893 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -356,6 +356,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
+      .customIf([=](const LegalityQuery &Query) {
+        // We need custom legalization for loads greater than 128-bits as they
+        // need to be split up into chunks.
+        return Query.Types[0].isFixedVector() &&
+               Query.Types[0].getSizeInBits() > 128;
+      })
       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                  {s16, p0, s16, 8},
                                  {s32, p0, s32, 8},
@@ -1632,6 +1638,70 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
   Register ValReg = MI.getOperand(0).getReg();
   const LLT ValTy = MRI.getType(ValReg);
 
+  if (ValTy.isFixedVector() && ValTy.getSizeInBits() > 128) {
+    // Break fixed-width vector loads of sizes greater than 128 bits into chunks
+    // of 128-bit vector loads with the same element type.
+    Register LoadReg = MI.getOperand(1).getReg();
+    Register LoadRegWithOffset = LoadReg;
+
+    unsigned EltSize = ValTy.getScalarSizeInBits();
+    // Only support element types which can cleanly divide into 128-bit wide
+    // vectors.
+    if (128 % EltSize != 0)
+      return false;
+
+    unsigned NewEltCount = 128 / EltSize;
+    LLT NewTy = LLT::fixed_vector(NewEltCount, ValTy.getElementType());
+
+    unsigned OldEltCount = ValTy.getNumElements();
+    unsigned NumVecs = OldEltCount / NewEltCount;
+
+    // Create registers to represent each element of ValReg. Load into these,
+    // then combine them at the end.
+    SmallVector<Register, 16> ComponentRegs;
+    for (unsigned i = 0, e = ValTy.getNumElements(); i != e; i++)
+      ComponentRegs.push_back(
+          MRI.createGenericVirtualRegister(ValTy.getElementType()));
+
+    MachineMemOperand &MMO = **MI.memoperands_begin();
+    auto GetMMO = [&MMO, &MI](int64_t Offset, LLT Ty) {
+      return MI.getMF()->getMachineMemOperand(&MMO, Offset, Ty);
+    };
+
+    for (unsigned i = 0, e = NumVecs; i != e; i++) {
+      auto LoadChunk = MIRBuilder.buildLoad(
+          NewTy, LoadRegWithOffset, *GetMMO(i * NewTy.getSizeInBytes(), NewTy));
+
+      auto LoadOffset = MIRBuilder.buildConstant(
+          LLT::scalar(64), (i + 1) * NewTy.getSizeInBytes());
+
+      LoadRegWithOffset =
+          MIRBuilder.buildPtrAdd(MRI.getType(LoadReg), LoadReg, LoadOffset)
+              .getReg(0);
+
+      Register *ChunkFirstReg = ComponentRegs.begin() + (i * NewEltCount);
+      MIRBuilder.buildUnmerge({ChunkFirstReg, ChunkFirstReg + NewEltCount},
+                              LoadChunk.getReg(0));
+    }
+
+    unsigned ExtraElems = OldEltCount % NewEltCount;
+    if (ExtraElems != 0) {
+      LLT ExtraTy = LLT::fixed_vector(ExtraElems, ValTy.getElementType());
+
+      auto ExtraLoadChunk = MIRBuilder.buildLoad(
+          ExtraTy, LoadRegWithOffset,
+          *GetMMO(NumVecs * NewTy.getSizeInBytes(), ExtraTy));
+
+      MIRBuilder.buildUnmerge({ComponentRegs.begin() + (NumVecs * NewEltCount),
+                               ComponentRegs.end()},
+                              ExtraLoadChunk.getReg(0));
+    }
+
+    MIRBuilder.buildBuildVector(ValReg, ComponentRegs);
+    MI.eraseFromParent();
+    return true;
+  }
+
   if (ValTy == LLT::scalar(128)) {
 
     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 5cbb8649d158b0..aa152aea81ff9c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -711,33 +711,24 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 94ab173e9183ac..66ef436f48c637 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2063,228 +2063,52 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v24i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x0, #1]
-; CHECK-GI-BASE-NEXT:    ldr b1, [x0, #8]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x0, #9]
-; CHECK-GI-BASE-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x0, #17]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b3, [x1]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[1], v16.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b4, [x1, #8]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-BASE-NEXT:    mov v3.b[1], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x1, #17]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #2]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #18]
-; CHECK-GI-BASE-NEXT:    mov v4.b[1], v7.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-BASE-NEXT:    mov v5.b[1], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[2], v17.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #2]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #18]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #3]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-BASE-NEXT:    mov v4.b[2], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #3]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #19]
-; CHECK-GI-BASE-NEXT:    mov v5.b[2], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #19]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[3], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[3], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #4]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-BASE-NEXT:    mov v4.b[3], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #4]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-BASE-NEXT:    mov v5.b[3], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #20]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #5]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-BASE-NEXT:    mov v4.b[4], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #5]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #21]
-; CHECK-GI-BASE-NEXT:    mov v5.b[4], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #21]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[5], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[5], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #6]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-BASE-NEXT:    mov v4.b[5], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #6]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-BASE-NEXT:    mov v5.b[5], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #22]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b6, [x0, #7]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-BASE-NEXT:    mov v4.b[6], v18.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b16, [x1, #7]
-; CHECK-GI-BASE-NEXT:    ldr b17, [x0, #23]
-; CHECK-GI-BASE-NEXT:    mov v5.b[6], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-BASE-NEXT:    ldr b19, [x1, #23]
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-BASE-NEXT:    mov v3.b[7], v16.b[0]
-; CHECK-GI-BASE-NEXT:    mov v2.b[7], v17.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v6.b[0]
-; CHECK-GI-BASE-NEXT:    mov v4.b[7], v18.b[0]
-; CHECK-GI-BASE-NEXT:    mov v5.b[7], v19.b[0]
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v3.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umull v3.4s, v4.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v4.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    umull v4.4s, v5.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v5.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
-; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
-; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
-; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
-; CHECK-GI-BASE-NEXT:    fmov w8, s5
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
 ; CHECK-GI-BASE-NEXT:    fmov w11, s1
-; CHECK-GI-BASE-NEXT:    fmov w10, s3
-; CHECK-GI-BASE-NEXT:    fmov w12, s4
-; CHECK-GI-BASE-NEXT:    fmov w13, s2
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
-; CHECK-GI-BASE-NEXT:    add w8, w8, w10
-; CHECK-GI-BASE-NEXT:    add w9, w11, w12
-; CHECK-GI-BASE-NEXT:    add w9, w9, w13
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
 ; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2352,449 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldr b0, [x0]
-; CHECK-GI-BASE-NEXT:    ldr b7, [x0, #1]
-; CHECK-G...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/85039


More information about the llvm-branch-commits mailing list