[llvm] 30d6c39 - [AMDGPU] Add merging into S_BUFFER_LOAD_DWORDX8_IMM
Piotr Sobczak via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 2 07:28:25 PDT 2021
Author: Piotr Sobczak
Date: 2021-09-02T16:26:25+02:00
New Revision: 30d6c39bca6cf5b2a9a166fb04954cfcd7b34dec
URL: https://github.com/llvm/llvm-project/commit/30d6c39bca6cf5b2a9a166fb04954cfcd7b34dec
DIFF: https://github.com/llvm/llvm-project/commit/30d6c39bca6cf5b2a9a166fb04954cfcd7b34dec.diff
LOG: [AMDGPU] Add merging into S_BUFFER_LOAD_DWORDX8_IMM
Extend SILoadStoreOptimizer to merge into the DWORDX8 variant of S_BUFFER_LOAD.
Merging into the DWORDX2 and DWORDX4 variants was already handled.
Differential Revision: https://reviews.llvm.org/D108909
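
For context, a minimal standalone sketch of the idea, assuming nothing beyond the opcode names visible in the diff (the local enum and the pickMergedOpcode helper below are hypothetical stand-ins for the AMDGPU opcode enumerators and for the extended switch in getNewOpcode; this is not the pass's actual code): the combined dword width of the two loads being merged now maps to the DWORDX8 opcode once it reaches 8.

#include <cassert>

// Illustrative stand-ins for the real opcode enumerators; the pass itself
// uses llvm::AMDGPU::S_BUFFER_LOAD_DWORD{,X2,X4,X8}_IMM.
enum SBufferLoadOpc {
  S_BUFFER_LOAD_DWORD_IMM,
  S_BUFFER_LOAD_DWORDX2_IMM,
  S_BUFFER_LOAD_DWORDX4_IMM,
  S_BUFFER_LOAD_DWORDX8_IMM,
};

// Hypothetical helper mirroring the extended switch in getNewOpcode(): the
// combined dword width of the two merged loads selects the wider opcode,
// and a width of 8 is now a valid result.
static SBufferLoadOpc pickMergedOpcode(unsigned CombinedWidth) {
  switch (CombinedWidth) {
  case 2:
    return S_BUFFER_LOAD_DWORDX2_IMM;
  case 4:
    return S_BUFFER_LOAD_DWORDX4_IMM;
  case 8:
    return S_BUFFER_LOAD_DWORDX8_IMM;
  default:
    assert(false && "unsupported merged width");
    return S_BUFFER_LOAD_DWORD_IMM;
  }
}

int main() {
  // Two DWORDX4 loads (4 + 4 dwords) can now be combined into one DWORDX8.
  assert(pickMergedOpcode(4 + 4) == S_BUFFER_LOAD_DWORDX8_IMM);
  return 0;
}

Together with the OptimizeListAgain threshold change in the diff (retry while the merged width is still below 8), repeated pairwise merges can build an x8 load out of smaller ones, which is what the new merge-sbuffer-load.mir tests below exercise.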
Added:
llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
Modified:
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 5dd621856a721..58415ecc94327 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -303,6 +303,8 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+ return 8;
case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
@@ -372,6 +374,7 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B32_gfx9:
@@ -413,6 +416,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
}
}
@@ -463,6 +467,7 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
Result.SBase = true;
return Result;
case AMDGPU::DS_READ_B32:
@@ -857,6 +862,7 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
return false;
case 2:
case 4:
+ case 8:
return true;
}
}
@@ -1523,45 +1529,62 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
+ case 8:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
}
case MIMG:
- assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
+ assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
+ "No overlaps");
return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
-SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
+SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
+ const CombineInfo &Paired) {
- if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
- return std::make_pair(0, 0);
+ assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
bool ReverseOrder;
if (CI.InstClass == MIMG) {
- assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
- "No overlaps");
+ assert(
+ (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
+ "No overlaps");
ReverseOrder = CI.DMask > Paired.DMask;
} else
ReverseOrder = CI.Offset > Paired.Offset;
- static const unsigned Idxs[4][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
- {AMDGPU::sub3, 0, 0, 0},
- };
unsigned Idx0;
unsigned Idx1;
- assert(CI.Width >= 1 && CI.Width <= 3);
- assert(Paired.Width >= 1 && Paired.Width <= 3);
+ if (CI.Width + Paired.Width > 4) {
+ assert(CI.Width == 4 && Paired.Width == 4);
- if (ReverseOrder) {
- Idx1 = Idxs[0][Paired.Width - 1];
- Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ if (ReverseOrder) {
+ Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
+ } else {
+ Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
+ Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
+ }
} else {
- Idx0 = Idxs[0][CI.Width - 1];
- Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+
+ assert(CI.Width >= 1 && CI.Width <= 3);
+ assert(Paired.Width >= 1 && Paired.Width <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
+ } else {
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
+ }
}
return std::make_pair(Idx0, Idx1);
@@ -2134,7 +2157,7 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
MachineBasicBlock::iterator NewMI =
mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
- OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
+ OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
break;
}
case BUFFER_LOAD: {
diff --git a/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
new file mode 100644
index 0000000000000..86af43198ad04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-sbuffer-load.mir
@@ -0,0 +1,133 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck %s
+
+# CHECK-LABEL: name: merge_s_buffer_load_x2
+# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 4)
+
+name: merge_s_buffer_load_x2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x4
+# CHECK: S_BUFFER_LOAD_DWORDX4_IMM %0, 0, 0 :: (dereferenceable invariant load (s128), align 4)
+name: merge_s_buffer_load_x4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+ %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+ %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+ %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+ %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+ %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+ %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+ %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_reordered
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 4)
+name: merge_s_buffer_load_x8_reordered
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 4, 0 :: (dereferenceable invariant load (s32))
+ %3:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 28, 0 :: (dereferenceable invariant load (s32))
+ %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 12, 0 :: (dereferenceable invariant load (s32))
+ %6:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ %7:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s32))
+ %8:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x2
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 8)
+name: merge_s_buffer_load_x8_out_of_x2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s64))
+ %2:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 8, 0 :: (dereferenceable invariant load (s64))
+ %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s64))
+ %4:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+
+ S_ENDPGM 0
+...
+---
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_out_of_x4
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_out_of_x4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ %2:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s128))
+
+ S_ENDPGM 0
+...
+---
+
+
+# CHECK-LABEL: name: merge_s_buffer_load_x8_mixed
+# CHECK: S_BUFFER_LOAD_DWORDX8_IMM %0, 0, 0 :: (dereferenceable invariant load (s256), align 16)
+name: merge_s_buffer_load_x8_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:sgpr_128 = S_BUFFER_LOAD_DWORDX4_IMM %0:sgpr_128, 0, 0 :: (dereferenceable invariant load (s128))
+ %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ %3:sgpr_64 = S_BUFFER_LOAD_DWORDX2_IMM %0:sgpr_128, 24, 0 :: (dereferenceable invariant load (s64))
+ %4:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0:sgpr_128, 20, 0 :: (dereferenceable invariant load (s32))
+
+ S_ENDPGM 0
+...
+---