[llvm] 3e53aea - AMDGPU: Make frame index folding logic consistent with eliminateFrameIndex (#129633)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 4 19:09:33 PST 2025
Author: Matt Arsenault
Date: 2025-03-05T10:09:30+07:00
New Revision: 3e53aeae94cfe98486ae3186a3eb627b69b51b77
URL: https://github.com/llvm/llvm-project/commit/3e53aeae94cfe98486ae3186a3eb627b69b51b77
DIFF: https://github.com/llvm/llvm-project/commit/3e53aeae94cfe98486ae3186a3eb627b69b51b77.diff
LOG: AMDGPU: Make frame index folding logic consistent with eliminateFrameIndex (#129633)
This adds handling of s_add_u32, which eliminateFrameIndex handles, and
removes handling of s_or_b32 and s_and_b32, which it does not. I was working
on handling those in #102345, but need to finish that patch. This fixes a regression
exposed by a3165398db0736588daedb07650195502592e567 where the
final instruction would use two literals.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2e66796bcb6bc..91df516b80857 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -232,8 +232,7 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
const unsigned Opc = UseMI.getOpcode();
switch (Opc) {
case AMDGPU::S_ADD_I32:
- case AMDGPU::S_OR_B32:
- case AMDGPU::S_AND_B32:
+ case AMDGPU::S_ADD_U32:
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32:
// TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 413408b417c5a..4417f205646ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -426,4 +426,150 @@ body: |
$sgpr4 = COPY %4
$sgpr5 = COPY %5
SI_RETURN implicit $sgpr4, implicit $sgpr5
+
+...
+
+name: fold_frame_index__s_add_u32__fi_const
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 16384
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_const
+ ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 128, implicit-def $scc
+ ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_ADD_U32 %0, 128, implicit-def $scc
+ $sgpr4 = COPY %1
+ SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__const_fi
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 16384
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_frame_index__s_add_u32__const_fi
+ ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 128, %stack.0, implicit-def $scc
+ ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_ADD_U32 128, %0, implicit-def $scc
+ $sgpr4 = COPY %1
+ SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__fi_inlineimm
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 16384
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_frame_index__s_add_u32__fi_inlineimm
+ ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 %stack.0, 16, implicit-def $scc
+ ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_ADD_U32 %0, 16, implicit-def $scc
+ $sgpr4 = COPY %1
+ SI_RETURN implicit $sgpr4
+...
+
+---
+name: fold_frame_index__s_add_u32__inlineimm_fi
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 16384
+stack:
+ - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: fold_frame_index__s_add_u32__inlineimm_fi
+ ; CHECK: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 16, %stack.0, implicit-def $scc
+ ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_U32_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
+ %0:sreg_32 = S_MOV_B32 %stack.0
+ %1:sreg_32 = S_ADD_U32 16, %0, implicit-def $scc
+ $sgpr4 = COPY %1
+ SI_RETURN implicit $sgpr4
+...
+
+---
+name: no_fold_literal_and_fi_s_or_b32
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 16
+ localFrameSize: 8192
+stack:
+ - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+ - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_fold_literal_and_fi_s_or_b32
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]]
+ %0:sreg_32 = S_MOV_B32 12345
+ %1:sreg_32 = S_MOV_B32 %stack.1
+ %2:sreg_32 = S_AND_B32 killed %1, killed %0, implicit-def dead $scc
+ S_ENDPGM 0, implicit %2
+
+...
+
+---
+name: no_fold_literal_or_fi_s_or_b32
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 16
+ localFrameSize: 8192
+stack:
+ - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+ - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_fold_literal_or_fi_s_or_b32
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]]
+ %0:sreg_32 = S_MOV_B32 12345
+ %1:sreg_32 = S_MOV_B32 %stack.1
+ %2:sreg_32 = S_OR_B32 killed %1, killed %0, implicit-def dead $scc
+ S_ENDPGM 0, implicit %2
+
+...
+
+---
+name: no_fold_literal_and_fi_s_mul_i32
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 16
+ localFrameSize: 8192
+stack:
+ - { id: 0, size: 4096, alignment: 4, local-offset: 0 }
+ - { id: 1, size: 4096, alignment: 16, local-offset: 4096 }
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: no_fold_literal_and_fi_s_mul_i32
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.1
+ ; CHECK-NEXT: [[S_MUL_I32_:%[0-9]+]]:sreg_32 = S_MUL_I32 killed [[S_MOV_B32_]], 12345, implicit-def dead $scc
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MUL_I32_]]
+ %0:sreg_32 = S_MOV_B32 12345
+ %1:sreg_32 = S_MOV_B32 %stack.1
+ %2:sreg_32 = S_MUL_I32 killed %1, killed %0, implicit-def dead $scc
+ S_ENDPGM 0, implicit %2
+
...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
index ab0aa16cf6c09..2bdc3f671897c 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir
@@ -394,8 +394,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
- ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
- ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
%2:vgpr_32 = COPY %1
@@ -410,8 +412,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
- ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
- ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+ ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 128, [[S_MOV_B32_]], implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]]
+ ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
%2:vgpr_32 = COPY %1
@@ -426,8 +430,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
- ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
- ; CHECK-NEXT: SI_RETURN implicit %1
+ ; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
%0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
%1:vgpr_32 = COPY %0
SI_RETURN implicit %1
@@ -441,8 +445,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
- ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
- ; CHECK-NEXT: SI_RETURN implicit %1
+ ; CHECK: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+ ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e64_]]
%0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
%1:vgpr_32 = COPY %0
SI_RETURN implicit %1
@@ -521,8 +525,10 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
- ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
- ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
+ ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], 128, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+ ; CHECK-NEXT: SI_RETURN implicit [[COPY]]
%0:sreg_32 = S_MOV_B32 %stack.0
%1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
%2:vgpr_32 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 004403f46a4d4..7125e7740c10a 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -374,4 +374,46 @@ vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i
ret void
}
+; GCN-LABEL: {{^}}fi_sop2_and_literal_error:
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1fe00
+define amdgpu_kernel void @fi_sop2_and_literal_error() #0 {
+entry:
+ %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+ %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+ %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+ br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
+ store i64 0, ptr addrspace(5) null, align 4
+ %or = and i32 %p2i, -512
+ %icmp = icmp ugt i32 %or, 9999999
+ br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
+ %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+ store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fi_sop2_or_literal_error:
+; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
+define amdgpu_kernel void @fi_sop2_or_literal_error() #0 {
+entry:
+ %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+ %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+ %p2i = ptrtoint ptr addrspace(5) %Total3.i.i to i32
+ br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i, %entry
+ store i64 0, ptr addrspace(5) null, align 4
+ %or = or i32 %p2i, 12345
+ %icmp = icmp ugt i32 %or, 9999999
+ br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i: ; preds = %.shuffle.then.i.i.i.i
+ %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+ store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
index 2cb440b1b7a01..08ea81ad81ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/huge-private-buffer.ll
@@ -7,9 +7,10 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SCRATCH2048K %s
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xfffc, [[FI]]
-; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
+; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; GCN: s_and_b32 s{{[0-9]+}}, [[FI]], 0xfffc
+; GCN: v_mov_b32_e32 [[VFI:v[0-9]+]], [[FI]]{{$}}
+; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}], [[VFI]]
define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 15, ptr addrspace(5) %alloca
@@ -20,11 +21,15 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo16() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
-; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x1fffc, [[FI]]
+; SCRATCH256K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH256K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
+; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x1fffc
+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
%alloca = alloca i32, align 4, addrspace(5)
@@ -36,11 +41,17 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo17() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo18:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K-NOT: v_and_b32
-; SCRATCH1024K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0x3fffc, [[FI]]
+; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH128K-NOT: and_b32
+; SCRATCH256K-NOT: and_b32
+
+; SCRATCH1024K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH1024K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0x3fffc
+
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
%alloca = alloca i32, align 4, addrspace(5)
@@ -52,11 +63,16 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo18() {
}
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo20:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; SCRATCH128K-NOT: v_and_b32
-; SCRATCH256K-NOT: v_and_b32
-; SCRATCH1024K-NOT: v_and_b32
-; SCRATCH2048K: v_and_b32_e32 v{{[0-9]+}}, 0xffffc, [[FI]]
+; SCRATCH128K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH256K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; SCRATCH1024K: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+
+; SCRATCH128K-NOT: and_b32
+; SCRATCH256K-NOT: and_b32
+; SCRATCH1024K-NOT: and_b32
+
+; SCRATCH2048K: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
+; SCRATCH2048K: s_and_b32 s{{[0-9]+}}, [[FI]], 0xffffc
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
%alloca = alloca i32, align 4, addrspace(5)
@@ -69,7 +85,7 @@ define amdgpu_kernel void @scratch_buffer_known_high_masklo20() {
; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo21:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN-NOT: v_and_b32
+; GCN-NOT: and_b32
; GCN: {{flat|global}}_store_{{dword|b32}} v[{{[0-9]+:[0-9]+}}],
define amdgpu_kernel void @scratch_buffer_known_high_masklo21() {
%alloca = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 8ec3b7e2508ac..a3ebaec4811a9 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -224,54 +224,55 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c0, v1
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d4, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12d0, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
+; MUBUF-NEXT: s_movk_i32 s4, 0x4000
+; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c4, v2
-; MUBUF-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
+; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
+; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12cc, v1
-; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c8, v2
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: v_mov_b32_e32 v0, s4
+; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3
+; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
-; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
+; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
-; MUBUF-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6
+; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
+; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
+; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
-; MUBUF-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
+; MUBUF-NEXT: buffer_load_dword v6, v7, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v11, 0x4000
-; MUBUF-NEXT: buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
+; MUBUF-NEXT: buffer_load_dword v7, v8, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000
-; MUBUF-NEXT: buffer_load_dword v3, v10, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT: buffer_load_dword v8, v9, s[0:3], 0 offen offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; MUBUF-NEXT: buffer_load_dword v9, v10, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8
; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
+; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
-; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8
-; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
-; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
+; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
+; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
; MUBUF-NEXT: v_mov_b32_e32 v12, 0
+; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
More information about the llvm-commits
mailing list