[llvm-branch-commits] [llvm] 0c7cce5 - [AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2
Mirko Brkusanin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Dec 10 03:54:29 PST 2020
Author: Mirko Brkusanin
Date: 2020-12-10T12:40:49+01:00
New Revision: 0c7cce54eba3249489530040f41103dd8e0049f7
URL: https://github.com/llvm/llvm-project/commit/0c7cce54eba3249489530040f41103dd8e0049f7
DIFF: https://github.com/llvm/llvm-project/commit/0c7cce54eba3249489530040f41103dd8e0049f7.diff
LOG: [AMDGPU] Resolve issues when picking between ds_read/write and ds_read2/write2
Both ds_read_b128 and ds_read2_b64 are valid for 128bit 16-byte aligned
loads but the one that will be selected is determined either by the order in
tablegen or by the AddedComplexity attribute. Currently ds_read_b128 has
priority.
While ds_read2_b64 has lower alignment requirements, we cannot always
restrict ds_read_b128 to 16-byte alignment because of the unaligned-access-mode
option. This was causing ds_read_b128 to be selected for 8-byte aligned
loads regardless of the chosen access mode.
To resolve this, we use two patterns for selecting ds_read_b128. One
requires 16-byte alignment and the other requires the
unaligned-access-mode option.
Same goes for ds_write2_b64 and ds_write_b128.
Differential Revision: https://reviews.llvm.org/D92767
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
llvm/test/CodeGen/AMDGPU/load-local.128.ll
llvm/test/CodeGen/AMDGPU/store-local.128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index f27ee1975a7f..77063f370976 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1089,11 +1089,6 @@ def isGFX7GFX10 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
-def isGFX7GFX8 :
- Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
- "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<(all_of FeatureSouthernIslands, FeatureCIInsts)>;
-
def isGFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
@@ -1299,6 +1294,9 @@ def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
+def HasUnalignedAccessMode : Predicate<"Subtarget->hasUnalignedAccessMode()">,
+ AssemblerPredicate<(all_of FeatureUnalignedAccessMode)>;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 2e38619e2333..328c81005df4 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -680,7 +680,7 @@ foreach vt = VReg_64.RegTypes in {
defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
}
-let SubtargetPredicate = isGFX7GFX8 in {
+let SubtargetPredicate = isGFX7Plus in {
foreach vt = VReg_96.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
@@ -690,9 +690,7 @@ foreach vt = VReg_128.RegTypes in {
defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
}
-}
-
-let SubtargetPredicate = isGFX9Plus in {
+let SubtargetPredicate = HasUnalignedAccessMode in {
foreach vt = VReg_96.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
@@ -702,7 +700,9 @@ foreach vt = VReg_128.RegTypes in {
defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
}
-}
+} // End SubtargetPredicate = HasUnalignedAccessMode
+
+} // End SubtargetPredicate = isGFX7Plus
} // End AddedComplexity = 100
@@ -835,7 +835,7 @@ foreach vt = VReg_64.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
}
-let SubtargetPredicate = isGFX7GFX8 in {
+let SubtargetPredicate = isGFX7Plus in {
foreach vt = VReg_96.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
@@ -845,9 +845,7 @@ foreach vt = VReg_128.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
}
-}
-
-let SubtargetPredicate = isGFX9Plus in {
+let SubtargetPredicate = HasUnalignedAccessMode in {
foreach vt = VReg_96.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
@@ -857,9 +855,12 @@ foreach vt = VReg_128.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
}
-}
+} // End SubtargetPredicate = HasUnalignedAccessMode
+
+} // End SubtargetPredicate = isGFX7Plus
} // End AddedComplexity = 100
+
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
index e7c646ee73a7..71fc286dc75c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
@@ -50,8 +50,8 @@ body: |
; GFX9-LABEL: name: load_local_v4s32_align_8
; GFX9: liveins: $vgpr0
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+ ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<4 x s32>) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -78,8 +78,8 @@ body: |
; GFX9-LABEL: name: load_local_v4s32_align_8_offset_160
; GFX9: liveins: $vgpr0
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 400, 0, implicit $exec :: (load 16, align 8, addrspace 3)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+ ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 50, 51, 0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 400
%2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -110,8 +110,10 @@ body: |
; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320
; GFX9: liveins: $vgpr0
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 4000, 0, implicit $exec :: (load 16, align 8, addrspace 3)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
+ ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = G_CONSTANT i32 4000
%2:vgpr(p3) = G_PTR_ADD %0, %1
@@ -140,8 +142,8 @@ body: |
; GFX9-LABEL: name: load_local_v2s64
; GFX9: liveins: $vgpr0
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+ ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -196,8 +198,8 @@ body: |
; GFX9-LABEL: name: load_local_s128
; GFX9: liveins: $vgpr0
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 16, align 8, addrspace 3)
- ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_gfx9_]]
+ ; GFX9: [[DS_READ2_B64_gfx9_:%[0-9]+]]:vreg_128 = DS_READ2_B64_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 16, align 8, addrspace 3)
+ ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_gfx9_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s128) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
index b0f664726b26..9edbdfc2c247 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
@@ -1,8 +1,8 @@
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
@@ -22,10 +22,10 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v4:
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_write2_b32
-; GCN-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
@@ -47,10 +47,10 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v3:
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_write2_b32
-; GCN-DAG: ds_write_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_read_b32
+; ALIGNED-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
@@ -106,8 +106,10 @@ bb:
}
; GCN-LABEL: test_local_v4_aligned8:
-; GCN-DAG: ds_read_b128
-; GCN-DAG: ds_write_b128
+; ALIGNED-DAG: ds_read2_b64
+; ALIGNED-DAG: ds_write2_b64
+; UNALIGNED-DAG: ds_read_b128
+; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index 6cc9dbd1793b..c7f74cb3b489 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -267,7 +267,7 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
index 3ad1caa0f0ef..3e6c4aa28581 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -250,7 +250,7 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 76ee829ad793..6aba2b5bf2b7 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=UNALIGNED,VECT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
@@ -22,10 +22,10 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v4:
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_write2_b32
-; GCN-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_write2_b32
; UNALIGNED-DAG: ds_read_b128
; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
@@ -47,10 +47,10 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v3:
-; GCN-DAG: ds_read2_b32
-; GCN-DAG: ds_read_b32
-; GCN-DAG: ds_write2_b32
-; GCN-DAG: ds_write_b32
+; ALIGNED-DAG: ds_read2_b32
+; ALIGNED-DAG: ds_read_b32
+; ALIGNED-DAG: ds_write2_b32
+; ALIGNED-DAG: ds_write_b32
; UNALIGNED-DAG: ds_read_b96
; UNALIGNED-DAG: ds_write_b96
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
@@ -221,8 +221,10 @@ bb:
}
; GCN-LABEL: test_local_v4_aligned8:
-; GCN-DAG: ds_read_b128
-; GCN-DAG: ds_write_b128
+; ALIGNED-DAG: ds_read2_b64
+; ALIGNED-DAG: ds_write2_b64
+; UNALIGNED-DAG: ds_read_b128
+; UNALIGNED-DAG: ds_write_b128
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 716237a9955f..f5bd05a558fe 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -325,7 +325,7 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index ebb200458265..1dc8da1c425a 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -340,10 +340,10 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX7-LABEL: store_lds_v4i32_align8:
More information about the llvm-branch-commits
mailing list