[llvm] r335827 - AMDGPU: Fix assert on aggregate type kernel arguments

Thu Jun 28 03:18:11 PDT 2018

Author: arsenm
Date: Thu Jun 28 03:18:11 2018
New Revision: 335827

URL: http://llvm.org/viewvc/llvm-project?rev=335827&view=rev
Log:
AMDGPU: Fix assert on aggregate type kernel arguments

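For now, just fix the crash by skipping the sub-dword argument load
optimization (used for arguments smaller than 32 bits) when the
argument has an aggregate type, since figuring out how to properly
convert the bits for an arbitrary struct is a pain.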

Also fix a crash when there is only an empty struct argument.

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
    llvm/trunk/test/CodeGen/AMDGPU/lower-kernargs.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp?rev=335827&r1=335826&r2=335827&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp Thu Jun 28 03:18:11 2018
@@ -99,6 +99,8 @@ bool AMDGPULowerKernelArguments::runOnFu
   // FIXME: Alignment is broken broken with explicit arg offset.;
   const uint64_t TotalKernArgSize = BaseOffset +
     ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
+  if (TotalKernArgSize == 0)
+    return false;
 
   CallInst *KernArgSegment =
     Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
@@ -152,7 +154,7 @@ bool AMDGPULowerKernelArguments::runOnFu
     unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
 
     Value *ArgPtr;
-    if (Size < 32) {
+    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
       // Since we don't have sub-dword scalar loads, avoid doing an extload by
       // loading earlier than the argument address, and extracting the relevant
       // bits.
@@ -218,7 +220,7 @@ bool AMDGPULowerKernelArguments::runOnFu
 
     // TODO: Convert noalias arg to !noalias
 
-    if (Size < 32) {
+    if (Size < 32 && !ArgTy->isAggregateType()) {
       if (IsExtArg && OffsetDiff == 0) {
         Type *I32Ty = Builder.getInt32Ty();
         bool IsSext = Arg.hasSExtAttr();

Modified: llvm/trunk/test/CodeGen/AMDGPU/lower-kernargs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lower-kernargs.ll?rev=335827&r1=335826&r2=335827&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lower-kernargs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/lower-kernargs.ll Thu Jun 28 03:18:11 2018
@@ -1266,6 +1266,177 @@ define amdgpu_kernel void @kern_noalias_
   ret void
 }
 
+define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
+; HSA-LABEL: @struct_i8_i8_arg(
+; HSA-NEXT:  entry:
+; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]] to [[STRUCT_I8_I8_ARG:%.*]] addrspace(4)*
+; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I8_ARG]], [[STRUCT_I8_I8_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0
+; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
+; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
+; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @struct_i8_i8_arg(
+; MESA-NEXT:  entry:
+; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[STRUCT_I8_I8_ARG:%.*]] addrspace(4)*
+; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I8_ARG]], [[STRUCT_I8_I8_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
+; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
+; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
+; MESA-NEXT:    ret void
+;
+entry:
+  %elt0 = extractvalue {i8, i8} %in, 0
+  %elt1 = extractvalue {i8, i8} %in, 1
+  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
+  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
+  ret void
+}
+
+define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
+; HSA-LABEL: @struct_i8_i16_arg(
+; HSA-NEXT:  entry:
+; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]] to [[STRUCT_I8_I16_ARG:%.*]] addrspace(4)*
+; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I16_ARG]], [[STRUCT_I8_I16_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0
+; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
+; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
+; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; HSA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @struct_i8_i16_arg(
+; MESA-NEXT:  entry:
+; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[STRUCT_I8_I16_ARG:%.*]] addrspace(4)*
+; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[STRUCT_I8_I16_ARG]], [[STRUCT_I8_I16_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
+; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
+; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; MESA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
+; MESA-NEXT:    ret void
+;
+entry:
+  %elt0 = extractvalue {i8, i16} %in, 0
+  %elt1 = extractvalue {i8, i16} %in, 1
+  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
+  store volatile i16 %elt1, i16 addrspace(1)* null, align 4
+  ret void
+}
+
+define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
+; HSA-LABEL: @array_2xi8_arg(
+; HSA-NEXT:  entry:
+; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]] to [[ARRAY_2XI8_ARG:%.*]] addrspace(4)*
+; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI8_ARG]], [[ARRAY_2XI8_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0
+; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
+; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
+; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @array_2xi8_arg(
+; MESA-NEXT:  entry:
+; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[ARRAY_2XI8_ARG:%.*]] addrspace(4)*
+; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI8_ARG]], [[ARRAY_2XI8_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
+; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
+; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
+; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
+; MESA-NEXT:    ret void
+;
+entry:
+  %elt0 = extractvalue [2 x i8] %in, 0
+  %elt1 = extractvalue [2 x i8] %in, 1
+  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
+  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
+  ret void
+}
+
+define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
+; HSA-LABEL: @array_2xi1_arg(
+; HSA-NEXT:  entry:
+; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP0:%.*]] = bitcast i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]] to [[ARRAY_2XI1_ARG:%.*]] addrspace(4)*
+; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI1_ARG]], [[ARRAY_2XI1_ARG]] addrspace(4)* [[TMP0]], i32 0, i32 0
+; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
+; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
+; HSA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
+; HSA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @array_2xi1_arg(
+; MESA-NEXT:  entry:
+; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to [[ARRAY_2XI1_ARG:%.*]] addrspace(4)*
+; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[ARRAY_2XI1_ARG]], [[ARRAY_2XI1_ARG]] addrspace(4)* [[TMP1]], i32 0, i32 0
+; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
+; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
+; MESA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
+; MESA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
+; MESA-NEXT:    ret void
+;
+entry:
+  %elt0 = extractvalue [2 x i1] %in, 0
+  %elt1 = extractvalue [2 x i1] %in, 1
+  store volatile i1 %elt0, i1 addrspace(1)* null, align 4
+  store volatile i1 %elt1, i1 addrspace(1)* null, align 4
+  ret void
+}
+
+define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
+; HSA-LABEL: @only_empty_struct(
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @only_empty_struct(
+; MESA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[ONLY_EMPTY_STRUCT:%.*]] addrspace(4)*
+; MESA-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
+; HSA-LABEL: @empty_struct_with_other(
+; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]] to [[EMPTY_STRUCT_WITH_OTHER:%.*]] addrspace(4)*
+; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[EMPTY_STRUCT_WITH_OTHER]], [[EMPTY_STRUCT_WITH_OTHER]] addrspace(4)* [[TMP1]], i32 0, i32 1
+; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load !0
+; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; HSA-NEXT:    ret void
+;
+; MESA-LABEL: @empty_struct_with_other(
+; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[EMPTY_STRUCT_WITH_OTHER:%.*]] addrspace(4)*
+; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[EMPTY_STRUCT_WITH_OTHER]], [[EMPTY_STRUCT_WITH_OTHER]] addrspace(4)* [[TMP2]], i32 0, i32 1
+; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load !0
+; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef
+; MESA-NEXT:    ret void
+;
+  store i32 %arg1, i32 addrspace(1)* undef
+  ret void
+}
+
 attributes #0 = { nounwind "target-cpu"="kaveri" }
 attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
 attributes #2 = { nounwind "target-cpu"="tahiti" }