[Libclc-dev] [PATCH 2/2] R600: improve float vload/vstore path
Aaron Watry
awatry at gmail.com
Fri Jul 18 09:04:20 PDT 2014
Float values can be loaded/stored by casting through int first,
which saves us from having to write float-specific load/store assembly paths
(those can still be written later if deemed desirable).
This cast is the same method we use for unsigned int types. This lets
us write one assembly path for all 32-bit value types (and eventually
i8/i16/i64 paths too).
This results in a fabs(float16) unit test kernel going from 101 lines to
8 lines of LLVM bitcode, and the decompiled shader going from 296 dw and
32 GPRs to 94 dw and 8 GPRs on Evergreen (CEDAR).
Signed-off-by: Aaron Watry <awatry at gmail.com>
---
r600/lib/shared/vload.cl | 4 +++-
r600/lib/shared/vstore.cl | 11 +++++++++--
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/r600/lib/shared/vload.cl b/r600/lib/shared/vload.cl
index 49309c3..79dc976 100644
--- a/r600/lib/shared/vload.cl
+++ b/r600/lib/shared/vload.cl
@@ -35,7 +35,6 @@
VLOAD_ADDR_SPACES(ushort) \
VLOAD_ADDR_SPACES(long) \
VLOAD_ADDR_SPACES(ulong) \
- VLOAD_ADDR_SPACES(float) \
VLOAD_TYPES()
@@ -50,6 +49,8 @@ VLOAD_VECTORIZE(int, __private)
VLOAD_VECTORIZE(int, __local)
VLOAD_VECTORIZE(uint, __private)
VLOAD_VECTORIZE(uint, __local)
+VLOAD_VECTORIZE(float, __private)
+VLOAD_VECTORIZE(float, __local)
//We only define functions for typeN vloadN(), and then just bitcast the result for unsigned types
#define _CLC_VLOAD_ASM_DECL(PRIM_TYPE,LLVM_SCALAR_TYPE,ADDR_SPACE,ADDR_SPACE_ID) \
@@ -80,5 +81,6 @@ _CLC_DECL PRIM_TYPE##16 __clc_vload16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID
_CLC_VLOAD_ASM_DECL(int,i32,__constant,2) \
_CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \
_CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \
+ _CLC_VLOAD_ASM_OVERLOAD_ADDR_SPACES(float,int,i32) \
_CLC_VLOAD_ASM_OVERLOADS()
\ No newline at end of file
diff --git a/r600/lib/shared/vstore.cl b/r600/lib/shared/vstore.cl
index a150849..51890ed 100644
--- a/r600/lib/shared/vstore.cl
+++ b/r600/lib/shared/vstore.cl
@@ -42,7 +42,6 @@
VSTORE_ADDR_SPACES(ushort) \
VSTORE_ADDR_SPACES(long) \
VSTORE_ADDR_SPACES(ulong) \
- VSTORE_ADDR_SPACES(float) \
VSTORE_TYPES()
@@ -55,6 +54,8 @@ VSTORE_VECTORIZE(int, __private)
VSTORE_VECTORIZE(int, __local)
VSTORE_VECTORIZE(uint, __private)
VSTORE_VECTORIZE(uint, __local)
+VSTORE_VECTORIZE(float, __private)
+VSTORE_VECTORIZE(float, __local)
_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) {
mem[3*offset] = vec.s0;
@@ -66,6 +67,11 @@ _CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem)
mem[3*offset+1] = vec.s1;
mem[3*offset+2] = vec.s2;
}
+_CLC_OVERLOAD _CLC_DEF void vstore3(float3 vec, size_t offset, global float *mem) {
+ mem[3*offset] = vec.s0;
+ mem[3*offset+1] = vec.s1;
+ mem[3*offset+2] = vec.s2;
+}
/*Note: R600 doesn't support store <3 x ?>... so
* those functions aren't actually overridden here... lowest-common-denominator
@@ -83,7 +89,7 @@ _CLC_DECL void __clc_vstore16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TY
__clc_vstore##VEC_WIDTH##_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (__builtin_astype(vec, S_PRIM_TYPE##VEC_WIDTH), (ADDR_SPACE S_PRIM_TYPE *)&x[ VEC_WIDTH * offset]); \
} \
-/*Note: R600 back-end doesn't support load <3 x ?>... so
+/*Note: R600 back-end doesn't support store <3 x ?>... so
* those functions aren't actually overridden here... When the back-end supports
* that, then clean add here, and remove the vstore3 definitions from above.
*/
@@ -100,5 +106,6 @@ _CLC_DECL void __clc_vstore16_##LLVM_SCALAR_TYPE##__addr##ADDR_SPACE_ID (PRIM_TY
_CLC_VSTORE_ASM_DECL(int,i32,__global,1) \
_CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(int,int,i32) \
_CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(uint,int,i32) \
+ _CLC_VSTORE_ASM_OVERLOAD_ADDR_SPACES(float,int,i32) \
_CLC_VSTORE_ASM_OVERLOADS()
\ No newline at end of file
--
1.9.1
More information about the Libclc-dev
mailing list