[polly] r308623 - [PPCG] [1/3] Bump up PPCG version to 0.07.

Siddharth Bhat via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 20 08:48:13 PDT 2017


Author: bollu
Date: Thu Jul 20 08:48:13 2017
New Revision: 308623

URL: http://llvm.org/viewvc/llvm-project?rev=308623&view=rev
Log:
[PPCG] [1/3] Bump up PPCG version to 0.07.

- This commit *WILL NOT COMPILE*, as it checks in vanilla PPCG 0.07
- We choose to introduce this commit into the history to cleanly display
  the Polly-specific changes made to PPCG.

Differential Revision: https://reviews.llvm.org/D35675

Added:
    polly/trunk/lib/External/ppcg/gpu_hybrid.c
    polly/trunk/lib/External/ppcg/gpu_hybrid.h
    polly/trunk/lib/External/ppcg/grouping.c
    polly/trunk/lib/External/ppcg/hybrid.c
    polly/trunk/lib/External/ppcg/hybrid.h
    polly/trunk/lib/External/ppcg/tests/iterator.c
    polly/trunk/lib/External/ppcg/tests/live_out.c
    polly/trunk/lib/External/ppcg/tests/local.c
    polly/trunk/lib/External/ppcg/tests/struct4.c
    polly/trunk/lib/External/ppcg/util.c
Modified:
    polly/trunk/lib/External/pet/include/pet.h
    polly/trunk/lib/External/ppcg/ChangeLog
    polly/trunk/lib/External/ppcg/GIT_HEAD_ID
    polly/trunk/lib/External/ppcg/Makefile.am
    polly/trunk/lib/External/ppcg/README
    polly/trunk/lib/External/ppcg/configure.ac
    polly/trunk/lib/External/ppcg/cpu.c
    polly/trunk/lib/External/ppcg/cuda.c
    polly/trunk/lib/External/ppcg/cuda.h
    polly/trunk/lib/External/ppcg/gpu.c
    polly/trunk/lib/External/ppcg/gpu.h
    polly/trunk/lib/External/ppcg/gpu_array_tile.h
    polly/trunk/lib/External/ppcg/gpu_group.c
    polly/trunk/lib/External/ppcg/gpu_group.h
    polly/trunk/lib/External/ppcg/gpu_print.c
    polly/trunk/lib/External/ppcg/gpu_print.h
    polly/trunk/lib/External/ppcg/gpu_tree.c
    polly/trunk/lib/External/ppcg/gpu_tree.h
    polly/trunk/lib/External/ppcg/opencl.c
    polly/trunk/lib/External/ppcg/opencl_test.sh.in
    polly/trunk/lib/External/ppcg/polybench_test.sh.in
    polly/trunk/lib/External/ppcg/ppcg.c
    polly/trunk/lib/External/ppcg/ppcg.h
    polly/trunk/lib/External/ppcg/ppcg_options.c
    polly/trunk/lib/External/ppcg/ppcg_options.h
    polly/trunk/lib/External/ppcg/print.c
    polly/trunk/lib/External/ppcg/print.h
    polly/trunk/lib/External/ppcg/schedule.c
    polly/trunk/lib/External/ppcg/schedule.h
    polly/trunk/lib/External/ppcg/util.h

Modified: polly/trunk/lib/External/pet/include/pet.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/pet/include/pet.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/pet/include/pet.h (original)
+++ polly/trunk/lib/External/pet/include/pet.h Thu Jul 20 08:48:13 2017
@@ -19,6 +19,9 @@ extern "C" {
 struct pet_options;
 ISL_ARG_DECL(pet_options, struct pet_options, pet_options_args)
 
+/* Create an isl_ctx that references the pet options. */
+isl_ctx *isl_ctx_alloc_with_pet_options();
+
 /* If autodetect is set, any valid scop is extracted.
  * Otherwise, the scop needs to be delimited by pragmas.
  */
@@ -74,6 +77,9 @@ enum pet_op_type {
 	pet_op_sub_assign,
 	pet_op_mul_assign,
 	pet_op_div_assign,
+	pet_op_and_assign,
+	pet_op_xor_assign,
+	pet_op_or_assign,
 	pet_op_assign,
 	pet_op_add,
 	pet_op_sub,
@@ -163,18 +169,20 @@ __isl_give pet_expr *pet_expr_op_set_typ
 __isl_give pet_expr *pet_expr_from_index(__isl_take isl_multi_pw_aff *index);
 
 /* Does "expr" represent an affine expression? */
-int pet_expr_is_affine(__isl_keep pet_expr *expr);
+isl_bool pet_expr_is_affine(__isl_keep pet_expr *expr);
 /* Does the access expression "expr" read the accessed elements? */
-int pet_expr_access_is_read(__isl_keep pet_expr *expr);
+isl_bool pet_expr_access_is_read(__isl_keep pet_expr *expr);
 /* Does the access expression "expr" write to the accessed elements? */
-int pet_expr_access_is_write(__isl_keep pet_expr *expr);
-/* Mark "expr" as a read dependening on "read". */
+isl_bool pet_expr_access_is_write(__isl_keep pet_expr *expr);
+/* Does the access expression "expr" kill the accessed elements? */
+isl_bool pet_expr_access_is_kill(__isl_keep pet_expr *expr);
+/* Mark "expr" as a read depending on "read". */
 __isl_give pet_expr *pet_expr_access_set_read(__isl_take pet_expr *expr,
 	int read);
-/* Mark "expr" as a write dependening on "write". */
+/* Mark "expr" as a write depending on "write". */
 __isl_give pet_expr *pet_expr_access_set_write(__isl_take pet_expr *expr,
 	int write);
-/* Mark "expr" as a kill dependening on "kill". */
+/* Mark "expr" as a kill depending on "kill". */
 __isl_give pet_expr *pet_expr_access_set_kill(__isl_take pet_expr *expr,
 	int kill);
 /* Return the reference identifier of access expression "expr". */
@@ -278,7 +286,8 @@ enum pet_tree_type {
 	pet_tree_if_else,	/* An if with an else branch */
 	pet_tree_for,
 	pet_tree_infinite_loop,
-	pet_tree_while
+	pet_tree_while,
+	pet_tree_return,
 };
 
 struct pet_tree;
@@ -301,6 +310,9 @@ enum pet_tree_type pet_tree_get_type(__i
 /* Return the expression of the expression tree "tree". */
 __isl_give pet_expr *pet_tree_expr_get_expr(__isl_keep pet_tree *tree);
 
+/* Return the expression returned by the return tree "tree". */
+__isl_give pet_expr *pet_tree_return_get_expr(__isl_keep pet_tree *tree);
+
 /* Return the number of children of the block tree "tree". */
 int pet_tree_block_n_child(__isl_keep pet_tree *tree);
 /* Return child "pos" of the block tree "tree". */
@@ -420,7 +432,7 @@ struct pet_type {
  * this array has a valid (i.e., non-negative) size
  *
  * extent holds constraints on the indices
- * 
+ *
  * value_bounds holds constraints on the elements of the array
  * and may be NULL if no such constraints were specified by the user
  *
@@ -436,6 +448,8 @@ struct pet_type {
  *
  * declared is set if the array was declared somewhere inside the scop.
  * exposed is set if the declared array is visible outside the scop.
+ * outer is set if the type of the array elements is a record and
+ * the fields of this record are represented by separate pet_array structures.
  */
 struct pet_array {
 	isl_set *context;
@@ -448,6 +462,7 @@ struct pet_array {
 	int uniquely_defined;
 	int declared;
 	int exposed;
+	int outer;
 };
 
 /* This structure represents an implication on a boolean filter.
@@ -517,6 +532,7 @@ struct pet_scop {
 	int n_independence;
 	struct pet_independence **independences;
 };
+typedef struct pet_scop pet_scop;
 
 /* Return a textual representation of the operator. */
 const char *pet_op_str(enum pet_op_type op);
@@ -526,7 +542,7 @@ int pet_op_is_inc_dec(enum pet_op_type o
  * If function is not NULL, then the pet_scop is extracted from
  * a function with that name.
  */
-struct pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
+__isl_give pet_scop *pet_scop_extract_from_C_source(isl_ctx *ctx,
 	const char *filename, const char *function);
 
 /* Transform the C source file "input" by rewriting each scop
@@ -535,63 +551,69 @@ struct pet_scop *pet_scop_extract_from_C
  */
 int pet_transform_C_source(isl_ctx *ctx, const char *input, FILE *output,
 	__isl_give isl_printer *(*transform)(__isl_take isl_printer *p,
-		struct pet_scop *scop, void *user), void *user);
+		__isl_take pet_scop *scop, void *user), void *user);
 /* Given a scop and a printer passed to a pet_transform_C_source callback,
  * print the original corresponding code to the printer.
  */
-__isl_give isl_printer *pet_scop_print_original(struct pet_scop *scop,
+__isl_give isl_printer *pet_scop_print_original(__isl_keep pet_scop *scop,
 	__isl_take isl_printer *p);
 
 /* Update all isl_sets and isl_maps such that they all have the same
  * parameters in the same order.
  */
-struct pet_scop *pet_scop_align_params(struct pet_scop *scop);
+__isl_give pet_scop *pet_scop_align_params(__isl_take pet_scop *scop);
 
 /* Does "scop" contain any data dependent accesses? */
-int pet_scop_has_data_dependent_accesses(struct pet_scop *scop);
+int pet_scop_has_data_dependent_accesses(__isl_keep pet_scop *scop);
 /* Does "scop" contain any data dependent conditions? */
-int pet_scop_has_data_dependent_conditions(struct pet_scop *scop);
+int pet_scop_has_data_dependent_conditions(__isl_keep pet_scop *scop);
 /* pet_stmt_build_ast_exprs is currently limited to only handle
  * some forms of data dependent accesses.
  * If pet_scop_can_build_ast_exprs returns 1, then pet_stmt_build_ast_exprs
  * can safely be called on all statements in the scop.
  */
-int pet_scop_can_build_ast_exprs(struct pet_scop *scop);
+int pet_scop_can_build_ast_exprs(__isl_keep pet_scop *scop);
 
-void pet_scop_dump(struct pet_scop *scop);
-struct pet_scop *pet_scop_free(struct pet_scop *scop);
+void pet_scop_dump(__isl_keep pet_scop *scop);
+__isl_null pet_scop *pet_scop_free(__isl_take pet_scop *scop);
 
-__isl_give isl_union_set *pet_scop_collect_domains(struct pet_scop *scop);
-/* Collect all potential read access relations. */
-__isl_give isl_union_map *pet_scop_collect_may_reads(struct pet_scop *scop);
-/* Collect all tagged potential read access relations. */
-__isl_give isl_union_map *pet_scop_collect_tagged_may_reads(
-	struct pet_scop *scop);
-/* Collect all potential write access relations. */
-__isl_give isl_union_map *pet_scop_collect_may_writes(struct pet_scop *scop);
-/* Collect all definite write access relations. */
-__isl_give isl_union_map *pet_scop_collect_must_writes(struct pet_scop *scop);
-/* Collect all tagged potential write access relations. */
-__isl_give isl_union_map *pet_scop_collect_tagged_may_writes(
-	struct pet_scop *scop);
-/* Collect all tagged definite write access relations. */
-__isl_give isl_union_map *pet_scop_collect_tagged_must_writes(
-	struct pet_scop *scop);
-/* Collect all definite kill access relations. */
-__isl_give isl_union_map *pet_scop_collect_must_kills(struct pet_scop *scop);
-/* Collect all tagged definite kill access relations. */
-__isl_give isl_union_map *pet_scop_collect_tagged_must_kills(
-	struct pet_scop *scop);
+/* Return the context of "scop". */
+__isl_give isl_set *pet_scop_get_context(__isl_keep pet_scop *scop);
+/* Return the schedule of "scop". */
+__isl_give isl_schedule *pet_scop_get_schedule(__isl_keep pet_scop *scop);
+/* Return the set of all statement instances. */
+__isl_give isl_union_set *pet_scop_get_instance_set(__isl_keep pet_scop *scop);
+/* Return the potential read access relation. */
+__isl_give isl_union_map *pet_scop_get_may_reads(__isl_keep pet_scop *scop);
+/* Return the tagged potential read access relation. */
+__isl_give isl_union_map *pet_scop_get_tagged_may_reads(
+	__isl_keep pet_scop *scop);
+/* Return the potential write access relation. */
+__isl_give isl_union_map *pet_scop_get_may_writes(__isl_keep pet_scop *scop);
+/* Return the definite write access relation. */
+__isl_give isl_union_map *pet_scop_get_must_writes(__isl_keep pet_scop *scop);
+/* Return the tagged potential write access relation. */
+__isl_give isl_union_map *pet_scop_get_tagged_may_writes(
+	__isl_keep pet_scop *scop);
+/* Return the tagged definite write access relation. */
+__isl_give isl_union_map *pet_scop_get_tagged_must_writes(
+	__isl_keep pet_scop *scop);
+/* Return the definite kill access relation. */
+__isl_give isl_union_map *pet_scop_get_must_kills(__isl_keep pet_scop *scop);
+/* Return the tagged definite kill access relation. */
+__isl_give isl_union_map *pet_scop_get_tagged_must_kills(
+	__isl_keep pet_scop *scop);
 
 /* Compute a mapping from all outermost arrays (of structs) in scop
  * to their innermost members.
  */
 __isl_give isl_union_map *pet_scop_compute_outer_to_inner(
-	struct pet_scop *scop);
+	__isl_keep pet_scop *scop);
 /* Compute a mapping from all outermost arrays (of structs) in scop
  * to their members, including the outermost arrays themselves.
  */
-__isl_give isl_union_map *pet_scop_compute_outer_to_any(struct pet_scop *scop);
+__isl_give isl_union_map *pet_scop_compute_outer_to_any(
+	__isl_keep pet_scop *scop);
 
 #if defined(__cplusplus)
 }
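
For reference, a minimal sketch of how the updated pet interface above is
meant to be used (illustrative driver code, not part of this commit; the
file name "input.c" is a placeholder):

    #include <isl/ctx.h>
    #include <isl/union_map.h>
    #include <pet.h>

    int main(void)
    {
        isl_ctx *ctx = isl_ctx_alloc_with_pet_options();
        pet_scop *scop;
        isl_union_map *reads;

        /* Extract the first scop from the file; pass a function name
         * instead of NULL to restrict extraction to that function. */
        scop = pet_scop_extract_from_C_source(ctx, "input.c", NULL);
        if (!scop)
            return 1;
        /* The new getters are annotated __isl_give, so the caller
         * owns (and must free) the returned object. */
        reads = pet_scop_get_may_reads(scop);
        isl_union_map_dump(reads);
        isl_union_map_free(reads);
        pet_scop_free(scop);
        isl_ctx_free(ctx);
        return 0;
    }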

Modified: polly/trunk/lib/External/ppcg/ChangeLog
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/ChangeLog?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/ChangeLog (original)
+++ polly/trunk/lib/External/ppcg/ChangeLog Thu Jul 20 08:48:13 2017
@@ -1,3 +1,25 @@
+version: 0.07
+date: Tue Feb  7 17:23:22 CET 2017
+changes:
+	- support hybrid tiling
+---
+version: 0.06
+date: Fri May  6 12:08:50 CEST 2016
+changes:
+	- use PPCG specific macro names in generated code
+	- complete transition to schedule trees
+	- maximize coincidence by default
+	- map arrays with constant index expressions to private memory
+	- optionally group chains of statements
+---
+version: 0.05
+date: Fri Jan 15 09:30:23 CET 2016
+changes:
+	- fix live-out computation
+	- optionally compute schedule for C target
+	- optionally perform tiling for C target
+	- create single kernel for non-permutable subtree
+---
 version: 0.04
 date: Wed Jun 17 10:52:58 CEST 2015
 changes:

Modified: polly/trunk/lib/External/ppcg/GIT_HEAD_ID
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/GIT_HEAD_ID?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/GIT_HEAD_ID (original)
+++ polly/trunk/lib/External/ppcg/GIT_HEAD_ID Thu Jul 20 08:48:13 2017
@@ -1 +1 @@
-ppcg-0.04
+ppcg-0.07

Modified: polly/trunk/lib/External/ppcg/Makefile.am
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/Makefile.am?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/Makefile.am (original)
+++ polly/trunk/lib/External/ppcg/Makefile.am Thu Jul 20 08:48:13 2017
@@ -40,10 +40,15 @@ ppcg_SOURCES = \
 	gpu_array_tile.h \
 	gpu_group.c \
 	gpu_group.h \
+	gpu_hybrid.c \
+	gpu_hybrid.h \
 	gpu_print.c \
 	gpu_print.h \
 	gpu_tree.c \
 	gpu_tree.h \
+	grouping.c \
+	hybrid.c \
+	hybrid.h \
 	schedule.c \
 	schedule.h \
 	ppcg_options.c \
@@ -52,6 +57,7 @@ ppcg_SOURCES = \
 	ppcg.h \
 	print.c \
 	print.h \
+	util.c \
 	util.h \
 	version.c
 

Modified: polly/trunk/lib/External/ppcg/README
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/README?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/README (original)
+++ polly/trunk/lib/External/ppcg/README Thu Jul 20 08:48:13 2017
@@ -9,7 +9,7 @@ Requirements:
 	(only needed if you want to compile the pet executable)
 - LLVM/clang libraries, 2.9 or higher (http://clang.llvm.org/get_started.html)
 	Unless you have some other reasons for wanting to use the svn version,
-	it is best to install the latest release (3.6).
+	it is best to install the latest release (3.9).
 	For more details, see pet/README.
 
 If you are installing on Ubuntu, then you can install the following packages:
@@ -30,8 +30,7 @@ automake, libtool and pkg-config.
 
 	git clone git://repo.or.cz/ppcg.git
 	cd ppcg
-	git submodule init
-	git submodule update
+	./get_submodules.sh
 	./autogen.sh
 
 
@@ -83,6 +82,11 @@ spaces.  The sizes are specified outermo
 The dimension of the "tile" space indicates the (maximal) number of loop
 dimensions to tile.  The elements of the single integer tuple
 specify the tile sizes in each dimension.
+In case of hybrid tiling, the first element is half the size of
+the tile in the time (sequential) dimension.  The second element
+specifies the number of elements in the base of the hexagon.
+The remaining elements specify the tile sizes in the remaining space
+dimensions.
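
As an illustration (kernel number and sizes invented for this example,
using the isl-map syntax of the --sizes option), a hybrid tiling
specification of the form

	--sizes="{ kernel[0] -> tile[4,32,16] }"

would request a time-tile size of 2*4 = 8, a hexagon base of 32 elements
and a tile size of 16 in the remaining space dimension.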
 
 The dimension of the "grid" space indicates the (maximal) number of block
 dimensions in the grid.  The elements of the single integer tuple
@@ -170,6 +174,17 @@ included files.  The --no-opencl-print-k
 PPCG from generating type definitions.
 
 
+GNU extensions
+
+By default, PPCG may print out macro definitions that involve
+GNU extensions such as __typeof__ and statement expressions.
+Some compilers may not support these extensions.
+In particular, OpenCL 1.2 beignet 1.1.1 (git-6de6918)
+has been reported not to support __typeof__.
+The use of these extensions can be turned off with the
+--no-allow-gnu-extensions option.
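
For instance (an illustrative command line):

	./ppcg --target=opencl --no-allow-gnu-extensions input.c

generates OpenCL code that avoids __typeof__ and statement expressions.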
+
+
 Processing PolyBench
 
 When processing a PolyBench/C 3.2 benchmark, you should always specify
@@ -200,6 +215,11 @@ Contact
 For bug reports, feature requests and questions,
 contact http://groups.google.com/group/isl-development
 
+Whenever you report a bug, please mention the exact version of PPCG
+that you are using (output of "./ppcg --version").  If you are unable
+to compile PPCG, then report the git version (output of "git describe")
+or the version number included in the name of the tarball.
+
 
 Citing PPCG
 

Modified: polly/trunk/lib/External/ppcg/configure.ac
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/configure.ac?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/configure.ac (original)
+++ polly/trunk/lib/External/ppcg/configure.ac Thu Jul 20 08:48:13 2017
@@ -1,4 +1,4 @@
-AC_INIT([ppcg], [0.04], [isl-development at googlegroups.com])
+AC_INIT([ppcg], [0.07], [isl-development at googlegroups.com])
 AC_CONFIG_AUX_DIR([.])
 AC_CONFIG_MACRO_DIR([m4])
 AM_INIT_AUTOMAKE([foreign])
@@ -25,6 +25,7 @@ bundled)
 	ISL_CFLAGS="$ISL_CFLAGS"
 	ppcg_configure_args="$ppcg_configure_args --with-isl-builddir=../isl"
 	ppcg_configure_args="$ppcg_configure_args --with-isl=build"
+	ppcg_configure_args="$ppcg_configure_args --with-clang=system"
 	;;
 build)
 	ISL_BUILDDIR=`echo @abs_builddir@ | $with_isl_builddir/config.status --file=-`

Modified: polly/trunk/lib/External/ppcg/cpu.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/cpu.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/cpu.c (original)
+++ polly/trunk/lib/External/ppcg/cpu.c Thu Jul 20 08:48:13 2017
@@ -1,11 +1,14 @@
 /*
  * Copyright 2012 INRIA Paris-Rocquencourt
+ * Copyright 2012 Ecole Normale Superieure
  *
  * Use of this software is governed by the MIT license
  *
  * Written by Tobias Grosser, INRIA Paris-Rocquencourt,
  * Domaine de Voluceau, Rocquenqourt, B.P. 105,
  * 78153 Le Chesnay Cedex France
+ * and Sven Verdoolaege,
+ * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
  */
 
 #include <limits.h>
@@ -14,14 +17,19 @@
 
 #include <isl/aff.h>
 #include <isl/ctx.h>
+#include <isl/flow.h>
 #include <isl/map.h>
 #include <isl/ast_build.h>
+#include <isl/schedule.h>
+#include <isl/schedule_node.h>
 #include <pet.h>
 
 #include "ppcg.h"
 #include "ppcg_options.h"
 #include "cpu.h"
 #include "print.h"
+#include "schedule.h"
+#include "util.h"
 
 /* Representation of a statement inside a generated AST.
  *
@@ -39,7 +47,6 @@ struct ppcg_stmt {
 static void ppcg_stmt_free(void *user)
 {
 	struct ppcg_stmt *stmt = user;
-	int i;
 
 	if (!stmt)
 		return;
@@ -118,7 +125,7 @@ struct ast_build_userinfo {
 static int ast_schedule_dim_is_parallel(__isl_keep isl_ast_build *build,
 	struct ppcg_scop *scop)
 {
-	isl_union_map *schedule_node, *schedule, *deps;
+	isl_union_map *schedule, *deps;
 	isl_map *schedule_deps, *test;
 	isl_space *schedule_space;
 	unsigned i, dimension, is_parallel;
@@ -228,8 +235,10 @@ static __isl_give isl_id *ast_build_befo
  * 	  that is marked as openmp parallel.
  *
  */
-static __isl_give isl_ast_node *ast_build_after_for(__isl_take isl_ast_node *node,
-        __isl_keep isl_ast_build *build, void *user) {
+static __isl_give isl_ast_node *ast_build_after_for(
+	__isl_take isl_ast_node *node, __isl_keep isl_ast_build *build,
+	void *user)
+{
 	isl_id *id;
 	struct ast_build_userinfo *build_info;
 	struct ast_node_userinfo *info;
@@ -327,7 +336,6 @@ static __isl_give isl_printer *print_for
 	__isl_take isl_ast_print_options *print_options,
 	__isl_keep isl_ast_node *node, void *user)
 {
-	struct ppcg_print_info *print_info;
 	isl_id *id;
 	int openmp;
 
@@ -416,29 +424,75 @@ error:
 	return isl_ast_node_free(node);
 }
 
-/* Set *depth to the number of scheduling dimensions
- * for the schedule of the first domain.
- * We assume here that this number is the same for all domains.
+/* Set *depth (initialized to 0 by the caller) to the maximum
+ * of the schedule depths of the leaf nodes for which this function is called.
  */
-static isl_stat set_depth(__isl_take isl_map *map, void *user)
+static isl_bool update_depth(__isl_keep isl_schedule_node *node, void *user)
 {
-	unsigned *depth = user;
+	int *depth = user;
+	int node_depth;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_leaf)
+		return isl_bool_true;
+	node_depth = isl_schedule_node_get_schedule_depth(node);
+	if (node_depth > *depth)
+		*depth = node_depth;
+
+	return isl_bool_false;
+}
+
+/* This function is called for each node in a CPU AST.
+ * In case of a user node, print the macro definitions required
+ * for printing the AST expressions in the annotation, if any.
+ * For other nodes, return true such that descendants are also
+ * visited.
+ *
+ * In particular, print the macro definitions needed for the substitutions
+ * of the original user statements.
+ */
+static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
+{
+	struct ppcg_stmt *stmt;
+	isl_id *id;
+	isl_printer **p = user;
 
-	*depth = isl_map_dim(map, isl_dim_out);
+	if (isl_ast_node_get_type(node) != isl_ast_node_user)
+		return isl_bool_true;
 
-	isl_map_free(map);
-	return isl_stat_error;
+	id = isl_ast_node_get_annotation(node);
+	stmt = isl_id_get_user(id);
+	isl_id_free(id);
+
+	if (!stmt)
+		return isl_bool_error;
+
+	*p = ppcg_print_body_macros(*p, stmt->ref2expr);
+	if (!*p)
+		return isl_bool_error;
+
+	return isl_bool_false;
+}
+
+/* Print the required macros for the CPU AST "node" to "p",
+ * including those needed for the user statements inside the AST.
+ */
+static __isl_give isl_printer *cpu_print_macros(__isl_take isl_printer *p,
+	__isl_keep isl_ast_node *node)
+{
+	if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0)
+		return isl_printer_free(p);
+	p = ppcg_print_macros(p, node);
+	return p;
 }
 
-/* Code generate the scop 'scop' and print the corresponding C code to 'p'.
+/* Code generate the scop 'scop' using "schedule"
+ * and print the corresponding C code to 'p'.
  */
 static __isl_give isl_printer *print_scop(struct ppcg_scop *scop,
-	__isl_take isl_printer *p, struct ppcg_options *options)
+	__isl_take isl_schedule *schedule, __isl_take isl_printer *p,
+	struct ppcg_options *options)
 {
 	isl_ctx *ctx = isl_printer_get_ctx(p);
-	isl_set *context;
-	isl_union_set *domain_set;
-	isl_union_map *schedule_map;
 	isl_ast_build *build;
 	isl_ast_print_options *print_options;
 	isl_ast_node *tree;
@@ -446,14 +500,12 @@ static __isl_give isl_printer *print_sco
 	struct ast_build_userinfo build_info;
 	int depth;
 
-	context = isl_set_copy(scop->context);
-	domain_set = isl_union_set_copy(scop->domain);
-	schedule_map = isl_schedule_get_map(scop->schedule);
-	schedule_map = isl_union_map_intersect_domain(schedule_map, domain_set);
-
-	isl_union_map_foreach_map(schedule_map, &set_depth, &depth);
+	depth = 0;
+	if (isl_schedule_foreach_schedule_node_top_down(schedule, &update_depth,
+						&depth) < 0)
+		goto error;
 
-	build = isl_ast_build_from_context(context);
+	build = isl_ast_build_alloc(ctx);
 	iterators = ppcg_scop_generate_names(scop, depth, "c");
 	build = isl_ast_build_set_iterators(build, iterators);
 	build = isl_ast_build_set_at_each_domain(build, &at_each_domain, scop);
@@ -470,7 +522,7 @@ static __isl_give isl_printer *print_sco
 							&build_info);
 	}
 
-	tree = isl_ast_build_node_from_schedule_map(build, schedule_map);
+	tree = isl_ast_build_node_from_schedule(build, schedule);
 	isl_ast_build_free(build);
 
 	print_options = isl_ast_print_options_alloc(ctx);
@@ -480,21 +532,188 @@ static __isl_give isl_printer *print_sco
 	print_options = isl_ast_print_options_set_print_for(print_options,
 							&print_for, NULL);
 
-	p = ppcg_print_macros(p, tree);
+	p = cpu_print_macros(p, tree);
 	p = isl_ast_node_print(tree, p, print_options);
 
 	isl_ast_node_free(tree);
 
 	return p;
+error:
+	isl_schedule_free(schedule);
+	isl_printer_free(p);
+	return NULL;
 }
 
-/* Generate CPU code for the scop "ps" and print the corresponding C code
- * to "p", including variable declarations.
+/* Tile the band node "node" with tile sizes "sizes" and
+ * mark all members of the resulting tile node as "atomic".
  */
-__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
-	struct ppcg_scop *ps, struct ppcg_options *options)
+static __isl_give isl_schedule_node *tile(__isl_take isl_schedule_node *node,
+	__isl_take isl_multi_val *sizes)
+{
+	node = isl_schedule_node_band_tile(node, sizes);
+	node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
+
+	return node;
+}
+
+/* Tile "node", if it is a band node with at least 2 members.
+ * The tile sizes are set from the "tile_size" option.
+ */
+static __isl_give isl_schedule_node *tile_band(
+	__isl_take isl_schedule_node *node, void *user)
+{
+	struct ppcg_scop *scop = user;
+	int n;
+	isl_space *space;
+	isl_multi_val *sizes;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return node;
+
+	n = isl_schedule_node_band_n_member(node);
+	if (n <= 1)
+		return node;
+
+	space = isl_schedule_node_band_get_space(node);
+	sizes = ppcg_multi_val_from_int(space, scop->options->tile_size);
+
+	return tile(node, sizes);
+}
+
+/* Construct schedule constraints from the dependences in ps
+ * for the purpose of computing a schedule for a CPU.
+ *
+ * The proximity constraints are set to the flow dependences.
+ *
+ * If live-range reordering is allowed then the conditional validity
+ * constraints are set to the order dependences with the flow dependences
+ * as condition.  That is, a live-range (flow dependence) will be either
+ * local to an iteration of a band or all adjacent order dependences
+ * will be respected by the band.
+ * The validity constraints are set to the union of the flow dependences
+ * and the forced dependences, while the coincidence constraints
+ * are set to the union of the flow dependences, the forced dependences and
+ * the order dependences.
+ *
+ * If live-range reordering is not allowed, then both the validity
+ * and the coincidence constraints are set to the union of the flow
+ * dependences and the false dependences.
+ *
+ * Note that the coincidence constraints are only set when the "openmp"
+ * option is set.  Even though the way openmp pragmas are introduced
+ * does not rely on the coincident property of the schedule band members,
+ * the coincidence constraints do affect the way the schedule is constructed,
+ * such that more schedule dimensions should be detected as parallel
+ * by ast_schedule_dim_is_parallel.
+ * Since the order dependences are also taken into account by
+ * ast_schedule_dim_is_parallel, they are also added to
+ * the coincidence constraints.  If the openmp handling learns
+ * how to privatize some memory, then the corresponding order
+ * dependences can be removed from the coincidence constraints.
+ */
+static __isl_give isl_schedule_constraints *construct_cpu_schedule_constraints(
+	struct ppcg_scop *ps)
+{
+	isl_schedule_constraints *sc;
+	isl_union_map *validity, *coincidence;
+
+	sc = isl_schedule_constraints_on_domain(isl_union_set_copy(ps->domain));
+	if (ps->options->live_range_reordering) {
+		sc = isl_schedule_constraints_set_conditional_validity(sc,
+				isl_union_map_copy(ps->tagged_dep_flow),
+				isl_union_map_copy(ps->tagged_dep_order));
+		validity = isl_union_map_copy(ps->dep_flow);
+		validity = isl_union_map_union(validity,
+				isl_union_map_copy(ps->dep_forced));
+		if (ps->options->openmp) {
+			coincidence = isl_union_map_copy(validity);
+			coincidence = isl_union_map_union(coincidence,
+					isl_union_map_copy(ps->dep_order));
+		}
+	} else {
+		validity = isl_union_map_copy(ps->dep_flow);
+		validity = isl_union_map_union(validity,
+				isl_union_map_copy(ps->dep_false));
+		if (ps->options->openmp)
+			coincidence = isl_union_map_copy(validity);
+	}
+	if (ps->options->openmp)
+		sc = isl_schedule_constraints_set_coincidence(sc, coincidence);
+	sc = isl_schedule_constraints_set_validity(sc, validity);
+	sc = isl_schedule_constraints_set_proximity(sc,
+					isl_union_map_copy(ps->dep_flow));
+
+	return sc;
+}
+
+/* Compute a schedule for the scop "ps".
+ *
+ * First derive the appropriate schedule constraints from the dependences
+ * in "ps" and then compute a schedule from those schedule constraints,
+ * possibly grouping statement instances based on the input schedule.
+ */
+static __isl_give isl_schedule *compute_cpu_schedule(struct ppcg_scop *ps)
+{
+	isl_schedule_constraints *sc;
+	isl_schedule *schedule;
+
+	if (!ps)
+		return NULL;
+
+	sc = construct_cpu_schedule_constraints(ps);
+
+	if (ps->options->debug->dump_schedule_constraints)
+		isl_schedule_constraints_dump(sc);
+	schedule = ppcg_compute_schedule(sc, ps->schedule, ps->options);
+
+	return schedule;
+}
+
+/* Compute a new schedule for the scop "ps" if the reschedule option is set.
+ * Otherwise, return a copy of the original schedule.
+ */
+static __isl_give isl_schedule *optionally_compute_schedule(void *user)
+{
+	struct ppcg_scop *ps = user;
+
+	if (!ps)
+		return NULL;
+	if (!ps->options->reschedule)
+		return isl_schedule_copy(ps->schedule);
+	return compute_cpu_schedule(ps);
+}
+
+/* Compute a schedule based on the dependences in "ps" and
+ * tile it if requested by the user.
+ */
+static __isl_give isl_schedule *get_schedule(struct ppcg_scop *ps,
+	struct ppcg_options *options)
+{
+	isl_ctx *ctx;
+	isl_schedule *schedule;
+
+	if (!ps)
+		return NULL;
+
+	ctx = isl_union_set_get_ctx(ps->domain);
+	schedule = ppcg_get_schedule(ctx, options,
+				    &optionally_compute_schedule, ps);
+	if (ps->options->tile)
+		schedule = isl_schedule_map_schedule_node_bottom_up(schedule,
+							&tile_band, ps);
+
+	return schedule;
+}
+
+/* Generate CPU code for the scop "ps" using "schedule" and
+ * print the corresponding C code to "p", including variable declarations.
+ */
+static __isl_give isl_printer *print_cpu_with_schedule(
+	__isl_take isl_printer *p, struct ppcg_scop *ps,
+	__isl_take isl_schedule *schedule, struct ppcg_options *options)
 {
 	int hidden;
+	isl_set *context;
 
 	p = isl_printer_start_line(p);
 	p = isl_printer_print_str(p, "/* ppcg generated CPU code */");
@@ -503,30 +722,61 @@ __isl_give isl_printer *print_cpu(__isl_
 	p = isl_printer_start_line(p);
 	p = isl_printer_end_line(p);
 
-	p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
+	p = ppcg_set_macro_names(p);
 	p = ppcg_print_exposed_declarations(p, ps);
 	hidden = ppcg_scop_any_hidden_declarations(ps);
 	if (hidden) {
 		p = ppcg_start_block(p);
 		p = ppcg_print_hidden_declarations(p, ps);
 	}
+
+	context = isl_set_copy(ps->context);
+	context = isl_set_from_params(context);
+	schedule = isl_schedule_insert_context(schedule, context);
 	if (options->debug->dump_final_schedule)
-		isl_schedule_dump(ps->schedule);
-	p = print_scop(ps, p, options);
+		isl_schedule_dump(schedule);
+	p = print_scop(ps, schedule, p, options);
 	if (hidden)
 		p = ppcg_end_block(p);
 
 	return p;
 }
 
-/* Wrapper around print_cpu for use as a ppcg_transform callback.
+/* Generate CPU code for the scop "ps" and print the corresponding C code
+ * to "p", including variable declarations.
+ */
+__isl_give isl_printer *print_cpu(__isl_take isl_printer *p,
+	struct ppcg_scop *ps, struct ppcg_options *options)
+{
+	isl_schedule *schedule;
+
+	schedule = isl_schedule_copy(ps->schedule);
+	return print_cpu_with_schedule(p, ps, schedule, options);
+}
+
+/* Generate CPU code for "scop" and print it to "p".
+ *
+ * First obtain a schedule for "scop" and then print code for "scop"
+ * using that schedule.
+ */
+static __isl_give isl_printer *generate(__isl_take isl_printer *p,
+	struct ppcg_scop *scop, struct ppcg_options *options)
+{
+	isl_schedule *schedule;
+
+	schedule = get_schedule(scop, options);
+
+	return print_cpu_with_schedule(p, scop, schedule, options);
+}
+
+/* Wrapper around generate for use as a ppcg_transform callback.
  */
 static __isl_give isl_printer *print_cpu_wrap(__isl_take isl_printer *p,
 	struct ppcg_scop *scop, void *user)
 {
 	struct ppcg_options *options = user;
 
-	return print_cpu(p, scop, options);
+	return generate(p, scop, options);
 }
 
 /* Transform the code in the file called "input" by replacing

Modified: polly/trunk/lib/External/ppcg/cuda.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/cuda.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/cuda.c (original)
+++ polly/trunk/lib/External/ppcg/cuda.c Thu Jul 20 08:48:13 2017
@@ -56,9 +56,13 @@ static __isl_give isl_printer *declare_d
 	if (!array->linearize && array->n_index > 1) {
 		p = isl_printer_print_str(p, ")");
 		for (i = 1; i < array->n_index; i++) {
+			isl_ast_expr *bound;
+			bound = isl_ast_expr_get_op_arg(array->bound_expr,
+							1 + i);
 			p = isl_printer_print_str(p, "[");
-			p = isl_printer_print_pw_aff(p, array->bound[i]);
+			p = isl_printer_print_ast_expr(p, bound);
 			p = isl_printer_print_str(p, "]");
+			isl_ast_expr_free(bound);
 		}
 	}
 	p = isl_printer_print_str(p, ";");
@@ -89,8 +93,11 @@ static __isl_give isl_printer *allocate_
 	int i;
 
 	for (i = 0; i < prog->n_array; ++i) {
+		struct gpu_array_info *array = &prog->array[i];
+
 		if (!gpu_array_requires_device_allocation(&prog->array[i]))
 			continue;
+		p = ppcg_ast_expr_print_macros(array->bound_expr, p);
 		p = isl_printer_start_line(p);
 		p = isl_printer_print_str(p,
 			"cudaCheckReturn(cudaMalloc((void **) &dev_");
@@ -105,6 +112,24 @@ static __isl_give isl_printer *allocate_
 	return p;
 }
 
+static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
+	struct gpu_prog *prog)
+{
+	int i;
+
+	for (i = 0; i < prog->n_array; ++i) {
+		if (!gpu_array_requires_device_allocation(&prog->array[i]))
+			continue;
+		p = isl_printer_start_line(p);
+		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
+		p = isl_printer_print_str(p, prog->array[i].name);
+		p = isl_printer_print_str(p, "));");
+		p = isl_printer_end_line(p);
+	}
+
+	return p;
+}
+
 /* Print code to "p" for copying "array" from the host to the device
  * in its entirety.  The bounds on the extent of "array" have
  * been precomputed in extract_array_info and are used in
@@ -153,20 +178,20 @@ static __isl_give isl_printer *copy_arra
 	return p;
 }
 
-static isl_printer *print_reverse_list(isl_printer *p, int len, int *list)
+static void print_reverse_list(FILE *out, int len, int *list)
 {
 	int i;
 
-	if (len == 0)
-		return p;
+	if (!out || len == 0)
+		return;
 
-        p = isl_printer_print_str(p, "(");
+	fprintf(out, "(");
 	for (i = 0; i < len; ++i) {
 		if (i)
-                        p = isl_printer_print_str(p, ", ");
-                p = isl_printer_print_int(p, list[len - 1 - i]);
+			fprintf(out, ", ");
+		fprintf(out, "%d", list[len - 1 - i]);
 	}
-        return isl_printer_print_str(p, ")");
+	fprintf(out, ")");
 }
 
 /* Print the effective grid size as a list of the sizes in each
@@ -184,11 +209,11 @@ static __isl_give isl_printer *print_gri
 
 	p = isl_printer_print_str(p, "(");
 	for (i = dim - 1; i >= 0; --i) {
-		isl_pw_aff *bound;
+		isl_ast_expr *bound;
 
-		bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
-		p = isl_printer_print_pw_aff(p, bound);
-		isl_pw_aff_free(bound);
+		bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
+		p = isl_printer_print_ast_expr(p, bound);
+		isl_ast_expr_free(bound);
 
 		if (i > 0)
 			p = isl_printer_print_str(p, ", ");
@@ -469,8 +494,8 @@ static void print_kernel(struct gpu_prog
 
 	p = print_kernel_vars(p, kernel);
 	p = isl_printer_end_line(p);
-	p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
-	p = ppcg_print_macros(p, kernel->tree);
+	p = ppcg_set_macro_names(p);
+	p = gpu_print_macros(p, kernel->tree);
 
 	print_options = isl_ast_print_options_alloc(ctx);
 	print_options = isl_ast_print_options_set_print_user(print_options,
@@ -481,15 +506,46 @@ static void print_kernel(struct gpu_prog
 	fprintf(cuda->kernel_c, "}\n");
 }
 
-/* Print a statement for copying an array to or from the device.
- * The statement identifier is called "to_device_<array name>" or
- * "from_device_<array name>" and its user pointer points
- * to the gpu_array_info of the array that needs to be copied.
+/* Print code for initializing the device for execution of the transformed
+ * code.  This includes declaring locally defined variables as well as
+ * declaring and allocating the required copies of arrays on the device.
+ */
+static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
+	struct gpu_prog *prog)
+{
+	p = print_cuda_macros(p);
+
+	p = gpu_print_local_declarations(p, prog);
+	p = declare_device_arrays(p, prog);
+	p = allocate_device_arrays(p, prog);
+
+	return p;
+}
+
+/* Print code for clearing the device after execution of the transformed code.
+ * In particular, free the memory that was allocated on the device.
+ */
+static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
+	struct gpu_prog *prog)
+{
+	p = free_device_arrays(p, prog);
+
+	return p;
+}
+
+/* Print a statement for copying an array to or from the device,
+ * or for initializing or clearing the device.
+ * The statement identifier of a copying node is called
+ * "to_device_<array name>" or "from_device_<array name>" and
+ * its user pointer points to the gpu_array_info of the array
+ * that needs to be copied.
+ * The node for initializing the device is called "init_device".
+ * The node for clearing the device is called "clear_device".
  *
- * Extract the array from the identifier and call
- * copy_array_to_device or copy_array_from_device.
+ * Extract the array (if any) from the identifier and call
+ * init_device, clear_device, copy_array_to_device or copy_array_from_device.
  */
-static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
+static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
 	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
 {
 	isl_ast_expr *expr, *arg;
@@ -507,7 +563,11 @@ static __isl_give isl_printer *print_to_
 	isl_ast_expr_free(expr);
 
 	if (!name)
-		array = NULL;
+		return isl_printer_free(p);
+	if (!strcmp(name, "init_device"))
+		return init_device(p, prog);
+	if (!strcmp(name, "clear_device"))
+		return clear_device(p, prog);
 	if (!array)
 		return isl_printer_free(p);
 
@@ -524,17 +584,18 @@ struct print_host_user_data {
 
 /* Print the user statement of the host code to "p".
  *
- * The host code may contain original user statements, kernel launches and
- * statements that copy data to/from the device.
+ * The host code may contain original user statements, kernel launches,
+ * statements that copy data to/from the device and statements
+ * that initialize or clear the device.
  * The original user statements and the kernel launches have
- * an associated annotation, while the data copy statements do not.
- * The latter are handled by print_to_from_device.
+ * an associated annotation, while the other statements do not.
+ * The latter are handled by print_device_node.
  * The annotation on the user statements is called "user".
  *
  * In case of a kernel launch, print a block of statements that
  * defines the grid and the block and then launches the kernel.
  */
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
+static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 	__isl_take isl_ast_print_options *print_options,
 	__isl_keep isl_ast_node *node, void *user)
 {
@@ -550,7 +611,7 @@ __isl_give isl_printer *print_host_user(
 
 	id = isl_ast_node_get_annotation(node);
 	if (!id)
-		return print_to_from_device(p, node, data->prog);
+		return print_device_node(p, node, data->prog);
 
 	is_user = !strcmp(isl_id_get_name(id), "user");
 	kernel = is_user ? NULL : isl_id_get_user(id);
@@ -560,16 +621,14 @@ __isl_give isl_printer *print_host_user(
 	if (is_user)
 		return ppcg_kernel_print_domain(p, stmt);
 
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "{");
-	p = isl_printer_end_line(p);
-	p = isl_printer_indent(p, 2);
+	p = ppcg_start_block(p);
 
 	p = isl_printer_start_line(p);
 	p = isl_printer_print_str(p, "dim3 k");
 	p = isl_printer_print_int(p, kernel->id);
 	p = isl_printer_print_str(p, "_dimBlock");
-	p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
+	print_reverse_list(isl_printer_get_file(p),
+				kernel->n_block, kernel->block_dim);
 	p = isl_printer_print_str(p, ";");
 	p = isl_printer_end_line(p);
 
@@ -591,17 +650,12 @@ __isl_give isl_printer *print_host_user(
 	p = isl_printer_print_str(p, "cudaCheckKernel();");
 	p = isl_printer_end_line(p);
 
-	p = isl_printer_indent(p, -2);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "}");
-	p = isl_printer_end_line(p);
+	p = ppcg_end_block(p);
 
 	p = isl_printer_start_line(p);
 	p = isl_printer_end_line(p);
 
-#if 0
 	print_kernel(data->prog, kernel, data->cuda);
-#endif
 
 	return p;
 }
@@ -618,30 +672,12 @@ static __isl_give isl_printer *print_hos
 	print_options = isl_ast_print_options_set_print_user(print_options,
 						&print_host_user, &data);
 
-	p = ppcg_print_macros(p, tree);
+	p = gpu_print_macros(p, tree);
 	p = isl_ast_node_print(tree, p, print_options);
 
 	return p;
 }
 
-static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
-	struct gpu_prog *prog)
-{
-	int i;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		if (!gpu_array_requires_device_allocation(&prog->array[i]))
-			continue;
-		p = isl_printer_start_line(p);
-		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
-		p = isl_printer_print_str(p, prog->array[i].name);
-		p = isl_printer_print_str(p, "));");
-		p = isl_printer_end_line(p);
-	}
-
-	return p;
-}
-
 /* Given a gpu_prog "prog" and the corresponding transformed AST
  * "tree", print the entire CUDA code to "p".
  * "types" collects the types for which a definition has already
@@ -662,20 +698,8 @@ static __isl_give isl_printer *print_cud
 	if (!kernel)
 		return isl_printer_free(p);
 
-	p = ppcg_start_block(p);
-
-	p = print_cuda_macros(p);
-
-	p = gpu_print_local_declarations(p, prog);
-	p = declare_device_arrays(p, prog);
-	p = allocate_device_arrays(p, prog);
-
 	p = print_host_code(p, prog, tree, cuda);
 
-	p = free_device_arrays(p, prog);
-
-	p = ppcg_end_block(p);
-
 	return p;
 }
 

Modified: polly/trunk/lib/External/ppcg/cuda.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/cuda.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/cuda.h (original)
+++ polly/trunk/lib/External/ppcg/cuda.h Thu Jul 20 08:48:13 2017
@@ -6,8 +6,5 @@
 
 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 	const char *input);
-__isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *print_options,
-	__isl_keep isl_ast_node *node, void *user);
 
 #endif

Modified: polly/trunk/lib/External/ppcg/gpu.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu.c (original)
+++ polly/trunk/lib/External/ppcg/gpu.c Thu Jul 20 08:48:13 2017
@@ -1,6 +1,7 @@
 /*
  * Copyright 2010-2011 INRIA Saclay
  * Copyright 2012-2013 Ecole Normale Superieure
+ * Copyright 2015-2016 Sven Verdoolaege
  *
  * Use of this software is governed by the MIT license
  *
@@ -28,7 +29,9 @@
 #include "gpu.h"
 #include "gpu_array_tile.h"
 #include "gpu_group.h"
+#include "gpu_hybrid.h"
 #include "gpu_tree.h"
+#include "hybrid.h"
 #include "schedule.h"
 #include "ppcg_options.h"
 #include "print.h"
@@ -55,7 +58,7 @@ static const char *get_outer_array_name(
 /* Collect all references to the given array and store pointers to them
  * in array->refs.
  */
-void collect_references(struct gpu_prog *prog,
+static void collect_references(struct gpu_prog *prog,
 	struct gpu_array_info *array)
 {
 	int i;
@@ -156,6 +159,20 @@ static int is_read_only_scalar(struct gp
 	return empty;
 }
 
+/* Is "array" only accessed as individual, fixed elements?
+ * That is, does each access to "array" access a single, fixed element?
+ */
+static isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
+{
+	int i;
+
+	for (i = 0; i < array->n_ref; ++i)
+		if (!array->refs[i]->fixed_element)
+			return isl_bool_false;
+
+	return isl_bool_true;
+}
+
 /* Compute bounds on the host array "pa" based on the corresponding
  * accessed elements in "arrays"
  * and collect all references to the array.
@@ -169,22 +186,18 @@ static int extract_array_info(struct gpu
 	struct gpu_array_info *info, struct pet_array *pa,
 	__isl_keep isl_union_set *arrays)
 {
-	int i, empty;
+	int empty;
 	const char *name;
 	int n_index;
-	isl_pw_aff **bounds;
+	isl_multi_pw_aff *bounds;
 	isl_set *accessed, *extent;
 
 	n_index = isl_set_dim(pa->extent, isl_dim_set);
 	name = isl_set_get_tuple_name(pa->extent);
-	bounds = isl_alloc_array(prog->ctx, isl_pw_aff *, n_index);
-	if (!bounds)
-		return -1;
 
 	info->space = isl_set_get_space(pa->extent);
 	info->name = strdup(name);
 	info->n_index = n_index;
-	info->bound = bounds;
 	info->linearize = prog->scop->options->linearize_device_arrays;
 
 	info->type = strdup(pa->element_type);
@@ -193,6 +206,7 @@ static int extract_array_info(struct gpu
 	info->has_compound_element = pa->element_is_record;
 	info->read_only_scalar = is_read_only_scalar(info, prog);
 
+	info->declared_extent = isl_set_copy(pa->extent);
 	accessed = isl_union_set_extract_set(arrays,
 					    isl_space_copy(info->space));
 	empty = isl_set_is_empty(accessed);
@@ -202,35 +216,16 @@ static int extract_array_info(struct gpu
 	if (empty < 0)
 		return -1;
 	info->accessed = !empty;
-	for (i = 0; i < n_index; ++i) {
-		isl_set *dom;
-		isl_local_space *ls;
-		isl_aff *one;
-		isl_pw_aff *bound;
-
-		dom = isl_set_copy(extent);
-		dom = isl_set_project_out(dom, isl_dim_set, i + 1,
-					    n_index - (i + 1));
-		dom = isl_set_project_out(dom, isl_dim_set, 0, i);
-		if (!isl_set_dim_has_upper_bound(dom, isl_dim_set, 0)) {
-			fprintf(stderr, "unable to determine extent of '%s' "
-				"in dimension %d\n", info->name, i);
-			dom = isl_set_free(dom);
-		}
-		bound = isl_set_dim_max(dom, 0);
-		dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
-		ls = isl_local_space_from_space(isl_set_get_space(dom));
-		one = isl_aff_zero_on_domain(ls);
-		one = isl_aff_add_constant_si(one, 1);
-		bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
-		bound = isl_pw_aff_gist(bound, isl_set_copy(prog->context));
-
-		bounds[i] = bound;
-		if (!isl_pw_aff_is_cst(bound))
-			info->linearize = 1;
-	}
+	bounds = ppcg_size_from_extent(isl_set_copy(extent));
+	bounds = isl_multi_pw_aff_gist(bounds, isl_set_copy(prog->context));
+	if (!bounds)
+		return -1;
+	if (!isl_multi_pw_aff_is_cst(bounds))
+		info->linearize = 1;
+	info->bound = bounds;
 
 	collect_references(prog, info);
+	info->only_fixed_element = only_fixed_element_accessed(info);
 
 	return 0;
 }
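
For illustration of the new bounds computation (example values invented),
ppcg_size_from_extent computes, per array dimension, the maximal index
plus one, just as the removed per-dimension isl_set_dim_max loop did:

	extent:  { A[i, j] : 0 <= i < n and 0 <= j < 64 }
	size:    [n] -> { A[(n), (64)] }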
@@ -238,7 +233,7 @@ static int extract_array_info(struct gpu
 /* Remove independence from the order constraints "order" on array "array".
  * Since the pairs of iterations in the filter relation of an independence
  * are guaranteed to be completely independent by the user, there is
- * no need to ensure that live ranges are ordered along thong pairs.
+ * no need to ensure that live ranges are ordered along those pairs.
  * We make an exception for local variables, though, as the independence
  * guarantee does not apply to those.
  *
@@ -277,7 +272,7 @@ static __isl_give isl_union_map *remove_
  * the same array, the target of these order dependences will also
  * be one of these references.)
  * Additionally, store the union of these array->dep_order relations
- * for all non-scalar arrays in prog->array_order.
+ * for all arrays that cannot be mapped to private memory in prog->array_order.
  */
 void collect_order_dependences(struct gpu_prog *prog)
 {
@@ -313,7 +308,7 @@ void collect_order_dependences(struct gp
 		order = remove_independences(prog, array, order);
 		array->dep_order = order;
 
-		if (gpu_array_is_scalar(array) && !array->has_compound_element)
+		if (gpu_array_can_be_private(array))
 			continue;
 
 		prog->array_order = isl_union_map_union(prog->array_order,
@@ -330,6 +325,7 @@ void collect_order_dependences(struct gp
  * elements by "prog".
  * If there are any member accesses involved, then they are first mapped
  * to the outer arrays of structs.
+ * Only extract gpu_array_info entries for these outer arrays.
  *
  * If we are allowing live range reordering, then also set
  * the dep_order field.  Otherwise leave it NULL.
@@ -353,10 +349,21 @@ static int collect_array_info(struct gpu
 	prog->array = isl_calloc_array(prog->ctx,
 				     struct gpu_array_info, prog->n_array);
 	assert(prog->array);
-	for (i = 0; i < prog->scop->pet->n_array; ++i)
-		if (extract_array_info(prog, &prog->array[i],
+	prog->n_array = 0;
+	for (i = 0; i < prog->scop->pet->n_array; ++i) {
+		isl_bool field;
+
+		field = isl_set_is_wrapping(prog->scop->pet->arrays[i]->extent);
+		if (field < 0)
+			break;
+		if (field)
+			continue;
+		if (extract_array_info(prog, &prog->array[prog->n_array++],
 					prog->scop->pet->arrays[i], arrays) < 0)
 			r = -1;
+	}
+	if (i < prog->scop->pet->n_array)
+		r = -1;
 
 	isl_union_set_free(arrays);
 
@@ -368,17 +375,17 @@ static int collect_array_info(struct gpu
 
 static void free_array_info(struct gpu_prog *prog)
 {
-	int i, j;
+	int i;
 
 	for (i = 0; i < prog->n_array; ++i) {
-		int n_index = prog->array[i].n_index;
 		free(prog->array[i].type);
 		free(prog->array[i].name);
-		for (j = 0; j < n_index; ++j)
-			isl_pw_aff_free(prog->array[i].bound[j]);
+		isl_multi_pw_aff_free(prog->array[i].bound);
+		isl_ast_expr_free(prog->array[i].bound_expr);
 		isl_space_free(prog->array[i].space);
+		isl_set_free(prog->array[i].declared_extent);
 		isl_set_free(prog->array[i].extent);
-		free(prog->array[i].bound);
+		isl_ast_expr_free(prog->array[i].declared_size);
 		free(prog->array[i].refs);
 		isl_union_map_free(prog->array[i].dep_order);
 	}
@@ -395,6 +402,17 @@ int gpu_array_is_scalar(struct gpu_array
 	return array->n_index == 0;
 }
 
+/* Can "array" be mapped to private memory?
+ * That is, is it only accessed as individual elements with
+ * constant index expressions?
+ */
+isl_bool gpu_array_can_be_private(struct gpu_array_info *array)
+{
+	if (!array)
+		return isl_bool_error;
+	return array->only_fixed_element;
+}
+
 /* Is "array" a read-only scalar?
  */
 int gpu_array_is_read_only_scalar(struct gpu_array_info *array)
@@ -438,7 +456,7 @@ __isl_give isl_set *gpu_array_positive_s
 		isl_pw_aff *bound;
 		isl_set *guard_i, *zero;
 
-		bound = isl_pw_aff_copy(array->bound[i]);
+		bound = isl_multi_pw_aff_get_pw_aff(array->bound, i);
 		guard_i = isl_pw_aff_nonneg_set(isl_pw_aff_copy(bound));
 		zero = isl_pw_aff_zero_set(bound);
 		guard_i = isl_set_subtract(guard_i, zero);
@@ -804,7 +822,7 @@ static __isl_give isl_set *array_extent(
 		aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
 						isl_dim_set, i);
 		index = isl_pw_aff_from_aff(aff);
-		bound = isl_pw_aff_copy(array->bound[i]);
+		bound = isl_multi_pw_aff_get_pw_aff(array->bound, i);
 		bound = isl_pw_aff_from_range(bound);
 		bound = isl_pw_aff_add_dims(bound, isl_dim_in, array->n_index);
 		bound = isl_pw_aff_set_tuple_id(bound, isl_dim_in,
@@ -818,8 +836,8 @@ static __isl_give isl_set *array_extent(
 	return extent;
 }
 
-/* Return a map from the first group->depth dimensions of the computed
- * schedule to the array tile in
+/* Return a map from the first group->shared_tile->depth dimensions
+ * of the computed schedule to the array tile in
  * global memory that corresponds to the shared memory copy.
  *
  * In particular, return a map
@@ -873,15 +891,15 @@ static __isl_give isl_map *group_tile(st
 
 /* Given a mapping "iterator_map" from the AST schedule to a domain,
  * return the corresponding mapping from the AST schedule to
- * to the outer kernel->shared_schedule_dim dimensions of
+ * the outer kernel->copy_schedule_dim dimensions of
  * the schedule computed by PPCG for this kernel.
  *
- * Note that kernel->shared_schedule_dim is at least as large as
+ * Note that kernel->copy_schedule_dim is at least as large as
  * the largest depth of any array reference group associated to the kernel.
  * This is needed as the returned schedule is used to extract a mapping
- * to the outer group->depth dimensions in transform_index.
+ * to the outer tile->depth dimensions in transform_index.
  */
-static __isl_give isl_pw_multi_aff *compute_sched_to_shared(
+static __isl_give isl_pw_multi_aff *compute_sched_to_copy(
 	struct ppcg_kernel *kernel, __isl_take isl_pw_multi_aff *iterator_map)
 {
 	isl_union_pw_multi_aff *upma;
@@ -891,9 +909,9 @@ static __isl_give isl_pw_multi_aff *comp
 	space = isl_space_range(isl_pw_multi_aff_get_space(iterator_map));
 	space = isl_space_from_domain(space);
 	space = isl_space_add_dims(space, isl_dim_out,
-					kernel->shared_schedule_dim);
+					kernel->copy_schedule_dim);
 
-	upma = isl_union_pw_multi_aff_copy(kernel->shared_schedule);
+	upma = isl_union_pw_multi_aff_copy(kernel->copy_schedule);
 	pma = isl_union_pw_multi_aff_extract_pw_multi_aff(upma, space);
 	isl_union_pw_multi_aff_free(upma);
 
@@ -929,11 +947,11 @@ static void check_shared_memory_bound(st
 
 		for (j = 0; j < local->n_group; ++j) {
 			struct gpu_array_ref_group *group;
+			enum ppcg_group_access_type type;
 
 			group = local->groups[j];
-			if (group->private_tile)
-				continue;
-			if (!group->shared_tile)
+			type = gpu_array_ref_group_type(group);
+			if (type != ppcg_access_shared)
 				continue;
 
 			size = gpu_array_tile_size(group->shared_tile);
@@ -991,41 +1009,6 @@ static void compute_group_tilings(struct
 	}
 }
 
-/* Compute the size of a bounding box around the origin and "set",
- * where "set" is assumed to contain only non-negative elements.
- * In particular, compute the maximal value of "set" in each direction
- * and add one.
- */
-static __isl_give isl_multi_pw_aff *extract_size(__isl_take isl_set *set,
-	__isl_take isl_set *context)
-{
-	int i, n;
-	isl_multi_pw_aff *mpa;
-
-	context = isl_set_params(context);
-	n = isl_set_dim(set, isl_dim_set);
-	mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
-	for (i = 0; i < n; ++i) {
-		isl_space *space;
-		isl_aff *one;
-		isl_pw_aff *bound;
-
-		bound = isl_set_dim_max(isl_set_copy(set), i);
-		bound = isl_pw_aff_coalesce(bound);
-		bound = isl_pw_aff_gist(bound, isl_set_copy(context));
-
-		space = isl_pw_aff_get_domain_space(bound);
-		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
-		one = isl_aff_add_constant_si(one, 1);
-		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
-		mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
-	}
-	isl_set_free(set);
-	isl_set_free(context);
-
-	return mpa;
-}
-
 /* Compute the effective grid size as a list of the sizes in each dimension.
  *
  * The grid size specified by the user or set by default
@@ -1050,6 +1033,8 @@ static __isl_give isl_multi_pw_aff *extr
 {
 	int i;
 	isl_set *grid;
+	isl_set *context;
+	isl_multi_pw_aff *size;
 
 	domain = isl_union_set_intersect(domain,
 				    isl_union_set_copy(kernel->block_filter));
@@ -1068,7 +1053,10 @@ static __isl_give isl_multi_pw_aff *extr
 		grid = isl_set_project_out(grid, isl_dim_param, pos, 1);
 	}
 
-	return extract_size(grid, isl_set_copy(kernel->context));
+	grid = isl_set_coalesce(grid);
+	size = ppcg_size_from_extent(grid);
+	context = isl_set_params(isl_set_copy(kernel->context));
+	return isl_multi_pw_aff_gist(size, context);
 }
 
 /* Compute the size of a fixed bounding box around the origin and "set",
@@ -1110,7 +1098,7 @@ static void extract_fixed_size(__isl_tak
  * to the smallest block size that ensures that all threads
  * that actually execute code are included in the block.
  *
- * The possible values of the thread ids is obtained from
+ * The set of possible values of the thread ids is obtained from
  * the domain elements "domain" and kernel->thread_filter.
  * The current implementation eliminates all parameters, ensuring
  * that the size is a fixed constant in each dimension.
@@ -1118,7 +1106,7 @@ static void extract_fixed_size(__isl_tak
  * We would have to make sure to project out all b%d and t%d parameters,
  * however.
  */
-static void extract_block_size(struct ppcg_kernel *kernel,
+static isl_stat extract_block_size(struct ppcg_kernel *kernel,
 	__isl_take isl_union_set *domain)
 {
 	int i;
@@ -1134,17 +1122,28 @@ static void extract_block_size(struct pp
 		int pos;
 		isl_id *id;
 
+		if (!block)
+			return isl_stat_error;
+
 		id = isl_id_list_get_id(kernel->thread_ids, i);
 		pos = isl_set_find_dim_by_id(block, isl_dim_param, id);
 		isl_id_free(id);
-		assert(pos >= 0);
+		if (pos < 0)
+			isl_die(isl_set_get_ctx(block), isl_error_internal,
+				"missing constraints on thread identifier",
+				block = isl_set_free(block));
 		block = isl_set_equate(block, isl_dim_param, pos,
 					isl_dim_set, i);
 	}
 	nparam = isl_set_dim(block, isl_dim_param);
 	block = isl_set_project_out(block, isl_dim_param, 0, nparam);
 
+	if (!block)
+		return isl_stat_error;
+
 	extract_fixed_size(block, kernel->block_dim);
+
+	return isl_stat_ok;
 }
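
Aside: the assert is replaced here by isl's usual error idiom: isl_die
records the error on the ctx and executes the recovery statement, and the
isl_stat return value lets callers propagate the failure instead of
aborting the process.  A minimal illustration of the same idiom on a
hypothetical helper (not part of this patch):

static __isl_give isl_set *check_nonempty(__isl_take isl_set *set)
{
	isl_bool empty;

	empty = isl_set_is_empty(set);
	if (empty < 0)
		return isl_set_free(set);
	if (empty)
		isl_die(isl_set_get_ctx(set), isl_error_invalid,
			"unexpected empty set",
			return isl_set_free(set));
	return set;
}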
 
 struct ppcg_kernel *ppcg_kernel_free(struct ppcg_kernel *kernel)
@@ -1157,14 +1156,17 @@ struct ppcg_kernel *ppcg_kernel_free(str
 	isl_id_list_free(kernel->block_ids);
 	isl_id_list_free(kernel->thread_ids);
 	isl_multi_pw_aff_free(kernel->grid_size);
+	isl_ast_expr_free(kernel->grid_size_expr);
 	isl_set_free(kernel->context);
 	isl_union_set_free(kernel->core);
 	isl_union_set_free(kernel->arrays);
+	isl_union_pw_multi_aff_free(kernel->contraction);
+	isl_union_set_free(kernel->expanded_domain);
 	isl_space_free(kernel->space);
 	isl_ast_node_free(kernel->tree);
 	isl_union_set_free(kernel->block_filter);
 	isl_union_set_free(kernel->thread_filter);
-	isl_union_pw_multi_aff_free(kernel->shared_schedule);
+	isl_union_pw_multi_aff_free(kernel->copy_schedule);
 	isl_union_set_free(kernel->sync_writes);
 
 	for (i = 0; i < kernel->n_array; ++i) {
@@ -1174,7 +1176,8 @@ struct ppcg_kernel *ppcg_kernel_free(str
 			gpu_array_ref_group_free(array->groups[j]);
 		free(array->groups);
 
-		isl_pw_aff_list_free(array->bound);
+		isl_multi_pw_aff_free(array->bound);
+		isl_ast_expr_free(array->bound_expr);
 	}
 	free(kernel->array);
 
@@ -1204,16 +1207,11 @@ static void create_kernel_var(isl_ctx *c
 	int j;
 	struct gpu_array_tile *tile;
 	isl_printer *p;
-	char *name;
 
 	var->array = group->array;
 
-	tile = group->private_tile;
-	var->type = ppcg_access_private;
-	if (!tile) {
-		tile = group->shared_tile;
-		var->type = ppcg_access_shared;
-	}
+	var->type = gpu_array_ref_group_type(group);
+	tile = gpu_array_ref_group_tile(group);
 
 	p = isl_printer_to_str(ctx);
 	p = gpu_array_ref_group_print_name(group, p);
@@ -1237,7 +1235,10 @@ static int create_kernel_vars(struct ppc
 
 		for (j = 0; j < array->n_group; ++j) {
 			struct gpu_array_ref_group *group = array->groups[j];
-			if (group->private_tile || group->shared_tile)
+			enum ppcg_group_access_type type;
+
+			type = gpu_array_ref_group_type(group);
+			if (type != ppcg_access_global)
 				++n;
 		}
 	}
@@ -1253,7 +1254,10 @@ static int create_kernel_vars(struct ppc
 
 		for (j = 0; j < array->n_group; ++j) {
 			struct gpu_array_ref_group *group = array->groups[j];
-			if (!group->private_tile && !group->shared_tile)
+			enum ppcg_group_access_type type;
+
+			type = gpu_array_ref_group_type(group);
+			if (type == ppcg_access_global)
 				continue;
 			create_kernel_var(kernel->ctx, group, &kernel->var[n]);
 			++n;
@@ -1304,27 +1308,27 @@ static void localize_bounds(struct ppcg_
 
 	for (i = 0; i < kernel->n_array; ++i) {
 		struct gpu_local_array_info *local = &kernel->array[i];
-		isl_pw_aff_list *bound;
+		isl_multi_pw_aff *bound;
 		int n_index;
 
 		if (local->n_group == 0)
 			continue;
 
 		n_index = local->array->n_index;
-		bound = isl_pw_aff_list_alloc(kernel->ctx, n_index);
+		bound = isl_multi_pw_aff_copy(local->array->bound);
 
 		for (j = 0; j < n_index; ++j) {
 			isl_pw_aff *pwaff;
 			int empty;
 
-			pwaff = isl_pw_aff_copy(local->array->bound[j]);
+			pwaff = isl_multi_pw_aff_get_pw_aff(bound, j);
 			pwaff = isl_pw_aff_gist(pwaff, isl_set_copy(context));
 			empty = isl_pw_aff_is_empty(pwaff);
 			if (empty < 0)
 				pwaff = isl_pw_aff_free(pwaff);
 			else if (empty)
 				pwaff = set_universally_zero(pwaff);
-			bound = isl_pw_aff_list_add(bound, pwaff);
+			bound = isl_multi_pw_aff_set_pw_aff(bound, j, pwaff);
 		}
 
 		local->n_index = n_index;
@@ -1384,7 +1388,6 @@ static struct gpu_stmt *find_stmt(struct
 
 void ppcg_kernel_stmt_free(void *user)
 {
-	int i;
 	struct ppcg_kernel_stmt *stmt = user;
 
 	if (!stmt)
@@ -1441,7 +1444,7 @@ static int find_array_index(struct ppcg_
  * "accesses" is the list of gpu_stmt_access in the statement.
  * "iterator_map" expresses the statement iterators in terms of
  * the AST loop iterators.
- * "sched2shared" expresses the outer shared_schedule_dim dimensions of
+ * "sched2copy" expresses the outer copy_schedule_dim dimensions of
  * the kernel schedule in terms of the AST loop iterators and
  * may be NULL if we are not inside a kernel.
  *
@@ -1453,11 +1456,10 @@ static int find_array_index(struct ppcg_
  * to the current kernel.
  */
 struct ppcg_transform_data {
-        struct ppcg_options *options;
 	struct ppcg_kernel *kernel;
 	struct gpu_stmt_access *accesses;
 	isl_pw_multi_aff *iterator_map;
-	isl_pw_multi_aff *sched2shared;
+	isl_pw_multi_aff *sched2copy;
 
 	struct gpu_array_info *array;
 	int global;
@@ -1484,6 +1486,66 @@ static struct gpu_array_ref_group *find_
 	return NULL;
 }
 
+/* Given an index expression "index" of the form
+ *
+ *	L -> F(A),
+ *
+ * with F(A) either A or some subfield of A and L the AST loop iterators,
+ * and a tiling "tiling" of the form
+ *
+ *	[L -> A] -> T
+ *
+ * apply the tiling to the outer array in the index expression to obtain
+ *
+ *	L -> T(A)
+ *
+ * If F(A) is some subfield of A, then separate the member access
+ * into the base index expression and the field index expression,
+ * apply the tiling to the base index expression and combine the result
+ * with the field index expression.
+ *
+ * If F(A) is A, then modify index to keep track of the iterators
+ *
+ *	L -> [L -> A]
+ *
+ * and combine the result with the tiling to obtain a tiled index expression
+ * in terms of the AST loop iterators
+ *
+ *	L -> T
+ */
+static __isl_give isl_multi_pw_aff *tile_outer(
+	__isl_take isl_multi_pw_aff *index, __isl_take isl_multi_pw_aff *tiling)
+{
+	isl_bool is_wrapping;
+	isl_space *space;
+	isl_multi_pw_aff *mpa;
+
+	is_wrapping = isl_multi_pw_aff_range_is_wrapping(index);
+	if (is_wrapping < 0)
+		goto error;
+	if (is_wrapping) {
+		isl_multi_pw_aff *field;
+
+		field = isl_multi_pw_aff_copy(index);
+		field = isl_multi_pw_aff_range_factor_range(field);
+		index = isl_multi_pw_aff_range_factor_domain(index);
+		index = tile_outer(index, tiling);
+		return isl_multi_pw_aff_range_product(index, field);
+	}
+
+	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
+	space = isl_space_map_from_set(space);
+	mpa = isl_multi_pw_aff_identity(space);
+	index = isl_multi_pw_aff_range_product(mpa, index);
+	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
+
+	return index;
+error:
+	isl_multi_pw_aff_free(index);
+	isl_multi_pw_aff_free(tiling);
+	return NULL;
+}
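
Aside: a standalone illustration of the non-wrapping case, with
hypothetical spaces and an arbitrarily chosen tile size of 32; it
exercises the same three isl calls as the code above:

#include <isl/ctx.h>
#include <isl/space.h>
#include <isl/aff.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_multi_pw_aff *index, *tiling, *id;
	isl_space *space;

	/* An index expression L -> A and a tiling [L -> A] -> T. */
	index = isl_multi_pw_aff_read_from_str(ctx,
		"{ [i] -> A[i] }");
	tiling = isl_multi_pw_aff_read_from_str(ctx,
		"{ [[i] -> A[a]] -> T[a mod 32] }");

	/* Build L -> [L -> A], then compose with the tiling. */
	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
	space = isl_space_map_from_set(space);
	id = isl_multi_pw_aff_identity(space);
	index = isl_multi_pw_aff_range_product(id, index);
	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);

	/* Should print something like { [i] -> T[i mod 32] }. */
	isl_multi_pw_aff_dump(index);

	isl_multi_pw_aff_free(index);
	isl_ctx_free(ctx);
	return 0;
}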
+
 /* Index transformation callback for pet_stmt_build_ast_exprs.
  *
  * "index" expresses the array indices in terms of statement iterators
@@ -1504,7 +1566,7 @@ static struct gpu_array_ref_group *find_
  *
  *	[D -> A] -> T
  *
- * where D corresponds to the outer group->depth dimensions of
+ * where D corresponds to the outer tile->depth dimensions of
  * the kernel schedule.
  * The index is of the form
  *
@@ -1514,14 +1576,16 @@ static struct gpu_array_ref_group *find_
  *
  *	[L -> A] -> T
  *
- * and modify index to keep track of those iterators
- *
- *	L -> [L -> A]
- *
- * Combining these two yields a tiled index expression in terms
+ * and combine it with the index to obtain a tiled index expression in terms
  * of the AST loop iterators
  *
  *	L -> T
+ *
+ * Note that while the tiling applies directly to an outer array,
+ * the index may refer to some subfield of this outer array.
+ * In such cases, the result will refer to the same subfield of the tile.
+ * That is, an index expression of the form  L -> F(A) will be transformed
+ * into an index expression of the form L -> F(T).
  */
 static __isl_give isl_multi_pw_aff *transform_index(
 	__isl_take isl_multi_pw_aff *index, __isl_keep isl_id *ref_id,
@@ -1538,7 +1602,6 @@ static __isl_give isl_multi_pw_aff *tran
 	isl_space *space;
 	isl_multi_pw_aff *tiling;
 	isl_pw_multi_aff *pma;
-	isl_multi_pw_aff *mpa;
 	isl_pw_multi_aff *sched2depth;
 
 	data->array = NULL;
@@ -1570,30 +1633,25 @@ static __isl_give isl_multi_pw_aff *tran
 		return index;
 	}
 
-	tile = group->private_tile;
-	if (!tile)
-		tile = group->shared_tile;
+	tile = gpu_array_ref_group_tile(group);
 	data->global = !tile;
 	if (!tile)
 		return index;
 
-	space = isl_space_range(isl_multi_pw_aff_get_space(index));
+	space = isl_space_domain(isl_multi_aff_get_space(tile->tiling));
+	space = isl_space_range(isl_space_unwrap(space));
 	space = isl_space_map_from_set(space);
 	pma = isl_pw_multi_aff_identity(space);
-	sched2depth = isl_pw_multi_aff_copy(data->sched2shared);
+	sched2depth = isl_pw_multi_aff_copy(data->sched2copy);
 	dim = isl_pw_multi_aff_dim(sched2depth, isl_dim_out);
 	sched2depth = isl_pw_multi_aff_drop_dims(sched2depth, isl_dim_out,
-					    group->depth, dim - group->depth);
+					    tile->depth, dim - tile->depth);
 	pma = isl_pw_multi_aff_product(sched2depth, pma);
 	tiling = isl_multi_pw_aff_from_multi_aff(
 				    isl_multi_aff_copy(tile->tiling));
 	tiling = isl_multi_pw_aff_pullback_pw_multi_aff(tiling, pma);
 
-	space = isl_space_domain(isl_multi_pw_aff_get_space(index));
-	space = isl_space_map_from_set(space);
-	mpa = isl_multi_pw_aff_identity(space);
-	index = isl_multi_pw_aff_range_product(mpa, index);
-	index = isl_multi_pw_aff_pullback_multi_pw_aff(tiling, index);
+	index = tile_outer(index, tiling);
 
 	return index;
 }
@@ -1666,21 +1724,18 @@ static __isl_give isl_ast_expr *derefere
  * element while the default linearized expression would refer to
  * a single element, we return the expression
  *
- *	A + (..((i_0 * b_1 + i_1) ... ) * b_n]
+ *	A + (..((i_0 * b_1 + i_1) ... ) * b_l + i_l)
  *
  * instead.  Note that because of the special case handling above,
- * we can assume here that here that there is at least one index expression.
+ * we can assume here that there is at least one index expression.
  */
 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
 	struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr)
 {
 	int i, n;
-	isl_ctx *ctx;
-	isl_set *context;
 	isl_ast_expr *arg0;
 	isl_ast_expr *res;
 	isl_ast_expr_list *list;
-	isl_ast_build *build;
 
 	arg0 = isl_ast_expr_get_op_arg(expr, 0);
 	if (isl_ast_expr_get_type(arg0) == isl_ast_expr_op &&
@@ -1699,18 +1754,12 @@ __isl_give isl_ast_expr *gpu_local_array
 	if (isl_ast_expr_get_op_n_arg(expr) == 1)
 		return expr;
 
-	ctx = isl_ast_expr_get_ctx(expr);
-	context = isl_set_universe(isl_space_params_alloc(ctx, 0));
-	build = isl_ast_build_from_context(context);
-
 	n = isl_ast_expr_get_op_n_arg(expr);
 	res = isl_ast_expr_get_op_arg(expr, 1);
 	for (i = 1; i < array->n_index; ++i) {
-		isl_pw_aff *bound_i;
 		isl_ast_expr *expr_i;
 
-		bound_i = isl_pw_aff_list_get_pw_aff(array->bound, i);
-		expr_i = isl_ast_build_expr_from_pw_aff(build, bound_i);
+		expr_i = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i);
 		res = isl_ast_expr_mul(res, expr_i);
 
 		if (i + 1 >= n)
@@ -1719,8 +1768,6 @@ __isl_give isl_ast_expr *gpu_local_array
 		res = isl_ast_expr_add(res, expr_i);
 	}
 
-	isl_ast_build_free(build);
-
 	if (1 + array->n_index > n) {
 		res = isl_ast_expr_add(isl_ast_expr_get_op_arg(expr, 0), res);
 	} else {
@@ -1782,20 +1829,19 @@ static __isl_give isl_ast_expr *transfor
  * with name "user".
  * These AST expressions are computed from iterator_map,
  * which expresses the domain
- * elements in terms of the generated loops, and sched2shared,
- * which expresses the outer shared_schedule_dim dimensions of
+ * elements in terms of the generated loops, and sched2copy,
+ * which expresses the outer copy_schedule_dim dimensions of
  * the kernel schedule computed by PPCG in terms of the generated loops.
  */
 static __isl_give isl_ast_node *create_domain_leaf(
 	struct ppcg_kernel *kernel, __isl_take isl_ast_node *node,
-	__isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt,
-        struct gpu_gen *gen)
+	__isl_keep isl_ast_build *build, struct gpu_stmt *gpu_stmt)
 {
 	struct ppcg_transform_data data;
 	struct ppcg_kernel_stmt *stmt;
 	isl_ctx *ctx;
 	isl_id *id;
-	isl_pw_multi_aff *sched2shared;
+	isl_pw_multi_aff *sched2copy;
 	isl_map *map;
 	isl_pw_multi_aff *iterator_map;
 	isl_union_map *schedule;
@@ -1812,10 +1858,10 @@ static __isl_give isl_ast_node *create_d
 	map = isl_map_reverse(isl_map_from_union_map(schedule));
 	iterator_map = isl_pw_multi_aff_from_map(map);
 	if (kernel)
-		sched2shared = compute_sched_to_shared(kernel,
+		sched2copy = compute_sched_to_copy(kernel,
 					isl_pw_multi_aff_copy(iterator_map));
 	else
-		sched2shared = NULL;
+		sched2copy = NULL;
 
 	stmt->type = ppcg_kernel_domain;
 	stmt->u.d.stmt = gpu_stmt;
@@ -1823,12 +1869,13 @@ static __isl_give isl_ast_node *create_d
 	data.kernel = kernel;
 	data.accesses = stmt->u.d.stmt->accesses;
 	data.iterator_map = iterator_map;
-	data.sched2shared = sched2shared;
-	stmt->u.d.ref2expr = gen->build_ast_expr(stmt->u.d.stmt->stmt,
+	data.sched2copy = sched2copy;
+	stmt->u.d.ref2expr = pet_stmt_build_ast_exprs(stmt->u.d.stmt->stmt,
 					    build, &transform_index, &data,
 					    &transform_expr, &data);
+
 	isl_pw_multi_aff_free(iterator_map);
-	isl_pw_multi_aff_free(sched2shared);
+	isl_pw_multi_aff_free(sched2copy);
 
 	id = isl_id_alloc(ctx, "user", stmt);
 	id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
@@ -1846,7 +1893,7 @@ static __isl_give isl_ast_node *create_d
  *
  *	type[D -> A] -> L
  *
- * where D corresponds to the outer group->depth dimensions of
+ * where D corresponds to the outer tile->depth dimensions of
  * the kernel schedule, A to the global array and L to the outer
  * generated AST schedule.
  * We compute the inverse and strip off the type, resulting in
@@ -1867,6 +1914,7 @@ static __isl_give isl_ast_node *create_d
  *
  * and store the corresponding expressions in stmt->index and stmt->local_index,
  * where stmt points to the ppcg_kernel_stmt that is attached to the node.
+ * stmt->index is linearized if the global memory array is linearized.
  */
 static __isl_give isl_ast_node *create_access_leaf(struct ppcg_kernel *kernel,
 	struct gpu_array_ref_group *group, __isl_take isl_ast_node *node,
@@ -1898,6 +1946,9 @@ static __isl_give isl_ast_node *create_a
 	pma2 = isl_pw_multi_aff_pullback_pw_multi_aff(pma2,
 						    isl_pw_multi_aff_copy(pma));
 	expr = isl_ast_build_access_from_pw_multi_aff(build, pma2);
+	if (group->array->linearize)
+		expr = gpu_local_array_info_linearize_index(group->local_array,
+							    expr);
 	stmt->u.c.index = expr;
 
 	tile = gpu_array_ref_group_tile(group);
@@ -1911,7 +1962,7 @@ static __isl_give isl_ast_node *create_a
 	stmt->u.c.local_array = group->local_array;
 	stmt->type = ppcg_kernel_copy;
 
-	id = isl_id_alloc(kernel->ctx, NULL, stmt);
+	id = isl_id_alloc(kernel->ctx, "copy", stmt);
 	id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
 	return isl_ast_node_set_annotation(node, id);
 }
@@ -1931,11 +1982,57 @@ static __isl_give isl_ast_node *create_s
 		return isl_ast_node_free(node);
 
 	stmt->type = ppcg_kernel_sync;
-	id = isl_id_alloc(kernel->ctx, NULL, stmt);
+	id = isl_id_alloc(kernel->ctx, "sync", stmt);
 	id = isl_id_set_free_user(id, &ppcg_kernel_stmt_free);
 	return isl_ast_node_set_annotation(node, id);
 }
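
Aside: giving the annotations proper names ("copy", "sync") lets
consumers of the generated AST dispatch on the id name.  A hypothetical
backend helper (not in this patch) might look like:

#include <string.h>
#include <isl/id.h>
#include <isl/ast.h>

static void handle_user_node(__isl_keep isl_ast_node *node)
{
	isl_id *id;
	const char *name;

	id = isl_ast_node_get_annotation(node);
	name = id ? isl_id_get_name(id) : NULL;
	if (name && !strcmp(name, "copy")) {
		/* Emit the copy statement attached to the id. */
	} else if (name && !strcmp(name, "sync")) {
		/* Emit a synchronization primitive. */
	}
	isl_id_free(id);
}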
 
+/* Build AST expressions for the device array sizes of all arrays in "prog"
+ * that require allocation on the device using "build", as well as
+ * for the original array sizes of all arrays that need to be declared
+ * on the host.
+ * "node" is freed in case of error.
+ */
+static __isl_give isl_ast_node *build_array_bounds(
+	__isl_take isl_ast_node *node, struct gpu_prog *prog,
+	__isl_keep isl_ast_build *build)
+{
+	int i;
+
+	for (i = 0; i < prog->n_array; ++i) {
+		struct gpu_array_info *array = &prog->array[i];
+		isl_multi_pw_aff *size;
+		isl_ast_expr *expr;
+
+		if (!gpu_array_requires_device_allocation(array))
+			continue;
+
+		size = isl_multi_pw_aff_copy(array->bound);
+		expr = ppcg_build_size_expr(size, build);
+		array->bound_expr = expr;
+		if (!expr)
+			return isl_ast_node_free(node);
+	}
+
+	for (i = 0; i < prog->n_array; ++i) {
+		struct gpu_array_info *array = &prog->array[i];
+		isl_set *extent;
+		isl_multi_pw_aff *size;
+		isl_ast_expr *expr;
+
+		if (!array->declare_local)
+			continue;
+		extent = isl_set_copy(array->declared_extent);
+		size = ppcg_size_from_extent(extent);
+		expr = ppcg_build_size_expr(size, build);
+		array->declared_size = expr;
+		if (!expr)
+			return isl_ast_node_free(node);
+	}
+
+	return node;
+}
+
 /* Internal data structure for at_domain.
  *
  * "prog" represents the entire scop.
@@ -1945,7 +2042,6 @@ static __isl_give isl_ast_node *create_s
  */
 struct ppcg_at_domain_data {
 	struct gpu_prog *prog;
-	struct gpu_gen *gen;
 	struct ppcg_kernel *kernel;
 };
 
@@ -1959,9 +2055,11 @@ struct ppcg_at_domain_data {
  * requires special handling.
  *
  * If the user statement is one of the original user statements, then we call
- * create_domain_leaf.  Otherwise, we check if it is a copy or synchronization
+ * create_domain_leaf.  If it is "init_device", then we call
+ * build_array_bounds.  Otherwise, we check if it is a copy or synchronization
  * statement and call the appropriate functions.  Statements that copy an array
  * to/from the device do not need any further treatment.
+ * Neither does "clear_device".
  */
 static __isl_give isl_ast_node *at_domain(__isl_take isl_ast_node *node,
 	__isl_keep isl_ast_build *build, void *user)
@@ -1987,11 +2085,14 @@ static __isl_give isl_ast_node *at_domai
 	isl_id_free(id);
 
 	if (gpu_stmt)
-		return create_domain_leaf(data->kernel, node, build, gpu_stmt,
-                                          data->gen);
+		return create_domain_leaf(data->kernel, node, build, gpu_stmt);
 
 	if (!prefixcmp(name, "to_device_") || !prefixcmp(name, "from_device_"))
 		return node;
+	if (!strcmp(name, "init_device"))
+		return build_array_bounds(node, data->prog, build);
+	if (!strcmp(name, "clear_device"))
+		return node;
 	if (is_sync < 0)
 		return isl_ast_node_free(node);
 	if (!strcmp(name, "read") || !strcmp(name, "write")) {
@@ -2049,6 +2150,8 @@ static __isl_give isl_union_map *wrapped
  * remove those reads if ("read" is 1) or writes (if "read" is 0)
  * that are only needed to communicate data within
  * the same iteration of "sched".
+ * The domain of "sched" corresponds to the original statement instances,
+ * i.e., those that appear in the domains of the access relations.
  * "tagged" contains all tagged access relations to all
  * the array reference groups accessed by "access" from statement
  * instances scheduled by "sched".
@@ -2189,17 +2292,19 @@ static __isl_give isl_union_map *remove_
 
 /* Given an access relation "access" from "group", remove those reads
  * if ("read" is 1) or writes (if "read" is 0) that are only needed to
- * communicate data within the same iteration of the schedule at the
- * position where the copying of the group is inserted.
- * "node" points to this position, i.e., the depth at "node"
- * is equal to group->depth.
+ * communicate data within the same iteration of the schedule "prefix"
+ * at the position where the copying of the group is inserted.
+ * That is, the output dimension of "prefix"
+ * is equal to tile->depth.
+ * The domain of "prefix" corresponds to the original statement instances,
+ * i.e., those that appear in the domains of the access relations.
  *
- * We extract a schedule that picks out the iterations of the outer
- * group->depth dimensions and call remove_local_accesses.
+ * Extract the tagged access relation of "group" and
+ * then call remove_local_accesses.
  */
 static __isl_give isl_union_map *remove_local_accesses_group(
 	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
-	__isl_take isl_union_map *access, __isl_keep isl_schedule_node *node,
+	__isl_take isl_union_map *access, __isl_keep isl_union_map *prefix,
 	int read)
 {
 	isl_union_map *sched, *tagged;
@@ -2208,27 +2313,85 @@ static __isl_give isl_union_map *remove_
 		return access;
 
 	tagged = group_tagged_access_relation(group);
-	sched = isl_schedule_node_get_prefix_schedule_relation(node);
+	sched = isl_union_map_copy(prefix);
 
 	return remove_local_accesses(kernel->prog, tagged, access, sched, read);
 }
 
+/* Build an access AST expression for the effective grid size using "build".
+ * Store the result in kernel->grid_size_expr.
+ */
+static isl_stat build_grid_size(struct ppcg_kernel *kernel,
+	__isl_keep isl_ast_build *build)
+{
+	isl_multi_pw_aff *size;
+
+	size = isl_multi_pw_aff_copy(kernel->grid_size);
+	size = isl_multi_pw_aff_set_tuple_name(size, isl_dim_out, "grid");
+	kernel->grid_size_expr = ppcg_build_size_expr(size, build);
+
+	if (!kernel->grid_size_expr)
+		return isl_stat_error;
+	return isl_stat_ok;
+}
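
Aside: ppcg_build_size_expr is defined elsewhere.  From the way its
result is consumed (an access expression whose arguments are read back
with isl_ast_expr_get_op_arg in gpu_local_array_info_linearize_index
above), one plausible shape, offered only as a guess, is:

static __isl_give isl_ast_expr *build_size_expr_guess(
	__isl_take isl_multi_pw_aff *size, __isl_keep isl_ast_build *build)
{
	isl_pw_multi_aff *pma;

	pma = isl_pw_multi_aff_from_multi_pw_aff(size);
	return isl_ast_build_access_from_pw_multi_aff(build, pma);
}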
+
+/* Build access AST expressions for the localized array sizes using "build".
+ * Store the result in local->bound_expr.
+ * Only do this for arrays for which localized bounds have been computed.
+ */
+static isl_stat build_local_array_sizes(struct ppcg_kernel *kernel,
+	__isl_keep isl_ast_build *build)
+{
+	int i;
+
+	for (i = 0; i < kernel->n_array; ++i) {
+		struct gpu_local_array_info *local = &kernel->array[i];
+		isl_multi_pw_aff *size;
+
+		if (local->n_group == 0)
+			continue;
+		size = isl_multi_pw_aff_copy(local->bound);
+		local->bound_expr = ppcg_build_size_expr(size, build);
+		if (!local->bound_expr)
+			return isl_stat_error;
+	}
+
+	return isl_stat_ok;
+}
+
+/* Build access AST expressions for the effective grid size and
+ * the localized array sizes using "build".
+ */
+static isl_stat build_grid_and_local_array_sizes(struct ppcg_kernel *kernel,
+	__isl_keep isl_ast_build *build)
+{
+	if (build_grid_size(kernel, build) < 0)
+		return isl_stat_error;
+	if (build_local_array_sizes(kernel, build) < 0)
+		return isl_stat_error;
+	return isl_stat_ok;
+}
+
 /* This function is called before the AST generator starts traversing
  * the schedule subtree of a node with mark "mark".
  *
  * If the mark is called "kernel", store the kernel pointer in data->kernel
- * for use in at_domain.
+ * for use in at_domain and build AST expressions for the grid size and
+ * the localized array sizes.
  */
-static int before_mark(__isl_keep isl_id *mark,
+static isl_stat before_mark(__isl_keep isl_id *mark,
 	__isl_keep isl_ast_build *build, void *user)
 {
 	struct ppcg_at_domain_data *data = user;
 
 	if (!mark)
-		return -1;
-	if (!strcmp(isl_id_get_name(mark), "kernel"))
+		return isl_stat_error;
+	if (!strcmp(isl_id_get_name(mark), "kernel")) {
 		data->kernel = isl_id_get_user(mark);
-	return 0;
+		if (build_grid_and_local_array_sizes(data->kernel, build) < 0)
+			return isl_stat_error;
+	}
+	return isl_stat_ok;
 }
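
Aside: these hooks are registered on the isl_ast_build in generate_code
below.  Assuming the standard isl entry points, and assuming the
follow-up callback after this one is named after_mark, the wiring would
look like:

	build = isl_ast_build_set_at_each_domain(build, &at_domain, &data);
	build = isl_ast_build_set_before_each_mark(build,
						&before_mark, &data);
	build = isl_ast_build_set_after_each_mark(build,
						&after_mark, &data);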
 
 /* This function is called after the AST generator has finished traversing
@@ -2297,7 +2460,7 @@ static isl_bool update_depth(__isl_keep
  * The ASTs for the device code are embedded in ppcg_kernel objects
  * attached to the leaf nodes that call "kernel".
  */
-__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
+static __isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
 	__isl_take isl_schedule *schedule)
 {
 	struct ppcg_at_domain_data data;
@@ -2307,7 +2470,6 @@ __isl_give isl_ast_node *generate_code(s
 	int depth;
 
 	data.prog = gen->prog;
-	data.gen = gen;
 	data.kernel = NULL;
 
 	depth = 0;
@@ -2376,42 +2538,73 @@ static isl_bool set_permutable(__isl_kee
 	return isl_bool_error;
 }
 
+/* Does the subtree rooted at "node" have any suitably permutable band nodes?
+ * That is, does it have any nodes that are permutable and that
+ * have at least one coincident dimension?
+ */
+static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
+{
+	int any_parallelism = 0;
+
+	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
+						&any_parallelism) < 0 &&
+	    !any_parallelism)
+		return -1;
+
+	return any_parallelism;
+}
+
 /* Does "schedule" contain any permutable band with at least one coincident
  * member?
  */
-int has_any_permutable_node(__isl_keep isl_schedule *schedule)
+static int has_any_permutable_node(__isl_keep isl_schedule *schedule)
 {
-	int any_permutable = 0;
+	isl_schedule_node *root;
+	int any_permutable;
 
-	if (isl_schedule_foreach_schedule_node_top_down(schedule,
-				    &set_permutable, &any_permutable) < 0 &&
-	    !any_permutable)
-		return -1;
+	root = isl_schedule_get_root(schedule);
+	any_permutable = subtree_has_permutable_bands(root);
+	isl_schedule_node_free(root);
 
 	return any_permutable;
 }
 
-/* Is "node" a leaf or can it be tiled and then mapped to
- * block and thread identifiers?
+/* Is "node" a candidate for mapping to block and thread identifiers?
+ * In particular, is it permutable with at least one coincident dimension?
+ * Alternatively, does the subtree rooted at "node" not contain
+ * any such permutable node?  Filter nodes are skipped in this case,
+ * because a band node will be inserted in front of the returned
+ * node and this is not possible for filter nodes that are children
+ * of set or sequence nodes.
  */
-static int is_leaf_or_tilable(__isl_keep isl_schedule_node *node)
+static int is_candidate(__isl_keep isl_schedule_node *node)
 {
+	int permutable;
+
 	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
 		return 1;
-	return is_permutable(node);
+	permutable = is_permutable(node);
+	if (permutable < 0 || permutable)
+		return permutable;
+	if (isl_schedule_node_get_type(node) == isl_schedule_node_filter)
+		return 0;
+	permutable = subtree_has_permutable_bands(node);
+	if (permutable < 0)
+		return -1;
+	return !permutable;
 }
 
 /* Is "node" the outermost node in its branch that can be tiled
  * and then mapped to block and thread identifiers?
- * If there are no such nodes in the branch and if "node" is a leaf,
- * then it is accepted too.
+ * If there are no such nodes in the subtree at "node" and
+ * if "node" is not a filter node, then it is accepted too.
  */
 static int is_outer_tilable(__isl_keep isl_schedule_node *node)
 {
 	int tilable;
 	isl_schedule_node *ancestor;
 
-	tilable = is_leaf_or_tilable(node);
+	tilable = is_candidate(node);
 	if (tilable < 0)
 		return -1;
 	if (!tilable)
@@ -2422,7 +2615,7 @@ static int is_outer_tilable(__isl_keep i
 	while (isl_schedule_node_has_parent(ancestor)) {
 		ancestor = isl_schedule_node_parent(ancestor);
 
-		tilable = is_permutable(ancestor);
+		tilable = is_candidate(ancestor);
 		if (tilable < 0 || tilable)
 			break;
 	}
@@ -2510,11 +2703,13 @@ static __isl_give isl_union_set *collect
 
 		for (j = 0; j < array->n_group; ++j) {
 			struct gpu_array_ref_group *group = array->groups[j];
+			enum ppcg_group_access_type type;
 			isl_union_set *writes_ij;
 
 			if (!group->write)
 				continue;
-			if (group->private_tile)
+			type = gpu_array_ref_group_type(group);
+			if (type == ppcg_access_private)
 				continue;
 			writes_ij = group_tagged_writes(group);
 			writes = isl_union_set_union(writes, writes_ij);
@@ -2551,26 +2746,13 @@ static int any_global_or_shared_sync_wri
 static __isl_give isl_multi_val *construct_band_tiles_sizes(
 	__isl_keep isl_schedule_node *node, int *tile_size)
 {
-	int i, n;
-	isl_ctx *ctx;
 	isl_space *space;
-	isl_multi_val *mv;
 
 	if (!node)
 		return NULL;
 
-	ctx = isl_schedule_node_get_ctx(node);
 	space = isl_schedule_node_band_get_space(node);
-	n = isl_schedule_node_band_n_member(node);
-	mv = isl_multi_val_zero(space);
-	for (i = 0; i < n; ++i) {
-		isl_val *v;
-
-		v = isl_val_int_from_si(ctx, tile_size[i]);
-		mv = isl_multi_val_set_val(mv, i, v);
-	}
-
-	return mv;
+	return ppcg_multi_val_from_int_list(space, tile_size);
 }
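
Aside: ppcg_multi_val_from_int_list is not part of this diff; the loop
removed above suggests it looks roughly like the following (the real
definition lives in the shared ppcg utilities and may differ in detail):

static __isl_give isl_multi_val *multi_val_from_int_list_sketch(
	__isl_take isl_space *space, int *list)
{
	int i, n;
	isl_ctx *ctx;
	isl_multi_val *mv;

	if (!space)
		return NULL;

	ctx = isl_space_get_ctx(space);
	n = isl_space_dim(space, isl_dim_set);
	mv = isl_multi_val_zero(space);
	for (i = 0; i < n; ++i)
		mv = isl_multi_val_set_val(mv, i,
				isl_val_int_from_si(ctx, list[i]));

	return mv;
}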
 
 /* Replace the partial schedule S of the band node "node" by
@@ -2671,7 +2853,9 @@ static __isl_give isl_set *extract_conte
 }
 
 /* Return the set of outer array elements accessed by
- * by the statement instance in "domain" in "prog".
+ * the statement instances in "domain" in "prog".
+ * The instances in "domain" are those that appear
+ * in the domains of the access relations in "prog".
  */
 static __isl_give isl_union_set *accessed_by_domain(
 	__isl_take isl_union_set *domain, struct gpu_prog *prog)
@@ -2940,12 +3124,7 @@ static int kernel_requires_unroll(struct
  */
 static __isl_give isl_schedule_node *unroll(__isl_take isl_schedule_node *node)
 {
-	int i, n;
-
-	n = isl_schedule_node_band_n_member(node);
-	for (i = 0; i < n; ++i)
-		node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
-							isl_ast_loop_unroll);
+	node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
 
 	node = isl_schedule_node_band_sink(node);
 
@@ -2966,11 +3145,14 @@ static __isl_give isl_schedule_node *unr
 * may have a different mapping between shared memory elements and
  * threads, such that synchronization is required after the core.
  * "node" is assumed to point to the kernel node.
+ *
+ * If the shared and the thread mark point to the same node, then make
+ * sure the synchronization is inserted outside of the shared mark.
  */
 static __isl_give isl_schedule_node *add_sync(struct ppcg_kernel *kernel,
 	__isl_take isl_schedule_node *node)
 {
-	int kernel_depth;
+	int depth;
 	int need_sync;
 
 	need_sync = any_global_or_shared_sync_writes(kernel);
@@ -2979,12 +3161,13 @@ static __isl_give isl_schedule_node *add
 	if (!need_sync)
 		return node;
 
-	kernel_depth = isl_schedule_node_get_schedule_depth(node);
-
 	node = gpu_tree_move_down_to_thread(node, kernel->core);
-	if (kernel_depth == isl_schedule_node_get_schedule_depth(node))
-		return gpu_tree_move_up_to_kernel(node);
+	depth = isl_schedule_node_get_schedule_depth(node);
+	node = gpu_tree_move_up_to_kernel(node);
+	if (depth == isl_schedule_node_get_schedule_depth(node))
+		return node;
 
+	node = gpu_tree_move_down_to_depth(node, depth, kernel->core);
 	node = gpu_tree_ensure_following_sync(node, kernel);
 
 	node = gpu_tree_move_up_to_kernel(node);
@@ -3009,9 +3192,12 @@ static __isl_give isl_union_map *anchore
 	isl_union_map *access;
 	isl_union_map *prefix;
 
-	access = gpu_array_ref_group_access_relation(group, read, !read);
-	access = remove_local_accesses_group(kernel, group, access, node, read);
 	prefix = isl_schedule_node_get_prefix_schedule_relation(node);
+	prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
+			    isl_union_pw_multi_aff_copy(kernel->contraction));
+	access = gpu_array_ref_group_access_relation(group, read, !read);
+	access = remove_local_accesses_group(kernel, group, access, prefix,
+						read);
 	access = isl_union_map_range_product(prefix, access);
 
 	return access;
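
Aside: the preimage through the contraction converts a relation defined
on grouped statement instances into one defined on the original
instances, which is what the access relations use.  A standalone
illustration with made-up statement names and a hypothetical
one-dimensional schedule:

#include <isl/ctx.h>
#include <isl/aff.h>
#include <isl/union_map.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_map *prefix;
	isl_union_pw_multi_aff *contraction;

	/* A prefix schedule on grouped instances G[i] and a contraction
	 * mapping the original instances S[i,j] to their group. */
	prefix = isl_union_map_read_from_str(ctx, "{ G[i] -> [i] }");
	contraction = isl_union_pw_multi_aff_read_from_str(ctx,
		"{ S[i, j] -> G[i] }");

	/* Plug the contraction into the schedule domain; the result is
	 * { S[i, j] -> [i] }, the schedule on the original instances. */
	prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
								contraction);
	isl_union_map_dump(prefix);

	isl_union_map_free(prefix);
	isl_ctx_free(ctx);
	return 0;
}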
@@ -3026,18 +3212,20 @@ static __isl_give isl_union_map *anchore
  *	write[D -> A] -> [D -> A]
  *
  * if "read" is not set.
- * D corresponds to the outer group->depth dimensions of
+ * D corresponds to the outer tile->depth dimensions of
  * the kernel schedule.
  */
 static __isl_give isl_multi_aff *create_from_access(isl_ctx *ctx,
 	struct gpu_array_ref_group *group, int read)
 {
+	struct gpu_array_tile *tile;
 	isl_space *space;
 	isl_id *id;
 
+	tile = gpu_array_ref_group_tile(group);
 	space = isl_space_copy(group->array->space);
 	space = isl_space_from_range(space);
-	space = isl_space_add_dims(space, isl_dim_in, group->depth);
+	space = isl_space_add_dims(space, isl_dim_in, tile->depth);
 	space = isl_space_wrap(space);
 	space = isl_space_map_from_set(space);
 
@@ -3074,9 +3262,12 @@ static __isl_give isl_schedule_node *add
 		node = isl_schedule_node_child(node, 0);
 		node = gpu_tree_ensure_following_sync(node, kernel);
 	} else if (shared) {
+		struct gpu_array_tile *tile;
+
+		tile = gpu_array_ref_group_tile(group);
 		node = isl_schedule_node_parent(node);
 		node = isl_schedule_node_parent(node);
-		node = gpu_tree_move_down_to_depth(node, group->depth,
+		node = gpu_tree_move_down_to_depth(node, tile->depth,
 							kernel->core);
 		node = gpu_tree_move_left_to_sync(node, kernel);
 	}
@@ -3094,14 +3285,14 @@ static __isl_give isl_schedule_node *add
  *
  * The copies are performed in the order of the array elements.
  * The copy statement instances include a reference to the outer
- * group->depth dimensions of the kernel schedule for ease of
+ * tile->depth dimensions of the kernel schedule for ease of
  * combining them with the group tiling.
  *
  * That is, the extra schedule is of the form
  *
  *	type[D -> A] -> A
  *
- * where D corresponds to the outer group->depth dimensions of
+ * where D corresponds to the outer tile->depth dimensions of
  * the kernel schedule and A to the global array.
  * This schedule is unrolled because registers are not addressable.
  *
@@ -3133,20 +3324,22 @@ static __isl_give isl_schedule_node *add
 	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
 	__isl_take isl_schedule_node *node, int read)
 {
+	struct gpu_array_tile *tile;
 	isl_union_map *access;
-	isl_union_map *prefix;
 	isl_union_set *domain;
 	isl_space *space;
 	isl_multi_aff *from_access;
 	isl_multi_pw_aff *mpa;
 	isl_multi_union_pw_aff *mupa;
+	isl_union_pw_multi_aff *contraction;
 	isl_schedule_node *graft;
 	isl_union_set *filter;
 	int kernel_depth;
 	int empty;
 
 	kernel_depth = isl_schedule_node_get_schedule_depth(node);
-	node = gpu_tree_move_down_to_depth(node, group->depth, kernel->core);
+	tile = gpu_array_ref_group_tile(group);
+	node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core);
 
 	access = anchored_non_local_accesses(kernel, group, node, read);
 	empty = isl_union_map_is_empty(access);
@@ -3165,6 +3358,8 @@ static __isl_give isl_schedule_node *add
 	access = isl_union_map_preimage_range_multi_aff(access, from_access);
 
 	filter = isl_union_set_copy(kernel->thread_filter);
+	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
+	filter = isl_union_set_preimage_union_pw_multi_aff(filter, contraction);
 	filter = isl_union_set_apply(filter, isl_union_map_copy(access));
 	filter = isl_union_set_detect_equalities(filter);
 	filter = isl_union_set_coalesce(filter);
@@ -3192,7 +3387,7 @@ static __isl_give isl_schedule_node *add
 		node = isl_schedule_node_graft_before(node, graft);
 	else {
 		node = isl_schedule_node_graft_after(node, graft);
-		if (kernel_depth < group->depth)
+		if (kernel_depth < tile->depth)
 			node = add_group_write_sync(node, kernel, group, 0);
 	}
 
@@ -3212,7 +3407,7 @@ static __isl_give isl_schedule_node *add
  * The copies are performed in the order of the corresponding shared
  * memory tile.
  * The copy statement instances include a reference to the outer
- * group->depth dimensions of the kernel schedule for ease of
+ * tile->depth dimensions of the kernel schedule for ease of
  * combining them with the group tiling.
  *
  * If we are performing a read from global memory to shared memory and
@@ -3228,7 +3423,7 @@ static __isl_give isl_schedule_node *add
  *
  *	type[D -> A] -> T
  *
- * where D corresponds to the outer group->depth dimensions of
+ * where D corresponds to the outer tile->depth dimensions of
  * the kernel schedule, A to the global array and T is the corresponding
  * shared memory tile.
  *
@@ -3241,6 +3436,9 @@ static __isl_give isl_schedule_node *add
  * by the group.  In the case of read from a non-scalar, this set
  * is replaced by the entire shared memory tile.
  *
+ * If the "unroll_copy_shared" option is set, then the AST generator
+ * is instructed to unroll the copying code.
+ *
  * A filter is inserted on type[D -> A] to map the copy instances
  * to the threads.  In particular, the thread identifiers are
  * equated to the position inside the shared memory tile (T)
@@ -3283,7 +3481,6 @@ static __isl_give isl_schedule_node *add
 	struct gpu_array_tile *tile;
 	isl_union_map *access;
 	isl_union_set *domain;
-	isl_union_set *sync;
 	isl_multi_aff *ma;
 	isl_multi_aff *from_access;
 	isl_multi_pw_aff *mpa;
@@ -3294,8 +3491,9 @@ static __isl_give isl_schedule_node *add
 	int kernel_depth;
 	int empty;
 
+	tile = gpu_array_ref_group_tile(group);
 	kernel_depth = isl_schedule_node_get_schedule_depth(node);
-	node = gpu_tree_move_down_to_depth(node, group->depth, kernel->core);
+	node = gpu_tree_move_down_to_depth(node, tile->depth, kernel->core);
 
 	access = anchored_non_local_accesses(kernel, group, node, read);
 	empty = isl_union_map_is_empty(access);
@@ -3311,7 +3509,6 @@ static __isl_give isl_schedule_node *add
 
 	from_access = create_from_access(kernel->ctx, group, read);
 
-	tile = gpu_array_ref_group_tile(group);
 	ma = isl_multi_aff_copy(tile->tiling);
 	ma = isl_multi_aff_pullback_multi_aff(ma,
 					    isl_multi_aff_copy(from_access));
@@ -3336,6 +3533,8 @@ static __isl_give isl_schedule_node *add
 	graft = isl_schedule_node_child(graft, 0);
 
 	graft = isl_schedule_node_insert_partial_schedule(graft, mupa);
+	if (kernel->options->unroll_copy_shared)
+		graft = ppcg_set_schedule_node_type(graft, isl_ast_loop_unroll);
 
 	if (tile->n > kernel->n_block && kernel->n_block > 0) {
 		graft = isl_schedule_node_band_split(graft,
@@ -3359,14 +3558,14 @@ static __isl_give isl_schedule_node *add
 		graft = isl_schedule_node_parent(graft);
 
 	if (read) {
-		if (kernel_depth < group->depth)
+		if (kernel_depth < tile->depth)
 			node = gpu_tree_ensure_sync_after_core(node, kernel);
 		node = gpu_tree_move_left_to_sync(node, kernel);
 		node = isl_schedule_node_graft_before(node, graft);
 	} else {
 		node = gpu_tree_move_right_to_sync(node, kernel);
 		node = isl_schedule_node_graft_after(node, graft);
-		if (kernel_depth < group->depth)
+		if (kernel_depth < tile->depth)
 			node = add_group_write_sync(node, kernel, group, 1);
 	}
 
@@ -3388,9 +3587,12 @@ static __isl_give isl_schedule_node *add
 	struct ppcg_kernel *kernel, struct gpu_array_ref_group *group,
 	__isl_take isl_schedule_node *node, int read)
 {
-	if (group->private_tile)
+	enum ppcg_group_access_type type;
+
+	type = gpu_array_ref_group_type(group);
+	if (type == ppcg_access_private)
 		return add_copies_group_private(kernel, group, node, read);
-	if (group->shared_tile)
+	if (type == ppcg_access_shared)
 		return add_copies_group_shared(kernel, group, node, read);
 	return node;
 }
@@ -3429,14 +3631,7 @@ static __isl_give isl_schedule_node *add
  */
 static __isl_give isl_schedule_node *atomic(__isl_take isl_schedule_node *node)
 {
-	int i, n;
-
-	n = isl_schedule_node_band_n_member(node);
-	for (i = 0; i < n; ++i)
-		node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
-							isl_ast_loop_atomic);
-
-	return node;
+	return ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
 }
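
Aside: both unroll and atomic now delegate to ppcg_set_schedule_node_type.
Reconstructed from the two loops it replaces, the helper presumably reads:

static __isl_give isl_schedule_node *set_schedule_node_type_sketch(
	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
{
	int i, n;

	n = isl_schedule_node_band_n_member(node);
	for (i = 0; i < n; ++i)
		node = isl_schedule_node_band_member_set_ast_loop_type(node,
								i, type);

	return node;
}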
 
 /* Mark "node" atomic, if it is a band node.
@@ -3507,14 +3702,22 @@ static __isl_give isl_union_set *compute
 	isl_union_map *equal;
 	isl_union_set *wrap;
 	isl_union_set *domain;
+	isl_union_pw_multi_aff *contraction;
 
-	domain = isl_schedule_node_get_universe_domain(node);
 	kernel_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
 	node = isl_schedule_node_copy(node);
 	node = gpu_tree_move_down_to_thread(node, kernel->core);
 	thread_prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
 	isl_schedule_node_free(node);
 
+	contraction = kernel->contraction;
+	kernel_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
+		    kernel_prefix, isl_union_pw_multi_aff_copy(contraction));
+	thread_prefix = isl_union_map_preimage_domain_union_pw_multi_aff(
+		    thread_prefix, isl_union_pw_multi_aff_copy(contraction));
+	domain = isl_union_set_copy(kernel->expanded_domain);
+	domain = isl_union_set_universe(domain);
+
 	may_writes = isl_union_map_copy(kernel->prog->scop->tagged_may_writes);
 	may_writes = isl_union_map_curry(may_writes);
 	may_writes = isl_union_map_intersect_domain(may_writes, domain);
@@ -3564,7 +3767,10 @@ static __isl_give isl_schedule_node *gro
  * The band that "node" points to is the band that needs to be mapped
  * to block identifiers.  The band that needs to be mapped to thread
  * identifiers should be marked by a "thread" mark by the caller.
- * This mark is removed by this function.
+ * The linear branch between the current node and the "thread" mark
+ * may also have a "shared" mark.  If present, the mapping to shared
+ * memory is computed at that point.
+ * Both marks are removed by this function.
  * If "scale" is set, then the band that "node" points to is scaled
  * by "sizes".
  *
@@ -3607,7 +3813,7 @@ static __isl_give isl_schedule_node *gro
  * to be unrolled, then we perform the required unrolling.
  *
  * We save a copy of the schedule that may influence the mappings
- * to shared or private memory in kernel->shared_schedule.
+ * to shared or private memory in kernel->copy_schedule.
  *
  * Finally, we add synchronization and copy statements to the schedule tree,
  * remove the "thread" mark and create representations for the local
@@ -3617,7 +3823,7 @@ static __isl_give isl_schedule_node *gro
  * that the kernel does not get destroyed if the schedule node
  * is freed due to some error condition.
  */
-static __isl_give isl_schedule_node *create_kernel(struct gpu_gen *gen,
+__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
 	__isl_take isl_schedule_node *node, int scale,
 	__isl_keep isl_multi_val *sizes)
 {
@@ -3625,10 +3831,15 @@ static __isl_give isl_schedule_node *cre
 	isl_id *id;
 	isl_schedule_node *node_thread;
 	isl_union_map *host_schedule;
+	isl_union_pw_multi_aff *contraction;
 	isl_set *host_domain;
-	isl_union_set *domain;
+	isl_union_set *domain, *expanded;
 	int single_statement;
 
+	node = gpu_tree_insert_shared_before_thread(node);
+	if (!node)
+		return NULL;
+
 	kernel = isl_calloc_type(gen->ctx, struct ppcg_kernel);
 	kernel = ppcg_kernel_create_local_arrays(kernel, gen->prog);
 	if (!kernel)
@@ -3642,8 +3853,13 @@ static __isl_give isl_schedule_node *cre
 	kernel->options = gen->options;
 	kernel->context = extract_context(node, gen->prog);
 	kernel->core = isl_union_set_universe(isl_union_set_copy(domain));
-	kernel->arrays = accessed_by_domain(isl_union_set_copy(domain),
-						gen->prog);
+	contraction = isl_schedule_node_get_subtree_contraction(node);
+	kernel->contraction = isl_union_pw_multi_aff_copy(contraction);
+	expanded = isl_union_set_copy(domain);
+	expanded = isl_union_set_preimage_union_pw_multi_aff(expanded,
+						contraction);
+	kernel->expanded_domain = isl_union_set_copy(expanded);
+	kernel->arrays = accessed_by_domain(expanded, gen->prog);
 	kernel->n_grid = n_outer_coincidence(node);
 	node_thread = isl_schedule_node_copy(node);
 	node_thread = gpu_tree_move_down_to_thread(node_thread, kernel->core);
@@ -3693,7 +3909,8 @@ static __isl_give isl_schedule_node *cre
 						kernel->n_block, "t");
 	kernel->thread_filter = set_schedule_modulo(node, kernel->thread_ids,
 						kernel->block_dim);
-	extract_block_size(kernel, domain);
+	if (extract_block_size(kernel, domain) < 0)
+		node = isl_schedule_node_free(node);
 
 	node = gpu_tree_move_up_to_kernel(node);
 	node = isl_schedule_node_child(node, 0);
@@ -3726,16 +3943,22 @@ static __isl_give isl_schedule_node *cre
 	}
 
 	node = gpu_tree_move_up_to_thread(node);
-	kernel->shared_schedule_dim =
-		isl_schedule_node_get_schedule_depth(node);
-	kernel->shared_schedule =
+	kernel->copy_schedule_dim = isl_schedule_node_get_schedule_depth(node);
+	kernel->copy_schedule =
 		isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(node);
+	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
+	kernel->copy_schedule =
+		isl_union_pw_multi_aff_pullback_union_pw_multi_aff(
+					    kernel->copy_schedule, contraction);
 
 	node = gpu_tree_move_up_to_kernel(node);
 
 	node = add_sync(kernel, node);
 	node = add_copies(kernel, node);
 
+	node = gpu_tree_move_down_to_shared(node, kernel->core);
+	node = isl_schedule_node_delete(node);
+
 	node = gpu_tree_move_down_to_thread(node, kernel->core);
 	node = isl_schedule_node_delete(node);
 
@@ -3776,18 +3999,86 @@ static __isl_give isl_schedule_node *ins
 	return node;
 }
 
+/* See if hybrid tiling can be performed on "node" and its parent.
+ * If so, apply hybrid tiling and return the updated schedule tree.
+ * If not, return the original schedule tree.
+ * Return NULL on error.
+ *
+ * First check if "node", together with its parent, meets
+ * the basic requirements for hybrid tiling.
+ * If so, compute the relative dependence distances of "node"
+ * with respect to its parent and check if they are sufficiently bounded.
+ * If so, apply hybrid tiling using user specified tile sizes.
+ *
+ * The tile sizes are read before the dependence distance bounds are
+ * computed, because the user may have specified fewer dimensions
+ * than are available.  In this case, the remaining schedule dimensions
+ * are split off and the dependence distances should be computed
+ * after these dimensions have been split off.
+ */
+static __isl_give isl_schedule_node *try_hybrid_tile(struct gpu_gen *gen,
+	__isl_take isl_schedule_node *node)
+{
+	int tile_len;
+	int *tile_size;
+	isl_bool ok;
+	isl_schedule_node *orig = node;
+	ppcg_ht_bounds *bounds;
+
+	ok = ppcg_ht_parent_has_input_pattern(node);
+	if (ok < 0)
+		return isl_schedule_node_free(node);
+	if (!ok)
+		return orig;
+
+	tile_len = 1 + isl_schedule_node_band_n_member(node);
+	tile_size = read_tile_sizes(gen, &tile_len);
+	if (!tile_size)
+		return isl_schedule_node_free(node);
+
+	node = isl_schedule_node_copy(node);
+	node = split_band(node, tile_len - 1);
+	node = isl_schedule_node_parent(node);
+	bounds = ppcg_ht_compute_bounds(gen->prog->scop, node);
+	node = isl_schedule_node_child(node, 0);
+
+	ok = ppcg_ht_bounds_is_valid(bounds);
+	if (ok >= 0 && ok)
+		node = gpu_hybrid_tile(gen, node, bounds, tile_size);
+	else
+		ppcg_ht_bounds_free(bounds);
+	free(tile_size);
+
+	if (ok >= 0 && !ok) {
+		isl_schedule_node_free(node);
+		return orig;
+	}
+	isl_schedule_node_free(orig);
+	if (ok < 0)
+		return isl_schedule_node_free(node);
+	return node;
+}
+
 /* If "node" is the outermost permutable band that can be mapped to block and
- * thread identifiers in its branch (or a leaf with no such outer bands),
+ * thread identifiers in its branch (or the root of a subtree with
+ * no such outer bands),
  * then mark the band as such, attaching a ppcg_kernel to the mark.
  *
- * If "node" originally points to a leaf, then insert a zero-dimensional
- * permutable band such that we can assume that "node" always
- * points to a band node.
+ * If hybrid tiling is allowed, then first try and apply it
+ * to "node" and its parent.
+ *
+ * If "node" is the root of a subtree without permutable bands,
+ * then insert a zero-dimensional permutable band such that
+ * we can assume that "node" always points to a band node.
+ * This includes the case where "node" already points to a band node,
+ * but one without any coincident dimension.  In this case,
+ * the extra node ensures that this original node does not get tiled.
  *
  * Tile "node" using user specified tile sizes, after splitting the band
  * if the number of specified tile sizes is smaller than the dimension
  * of the band.  Mark the point band of this tiling as the band that
- * needs to be mapped to threads.
+ * needs to be mapped to threads and instruct the AST generator to unroll
+ * the band if the "unroll_gpu_tile" option is set.
  * Create a kernel representing the domain instances that reach "node" and
  * insert a mark node pointing to the ppcg_kernel before the band node.
  */
@@ -3808,7 +4099,16 @@ static __isl_give isl_schedule_node *mar
 	if (!outer)
 		return node;
 
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_leaf)
+	if (gen->options->hybrid) {
+		isl_schedule_node *saved = isl_schedule_node_copy(node);
+		node = try_hybrid_tile(gen, node);
+		isl_schedule_node_free(saved);
+		if (node != saved)
+			return node;
+	}
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band ||
+	    !isl_schedule_node_band_member_get_coincident(node, 0))
 		node = insert_empty_permutable_band(node);
 
 	tile_len = isl_schedule_node_band_n_member(node);
@@ -3820,46 +4120,106 @@ static __isl_give isl_schedule_node *mar
 	sizes = construct_band_tiles_sizes(node, tile_size);
 	node = tile_band(node, isl_multi_val_copy(sizes));
 	node = isl_schedule_node_child(node, 0);
+	if (gen->options->unroll_gpu_tile)
+		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
 	id = isl_id_alloc(gen->ctx, "thread", NULL);
 	node = isl_schedule_node_insert_mark(node, id);
 	node = isl_schedule_node_parent(node);
 
 	scale = gen->options->scale_tile_loops;
-	node = create_kernel(gen, node, scale, sizes);
+	node = gpu_create_kernel(gen, node, scale, sizes);
 	isl_multi_val_free(sizes);
 	free(tile_size);
 
 	return node;
 }
 
-/* Does the subtree rooted at "node" have any suitably permutable band nodes?
- * That is, does it have any nodes that are permutable and that
- * have a least one coincident dimension?
+/* Given a set or sequence node, return the union of the filters of either all
+ * (if "only_initial" is not set) or the initial (if "only_initial" is set)
+ * direct subtrees that do not contain any suitably permutable bands
+ * (according to subtree_has_permutable_bands).
  */
-static int subtree_has_permutable_bands(__isl_keep isl_schedule_node *node)
+static __isl_give isl_union_set *get_non_parallel_subtree_filters(
+	__isl_keep isl_schedule_node *node, int only_initial)
 {
-	int any_parallelism = 0;
+	isl_space *space;
+	isl_union_set *filter;
+	int i, n;
 
-	if (isl_schedule_node_foreach_descendant_top_down(node, &set_permutable,
-						&any_parallelism) < 0 &&
-	    !any_parallelism)
-		return -1;
+	n = isl_schedule_node_n_children(node);
+	if (n < 0)
+		return NULL;
 
-	return any_parallelism;
+	node = isl_schedule_node_copy(node);
+	node = isl_schedule_node_child(node, 0);
+	filter = isl_schedule_node_filter_get_filter(node);
+	node = isl_schedule_node_parent(node);
+	space = isl_union_set_get_space(filter);
+	isl_union_set_free(filter);
+	filter = isl_union_set_empty(space);
+
+	for (i = 0; i < n; ++i) {
+		int parallelism;
+
+		node = isl_schedule_node_child(node, i);
+		parallelism = subtree_has_permutable_bands(node);
+		if (parallelism < 0) {
+			filter = isl_union_set_free(filter);
+		} else if (!parallelism) {
+			isl_union_set *filter_i;
+			filter_i = isl_schedule_node_filter_get_filter(node);
+			filter = isl_union_set_union(filter, filter_i);
+		} else if (only_initial)
+			break;
+		node = isl_schedule_node_parent(node);
+	}
+
+	isl_schedule_node_free(node);
+
+	return filter;
+}
+
+/* Given a set or sequence node, return the union of the filters of
+ * the direct subtrees that do not contain any suitably permutable bands
+ * (according to subtree_has_permutable_bands).
+ */
+static __isl_give isl_union_set *get_all_non_parallel_subtree_filters(
+	__isl_keep isl_schedule_node *node)
+{
+	return get_non_parallel_subtree_filters(node, 0);
+}
+
+/* Given a set or sequence node, return the union of the filters of
+ * the initial direct subtrees that do not contain any suitably permutable
+ * bands (according to subtree_has_permutable_bands).
+ */
+static __isl_give isl_union_set *get_initial_non_parallel_subtree_filters(
+	__isl_keep isl_schedule_node *node)
+{
+	return get_non_parallel_subtree_filters(node, 1);
 }
 
 /* Mark all variables that are accessed by the statement instances in "domain"
  * and that are local to "prog" as requiring a declaration in the host code.
+ * The statement instances in "domain" correspond to (a subset of)
+ * the active instances at "node".
+ * "node" is not modified by this function, except that NULL is returned
+ * in case of error.
  */
-static int declare_accessed_local_variables(struct gpu_prog *prog,
+static __isl_give isl_schedule_node *declare_accessed_local_variables(
+	__isl_take isl_schedule_node *node, struct gpu_prog *prog,
 	__isl_keep isl_union_set *domain)
 {
+	isl_union_pw_multi_aff *contraction;
 	isl_union_set *arrays;
 	int i;
 
 	if (!ppcg_scop_any_hidden_declarations(prog->scop))
-		return 0;
-	arrays = accessed_by_domain(isl_union_set_copy(domain), prog);
+		return node;
+	contraction = isl_schedule_node_get_subtree_contraction(node);
+	domain = isl_union_set_copy(domain);
+	domain = isl_union_set_preimage_union_pw_multi_aff(domain, contraction);
+	arrays = accessed_by_domain(domain, prog);
 
 	for (i = 0; i < prog->n_array; ++i) {
 		isl_space *space;
@@ -3879,10 +4239,10 @@ static int declare_accessed_local_variab
 	}
 
 	isl_union_set_free(arrays);
-	return 0;
+	return node;
 error:
 	isl_union_set_free(arrays);
-	return -1;
+	return isl_schedule_node_free(node);
 }
 
 /* If "node" points to a set node, then separate its children
@@ -3891,51 +4251,33 @@ error:
  * Adjust the schedule tree in order to execute the second group
  * after the first group and return a pointer to the first group,
  * assuming there are any such subtrees.
- * Mark all local variables in "prog" that are accessed by
- * the second group as requiring a declaration on the host.
+ * If "node" points to a sequence node, then separate the initial
+ * children that do not have suitably permutable bands and
+ * return a pointer to the subsequence of children that do have such bands,
+ * assuming there are any such subtrees.
+ *
+ * In both cases, mark all local variables in "prog" that are accessed by
+ * the group without permutable bands as requiring a declaration on the host.
  */
 static __isl_give isl_schedule_node *isolate_permutable_subtrees(
 	__isl_take isl_schedule_node *node, struct gpu_prog *prog)
 {
-	isl_space *space;
 	isl_union_set *filter;
-	int i, n;
+	enum isl_schedule_node_type type;
 
 	if (!node)
 		return NULL;
-	if (isl_schedule_node_get_type(node) != isl_schedule_node_set)
-		return node;
-
-	n = isl_schedule_node_n_children(node);
-	if (n < 0)
-		return isl_schedule_node_free(node);
-
-	node = isl_schedule_node_child(node, 0);
-	filter = isl_schedule_node_filter_get_filter(node);
-	node = isl_schedule_node_parent(node);
-	space = isl_union_set_get_space(filter);
-	isl_union_set_free(filter);
-	filter = isl_union_set_empty(space);
-
-	for (i = 0; i < n; ++i) {
-		int parallelism;
-
-		node = isl_schedule_node_child(node, i);
-		parallelism = subtree_has_permutable_bands(node);
-		if (parallelism < 0) {
-			node = isl_schedule_node_free(node);
-		} else if (!parallelism) {
-			isl_union_set *filter_i;
-			filter_i = isl_schedule_node_filter_get_filter(node);
-			filter = isl_union_set_union(filter, filter_i);
-		}
-		node = isl_schedule_node_parent(node);
+	type = isl_schedule_node_get_type(node);
+	if (type == isl_schedule_node_set) {
+		filter = get_all_non_parallel_subtree_filters(node);
+		node = declare_accessed_local_variables(node, prog, filter);
+		node = isl_schedule_node_order_after(node, filter);
+	} else if (type == isl_schedule_node_sequence) {
+		filter = get_initial_non_parallel_subtree_filters(node);
+		node = declare_accessed_local_variables(node, prog, filter);
+		node = isl_schedule_node_order_before(node, filter);
 	}
 
-	if (declare_accessed_local_variables(prog, filter) < 0)
-		node = isl_schedule_node_free(node);
-	node = isl_schedule_node_order_after(node, filter);
-
 	return node;
 }
 
@@ -3983,51 +4325,6 @@ static __isl_give isl_schedule_node *mar
 						&mark_outer_permutable, gen);
 }
 
-/* Save the schedule "schedule" to a file called "filename".
- * The schedule is printed in block style.
- */
-static void save_schedule(__isl_keep isl_schedule *schedule,
-	const char *filename)
-{
-	FILE *file;
-	isl_ctx *ctx;
-	isl_printer *p;
-
-	if (!schedule)
-		return;
-
-	file = fopen(filename, "w");
-	if (!file) {
-		fprintf(stderr, "Unable to open '%s' for writing\n", filename);
-		return;
-	}
-	ctx = isl_schedule_get_ctx(schedule);
-	p = isl_printer_to_file(ctx, file);
-	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
-	p = isl_printer_print_schedule(p, schedule);
-	isl_printer_free(p);
-	fclose(file);
-}
-
-/* Load and return a schedule from a file called "filename".
- */
-static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
-	const char *filename)
-{
-	FILE *file;
-	isl_schedule *schedule;
-
-	file = fopen(filename, "r");
-	if (!file) {
-		fprintf(stderr, "Unable to open '%s' for reading\n", filename);
-		return NULL;
-	}
-	schedule = isl_schedule_read_from_file(ctx, file);
-	fclose(file);
-
-	return schedule;
-}
-
 /* Construct schedule constraints from the dependences in prog->scop and
  * the array order dependences in prog->array_order.
  *
@@ -4108,6 +4405,8 @@ static __isl_give isl_schedule_constrain
  * We derive schedule constraints from the dependences in gen->prog->scop
  * and then use isl to compute a schedule that has a parallel loop
  * in each tilable band.
+ * During the schedule construction, some statement instances
+ * may be grouped first based on the input schedule.
  */
 static __isl_give isl_schedule *compute_schedule(struct gpu_gen *gen)
 {
@@ -4115,7 +4414,8 @@ static __isl_give isl_schedule *compute_
 	isl_schedule *schedule;
 
 	sc = construct_schedule_constraints(gen->prog);
-	schedule = isl_schedule_constraints_compute_schedule(sc);
+	schedule = gen->prog->scop->schedule;
+	schedule = ppcg_compute_schedule(sc, schedule, gen->options);
 
 	return schedule;
 }
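
The new ppcg_compute_schedule entry point still relies on isl's scheduler,
after optionally grouping statement instances based on the input schedule.
A rough, self-contained sketch of that underlying isl interface, with an
invented domain and dependence relation:

#include <isl/ctx.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/schedule.h>

/* Rough sketch of the isl scheduling interface used above;
 * the domain and the dependence relation are invented.
 */
static __isl_give isl_schedule *toy_schedule(isl_ctx *ctx)
{
	isl_union_set *domain;
	isl_union_map *dep;
	isl_schedule_constraints *sc;

	domain = isl_union_set_read_from_str(ctx,
		"{ S[i] : 0 <= i < 100 }");
	dep = isl_union_map_read_from_str(ctx,
		"{ S[i] -> S[i + 1] : 0 <= i < 99 }");
	sc = isl_schedule_constraints_on_domain(domain);
	sc = isl_schedule_constraints_set_validity(sc,
						isl_union_map_copy(dep));
	sc = isl_schedule_constraints_set_coincidence(sc, dep);
	return isl_schedule_constraints_compute_schedule(sc);
}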
@@ -4265,30 +4565,27 @@ static __isl_give isl_schedule *determin
 	return schedule;
 }
 
+/* Compute a schedule or determine the properties of the original schedule
+ * depending on the value of the "reschedule" option.
+ */
+static __isl_give isl_schedule *compute_or_set_properties(void *user)
+{
+	struct gpu_gen *gen = user;
+
+	if (gen->options->reschedule)
+		return compute_schedule(gen);
+	else
+		return determine_properties_original_schedule(gen);
+}
+
 /* Obtain a schedule for the scop, by reading it from
  * a file, by computing one or by determining the properties
  * of the original schedule.
  */
-__isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
+static __isl_give isl_schedule *get_schedule(struct gpu_gen *gen)
 {
-	isl_schedule *schedule;
-
-	if (gen->options->load_schedule_file) {
-		schedule = load_schedule(gen->ctx,
-					gen->options->load_schedule_file);
-	} else {
-		if (gen->options->reschedule)
-			schedule = compute_schedule(gen);
-		else
-			schedule = determine_properties_original_schedule(gen);
-		if (gen->options->save_schedule_file)
-			save_schedule(schedule,
-					gen->options->save_schedule_file);
-	}
-	if (gen->options->debug->dump_schedule)
-		isl_schedule_dump(schedule);
-
-	return schedule;
+	return ppcg_get_schedule(gen->ctx, gen->options,
+				&compute_or_set_properties, gen);
 }
 
 /* Construct the string "<a>_<b>".
@@ -4686,7 +4983,6 @@ static int update_may_persist_at_filter(
 	isl_space *space;
 	isl_union_pw_multi_aff *contraction;
 	isl_union_set *before, *after, *filter;
-	isl_union_map *flow;
 
 	type = isl_schedule_node_get_parent_type(node);
 	if (type != isl_schedule_node_sequence && type != isl_schedule_node_set)
@@ -4785,7 +5081,6 @@ static __isl_give isl_union_set *node_ma
 	__isl_keep isl_schedule_node *node, struct gpu_prog *prog)
 {
 	struct ppcg_may_persist_data data;
-	isl_schedule_node *root;
 	isl_union_pw_multi_aff *contraction;
 	isl_union_set *domain;
 	isl_union_set *persist;
@@ -4826,11 +5121,11 @@ static __isl_give isl_union_set *node_ma
 
 /* Add nodes for copying outer arrays in and out of the device
  * before and after the subtree "node", which contains one or more kernels.
- * "domain" contains the original reaching domain elements before
- * the kernels were created, i.e., before the contraction that
- * may have been performed in creating the kernels has been applied.
+ * "domain" contains the original statement instances, i.e.,
+ * those that correspond to the domains of the access relations in "prog".
+ * In particular, the domain has not been contracted in any way.
  * "prefix" contains the prefix schedule at that point, in terms
- * of the same original reaching domain elements.
+ * of the same original statement instances.
  *
  * We first compute the sets of outer array elements that need
  * to be copied in and out and then graft in the nodes for
@@ -4868,7 +5163,7 @@ static __isl_give isl_schedule_node *add
 	__isl_take isl_union_map *prefix, struct gpu_prog *prog)
 {
 	isl_union_set *local;
-	isl_union_set *to_device, *from_device, *may_persist;
+	isl_union_set *may_persist;
 	isl_union_map *may_write, *must_write, *copy_out, *not_written;
 	isl_union_map *read, *copy_in;
 	isl_union_map *tagged;
@@ -4932,38 +5227,90 @@ static __isl_give isl_schedule_node *add
 	return node;
 }
 
+/* Add nodes for initializing ("init_device") and clearing ("clear_device")
+ * the device before and after "node".
+ */
+static __isl_give isl_schedule_node *add_init_clear_device(
+	__isl_take isl_schedule_node *node)
+{
+	isl_ctx *ctx;
+	isl_space *space;
+	isl_union_set *domain;
+	isl_schedule_node *graft;
+
+	ctx = isl_schedule_node_get_ctx(node);
+
+	space = isl_space_set_alloc(ctx, 0, 0);
+	space = isl_space_set_tuple_name(space, isl_dim_set, "init_device");
+	domain = isl_union_set_from_set(isl_set_universe(space));
+	graft = isl_schedule_node_from_domain(domain);
+
+	node = isl_schedule_node_graft_before(node, graft);
+
+	space = isl_space_set_alloc(ctx, 0, 0);
+	space = isl_space_set_tuple_name(space, isl_dim_set, "clear_device");
+	domain = isl_union_set_from_set(isl_set_universe(space));
+	graft = isl_schedule_node_from_domain(domain);
+
+	node = isl_schedule_node_graft_after(node, graft);
+
+	return node;
+}
+
 /* Update "schedule" for mapping to a GPU device.
  *
  * In particular, insert a context node, create kernels for
- * each outermost tilable band and introduce node for copying array
- * in and out of the device.
+ * each outermost tilable band and introduce nodes for copying arrays
+ * in and out of the device and for initializing and clearing the device.
  * If the child of the initial root points to a set node,
  * then children of this node that do not contain any tilable bands
  * are separated from the other children and are not mapped to
  * the device.
+ *
+ * The GPU code is generated in a context where at least one
+ * statement instance is executed.  The corresponding guard is inserted
+ * around the entire schedule.
  */
-__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
+static __isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
 	__isl_take isl_schedule *schedule)
 {
 	isl_schedule_node *node;
 	isl_set *context;
+	isl_set *guard;
 	isl_union_set *domain;
 	isl_union_map *prefix;
+	isl_union_pw_multi_aff *contraction;
+	struct gpu_prog *prog;
 
 	context = isl_set_copy(gen->prog->context);
 	context = isl_set_from_params(context);
 	schedule = isl_schedule_insert_context(schedule, context);
 
+	prog = gen->prog;
+	guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
+	prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
+	guard = isl_set_from_params(guard);
+
 	node = isl_schedule_get_root(schedule);
 	isl_schedule_free(schedule);
 	node = isl_schedule_node_child(node, 0);
-	if (isl_schedule_node_get_type(node) == isl_schedule_node_context)
-		node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_child(node, 0);
 	node = isolate_permutable_subtrees(node, gen->prog);
 	domain = isl_schedule_node_get_domain(node);
+	contraction = isl_schedule_node_get_subtree_contraction(node);
+	domain = isl_union_set_preimage_union_pw_multi_aff(domain,
+				    isl_union_pw_multi_aff_copy(contraction));
 	prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
+	prefix = isl_union_map_preimage_domain_union_pw_multi_aff(prefix,
+				    contraction);
 	node = mark_kernels(gen, node);
 	node = add_to_from_device(node, domain, prefix, gen->prog);
+	node = isl_schedule_node_root(node);
+	node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_insert_guard(node, guard);
+	node = isl_schedule_node_child(node, 0);
+	node = add_init_clear_device(node);
 	schedule = isl_schedule_node_get_schedule(node);
 	isl_schedule_node_free(node);
 
@@ -5032,13 +5379,88 @@ error:
 	return NULL;
 }
 
+/* Does the index expression "index" of "expr" represent an access
+ * to a single element?
+ * That is, is "index" completely specified?
+ *
+ * If "expr" accesses elements from different spaces (i.e., fields
+ * of a structure), then it does not access a single element.
+ * Otherwise, if the single space of the access matches the space
+ * of "index", then the index expression is completely specified
+ * (no pointer to a lower-dimensional slice of the accessed array)
+ * and a single element is being accessed.
+ */
+static isl_bool complete_index(__isl_keep pet_expr *expr,
+	__isl_keep isl_multi_pw_aff *index)
+{
+	isl_union_map *read, *write, *all;
+	isl_map *map;
+	isl_space *space1, *space2;
+	isl_bool complete;
+
+	read = pet_expr_access_get_may_read(expr);
+	write = pet_expr_access_get_may_write(expr);
+	all = isl_union_map_union(read, write);
+	if (!all)
+		return isl_bool_error;
+	if (isl_union_map_n_map(all) != 1) {
+		isl_union_map_free(all);
+		return isl_bool_false;
+	}
+	map = isl_map_from_union_map(all);
+	space1 = isl_map_get_space(map);
+	isl_map_free(map);
+	space2 = isl_multi_pw_aff_get_space(index);
+	complete = isl_space_tuple_is_equal(space1, isl_dim_out,
+					    space2, isl_dim_out);
+	isl_space_free(space1);
+	isl_space_free(space2);
+
+	return complete;
+}
+
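For concreteness, a minimal sketch of this space comparison on invented
relations:

#include <isl/map.h>
#include <isl/aff.h>
#include <isl/space.h>

/* Sketch: the index below is "complete" because its output space
 * matches that of the (invented) access relation to A[i, 0].
 */
static isl_bool toy_complete_index(isl_ctx *ctx)
{
	isl_map *access;
	isl_multi_pw_aff *index;
	isl_space *space1, *space2;
	isl_bool complete;

	access = isl_map_read_from_str(ctx, "{ S[i] -> A[i, 0] }");
	index = isl_multi_pw_aff_read_from_str(ctx,
		"{ S[i] -> A[(i), (0)] }");
	space1 = isl_map_get_space(access);
	space2 = isl_multi_pw_aff_get_space(index);
	complete = isl_space_tuple_is_equal(space1, isl_dim_out,
					    space2, isl_dim_out);
	isl_space_free(space1);
	isl_space_free(space2);
	isl_map_free(access);
	isl_multi_pw_aff_free(index);

	return complete;
}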
+/* Does "expr" access a single, fixed element (independently of the statement
+ * instance)?
+ * That is, does it have a completely specified constant index expression?
+ *
+ * Note that it is not sufficient for the index expression to be
+ * piecewise constant.  isl_multi_pw_aff_is_cst can therefore not be used.
+ */
+static isl_bool accesses_fixed_element(__isl_keep pet_expr *expr)
+{
+	int i, n;
+	isl_multi_pw_aff *index;
+	isl_bool fixed = isl_bool_true;
+
+	index = pet_expr_access_get_index(expr);
+	if (!index)
+		return isl_bool_error;
+	n = isl_multi_pw_aff_dim(index, isl_dim_out);
+	for (i = 0; i < n; ++i) {
+		isl_pw_aff *pa;
+
+		pa = isl_multi_pw_aff_get_pw_aff(index, i);
+		fixed = isl_pw_aff_n_piece(pa) == 1;
+		if (fixed)
+			fixed = isl_pw_aff_is_cst(pa);
+		isl_pw_aff_free(pa);
+		if (fixed < 0 || !fixed)
+			break;
+	}
+	if (fixed >= 0 && fixed)
+		fixed = complete_index(expr, index);
+	isl_multi_pw_aff_free(index);
+
+	return fixed;
+}
+
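A small sketch of the distinction drawn here, using invented
one-dimensional index expressions:

#include <assert.h>
#include <isl/aff.h>

/* Sketch: A[0] is a fixed element; an index that selects 0 or 1
 * depending on the statement instance is only piecewise constant
 * and must be rejected, even though each piece is constant.
 */
static void toy_fixed_element(isl_ctx *ctx)
{
	isl_pw_aff *cst, *pw_cst;

	cst = isl_pw_aff_read_from_str(ctx, "{ S[i] -> [(0)] }");
	pw_cst = isl_pw_aff_read_from_str(ctx,
		"{ S[i] -> [(0)] : i >= 0; S[i] -> [(1)] : i < 0 }");

	assert(isl_pw_aff_n_piece(cst) == 1 && isl_pw_aff_is_cst(cst));
	/* Constant on each piece, but more than one piece. */
	assert(isl_pw_aff_n_piece(pw_cst) == 2);

	isl_pw_aff_free(cst);
	isl_pw_aff_free(pw_cst);
}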
 /* Extract a gpu_stmt_access from "expr", append it to the list
  * that ends in *data->next_access and update the end of the list.
  * If the access expression performs a write, then it is considered
  * exact only if it appears in a single expression statement and
  * if its may access relation is equal to its must access relation.
  *
- * The combined set of may accesses may be union if member accesses
+ * The combined set of may accesses may be a union if member accesses
  * are involved, but the entire set is derived from a single reference and
  * therefore from a single index expression.  These accesses therefore
  * all map to the same outer array.
@@ -5081,11 +5503,12 @@ static int extract_access(__isl_keep pet
 	access->tagged_access = extract_single_tagged_access(tagged, expr);
 	access->access = isl_map_copy(access->tagged_access);
 	access->access = isl_map_domain_factor_domain(access->access);
+	access->fixed_element = accesses_fixed_element(expr);
 
 	*data->next_access = access;
 	data->next_access = &(*data->next_access)->next;
 
-	if (!access->access)
+	if (!access->access || access->fixed_element < 0)
 		return -1;
 
 	return 0;
@@ -5109,10 +5532,31 @@ static int pet_stmt_extract_accesses(str
 						&extract_access, &data);
 }
 
+/* Has statement "stmt" been killed from "scop"?
+ * That is, is the instance set of "scop" free from any
+ * instances of "stmt"?
+ */
+static isl_bool is_stmt_killed(struct ppcg_scop *scop, struct pet_stmt *stmt)
+{
+	isl_space *space;
+	isl_set *left;
+	isl_bool empty;
+
+	if (!scop || !stmt)
+		return isl_bool_error;
+	space = isl_set_get_space(stmt->domain);
+	left = isl_union_set_extract_set(scop->domain, space);
+	empty = isl_set_plain_is_empty(left);
+	isl_set_free(left);
+
+	return empty;
+}
+
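A minimal sketch of this emptiness test on invented sets:

#include <isl/set.h>
#include <isl/union_set.h>

/* Sketch: T[] does not appear in the (invented) instance set,
 * so extracting a set in its space leaves an empty set.
 */
static isl_bool toy_is_killed(isl_ctx *ctx)
{
	isl_union_set *instances;
	isl_set *stmt_domain, *left;
	isl_bool empty;

	instances = isl_union_set_read_from_str(ctx,
		"{ S[i] : 0 <= i < 10 }");
	stmt_domain = isl_set_read_from_str(ctx, "{ T[i] : 0 <= i < 5 }");
	left = isl_union_set_extract_set(instances,
				isl_set_get_space(stmt_domain));
	empty = isl_set_plain_is_empty(left);
	isl_set_free(left);
	isl_set_free(stmt_domain);
	isl_union_set_free(instances);

	return empty;
}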
 /* Return an array of gpu_stmt representing the statements in "scop".
+ * Do not collect array accesses for statements that have been killed.
  */
 static struct gpu_stmt *extract_stmts(isl_ctx *ctx, struct ppcg_scop *scop,
-	__isl_keep isl_set *context, __isl_keep isl_union_map *any_to_outer)
+	__isl_keep isl_union_map *any_to_outer)
 {
 	int i;
 	struct gpu_stmt *stmts;
@@ -5123,9 +5567,15 @@ static struct gpu_stmt *extract_stmts(is
 
 	for (i = 0; i < scop->pet->n_stmt; ++i) {
 		struct gpu_stmt *s = &stmts[i];
+		isl_bool killed;
 
 		s->id = isl_set_get_tuple_id(scop->pet->stmts[i]->domain);
 		s->stmt = scop->pet->stmts[i];
+		killed = is_stmt_killed(scop, scop->pet->stmts[i]);
+		if (killed < 0)
+			return free_stmts(stmts, i + 1);
+		if (killed)
+			continue;
 		if (pet_stmt_extract_accesses(s, any_to_outer) < 0)
 			return free_stmts(stmts, i + 1);
 	}
@@ -5133,16 +5583,6 @@ static struct gpu_stmt *extract_stmts(is
 	return stmts;
 }
 
-/* Callback for ppcg_print_guarded that calls the callback for generate_gpu.
- */
-static __isl_give isl_printer *print_gpu(__isl_take isl_printer *p, void *user)
-{
-	struct gpu_gen *gen = user;
-
-	return gen->print(p, gen->prog, gen->tree, &gen->types,
-			    gen->print_user);
-}
-
 /* Generate CUDA code for "scop" and print it to "p".
  * After generating an AST for the transformed scop as explained below,
  * we call "gen->print" to print the AST in the desired output format
@@ -5151,11 +5591,9 @@ static __isl_give isl_printer *print_gpu
  * If it turns out that it does not make sense to generate GPU code,
  * then we generate CPU code instead.
  *
- * The GPU code is generated in a context where at least one
- * statement instance is executed.  The corresponding guard (if any) is printed
- * around the entire generated GPU code, except for the declaration
- * of the arrays that are visible outside of the scop and that therefore
- * cannot be declared inside the body of any possible guard.
+ * The declarations of the arrays that are visible outside of the scop
+ * are printed outside of the code generated from the schedule,
+ * because the generated code may involve a guard around the entire code.
  *
  * We first compute a schedule that respects the dependences
  * of the original program and select the outermost bands
@@ -5210,7 +5648,6 @@ static __isl_give isl_printer *generate(
 {
 	struct gpu_prog *prog;
 	isl_ctx *ctx;
-	isl_set *context, *guard;
 	isl_schedule *schedule;
 	int any_permutable;
 
@@ -5222,17 +5659,11 @@ static __isl_give isl_printer *generate(
 	if (!prog)
 		return isl_printer_free(p);
 
-	context = isl_set_copy(prog->context);
-	guard = isl_union_set_params(isl_union_set_copy(prog->scop->domain));
-	prog->context = isl_set_intersect(prog->context, isl_set_copy(guard));
-
 	gen->prog = prog;
 	schedule = get_schedule(gen);
 
 	any_permutable = has_any_permutable_node(schedule);
 	if (any_permutable < 0 || !any_permutable) {
-		isl_set_free(context);
-		isl_set_free(guard);
 		if (any_permutable < 0)
 			p = isl_printer_free(p);
 		else
@@ -5241,9 +5672,10 @@ static __isl_give isl_printer *generate(
 	} else {
 		schedule = map_to_device(gen, schedule);
 		gen->tree = generate_code(gen, schedule);
-		p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
+		p = ppcg_set_macro_names(p);
 		p = ppcg_print_exposed_declarations(p, prog->scop);
-		p = ppcg_print_guarded(p, guard, context, &print_gpu, gen);
+		p = gen->print(p, gen->prog, gen->tree, &gen->types,
+				    gen->print_user);
 		isl_ast_node_free(gen->tree);
 	}
 
@@ -5309,7 +5741,7 @@ int generate_gpu(isl_ctx *ctx, const cha
  * arrays that are not local to "prog" and remove those elements that
  * are definitely killed or definitely written by "prog".
  */
-__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog)
+static __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog)
 {
 	int i;
 	isl_union_set *may_persist, *killed;
@@ -5363,8 +5795,7 @@ struct gpu_prog *gpu_prog_alloc(isl_ctx
 	space = isl_space_map_from_set(space);
 	id = isl_map_identity(space);
 	prog->any_to_outer = isl_union_map_add_map(prog->any_to_outer, id);
-	prog->stmts = extract_stmts(ctx, scop,
-					prog->context, prog->any_to_outer);
+	prog->stmts = extract_stmts(ctx, scop, prog->any_to_outer);
 	prog->read = isl_union_map_copy(scop->reads);
 	prog->may_write = isl_union_map_copy(scop->may_writes);
 	prog->must_write = isl_union_map_copy(scop->must_writes);

Modified: polly/trunk/lib/External/ppcg/gpu.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu.h (original)
+++ polly/trunk/lib/External/ppcg/gpu.h Thu Jul 20 08:48:13 2017
@@ -2,11 +2,59 @@
 #define _GPU_H
 
 #include <isl/ast.h>
+#include <isl/id.h>
 #include <isl/id_to_ast_expr.h>
 
+#include <pet.h>
+
 #include "ppcg.h"
 #include "ppcg_options.h"
 
+/* An access to an outer array element or an iterator.
+ * Accesses to iterators have an access relation that maps to an unnamed space.
+ * An access may be both read and write.
+ * If the access relation is empty, then the output dimension may
+ * not be equal to the dimension of the corresponding array.
+ */
+struct gpu_stmt_access {
+	/* Access reads elements */
+	int read;
+	/* Access writes elements */
+	int write;
+	/* All writes are definite writes. */
+	int exact_write;
+	/* Is a single, fixed element being accessed? */
+	isl_bool fixed_element;
+	/* The number of index expressions specified in the access. */
+	int n_index;
+
+	/* May access relation */
+	isl_map *access;
+	/* May access relation with as domain a mapping from iteration domain
+	 * to a reference identifier.
+	 */
+	isl_map *tagged_access;
+	/* The reference id of the corresponding pet_expr. */
+	isl_id *ref_id;
+
+	struct gpu_stmt_access *next;
+};
+
+/* A representation of a user statement.
+ * "stmt" points to the corresponding pet statement.
+ * "id" is the identifier of the instance set of the statement.
+ * "accesses" is a linked list of accesses performed by the statement.
+ * If the statement has been killed, i.e., if it will not be scheduled,
+ * then this linked list may be empty even if the actual statement does
+ * perform accesses.
+ */
+struct gpu_stmt {
+	isl_id *id;
+	struct pet_stmt *stmt;
+
+	struct gpu_stmt_access *accesses;
+};
+
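A typical consumer simply walks this list; for example (hypothetical
helper):

/* Hypothetical helper: count the read accesses of "stmt" by
 * walking the linked list declared above.
 */
static int count_read_accesses(struct gpu_stmt *stmt)
{
	int n = 0;
	struct gpu_stmt_access *access;

	for (access = stmt->accesses; access; access = access->next)
		if (access->read)
			++n;

	return n;
}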
 /* Represents an outer array possibly accessed by a gpu_prog.
  */
 struct gpu_array_info {
@@ -18,12 +66,20 @@ struct gpu_array_info {
 	int size;
 	/* Name of the array. */
 	char *name;
+	/* Declared extent of original array. */
+	isl_set *declared_extent;
+	/* AST expression for declared size of original array. */
+	isl_ast_expr *declared_size;
 	/* Extent of the array that needs to be copied. */
 	isl_set *extent;
 	/* Number of indices. */
 	unsigned n_index;
 	/* For each index, a bound on "extent" in that direction. */
-	isl_pw_aff **bound;
+	isl_multi_pw_aff *bound;
+	/* The corresponding access AST expression, if the array needs
+	 * to be allocated on the device.
+	 */
+	isl_ast_expr *bound_expr;
 
 	/* All references to this array; point to elements of a linked list. */
 	int n_ref;
@@ -38,6 +94,9 @@ struct gpu_array_info {
 	/* Are the elements of the array structures? */
 	int has_compound_element;
 
+	/* Are the elements only accessed through constant index expressions? */
+	int only_fixed_element;
+
 	/* Is the array local to the scop? */
 	int local;
 	/* Is the array local and should it be declared on the host? */
@@ -54,8 +113,6 @@ struct gpu_array_info {
 	 * It is set to NULL otherwise.
 	 */
 	isl_union_map *dep_order;
-
-        void *user;
 };
 
 /* Represents an outer array accessed by a ppcg_kernel, localized
@@ -67,8 +124,8 @@ struct gpu_array_info {
  * must be mapped to a register.
  * "global" is set if the global device memory corresponding
  * to this array is accessed by the kernel.
- * For each index i with 0 <= i < n_index,
- * bound[i] is equal to array->bound[i] specialized to the current kernel.
+ * "bound" is equal to array->bound specialized to the current kernel.
+ * "bound_expr" is the corresponding access AST expression.
  */
 struct gpu_local_array_info {
 	struct gpu_array_info *array;
@@ -80,7 +137,8 @@ struct gpu_local_array_info {
 	int global;
 
 	unsigned n_index;
-	isl_pw_aff_list *bound;
+	isl_multi_pw_aff *bound;
+	isl_ast_expr *bound_expr;
 };
 
 __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
@@ -125,7 +183,7 @@ struct gpu_prog {
 	/* A mapping from the outer arrays to all corresponding inner arrays. */
 	isl_union_map *to_inner;
 	/* A mapping from all intermediate arrays to their outer arrays,
-	 * including an identity mapping from the anoymous 1D space to itself.
+	 * including an identity mapping from the anonymous 1D space to itself.
 	 */
 	isl_union_map *any_to_outer;
 
@@ -150,17 +208,6 @@ struct gpu_gen {
 		struct gpu_types *types, void *user);
 	void *print_user;
 
-        isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
-	        isl_ast_build *build,
-        	isl_multi_pw_aff *(*fn_index)(
-	        	__isl_take isl_multi_pw_aff *mpa, isl_id *id,
-		        void *user),
-                void *user_index,
-        	isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
-		        isl_id *id, void *user),
-        void *user_expr);
-
-
 	struct gpu_prog *prog;
 	/* The generated AST. */
 	isl_ast_node *tree;
@@ -178,7 +225,7 @@ struct gpu_gen {
 	int kernel_id;
 };
 
-enum ppcg_kernel_access_type {
+enum ppcg_group_access_type {
 	ppcg_access_global,
 	ppcg_access_shared,
 	ppcg_access_private
@@ -238,7 +285,7 @@ struct ppcg_kernel_stmt {
  */
 struct ppcg_kernel_var {
 	struct gpu_array_info *array;
-	enum ppcg_kernel_access_type type;
+	enum ppcg_group_access_type type;
 	char *name;
 	isl_vec *size;
 };
@@ -262,6 +309,8 @@ struct ppcg_kernel_var {
  * refers to the x dimension.
  *
  * grid_size reflects the effective grid size.
+ * grid_size_expr contains a corresponding access AST expression, built within
+ * the context where the launch appears.
  *
  * context contains the values of the parameters and outer schedule dimensions
  * for which any statement instance in this kernel needs to be executed.
@@ -272,7 +321,14 @@ struct ppcg_kernel_var {
  * core contains the spaces of the statement domains that form
  * the core computation of the kernel.  It is used to navigate
  * the tree during the construction of the device part of the schedule
- * tree in create_kernel.
+ * tree in gpu_create_kernel.
+ *
+ * expanded_domain contains the original statement instances,
+ * i.e., those that appear in the domains of access relations,
+ * that are involved in the kernel.
+ * contraction maps those original statement instances to
+ * the statement instances that are active at the point
+ * in the schedule tree where the kernel is created.
  *
  * arrays is the set of possibly accessed outer array elements.
  *
@@ -297,10 +353,12 @@ struct ppcg_kernel_var {
  * are represented by "n_block" parameters with as names the elements
  * of "thread_ids".
  *
- * shared_schedule corresponds to the schedule dimensions of
+ * copy_schedule corresponds to the schedule dimensions of
  * the (tiled) schedule for this kernel that have been taken into account
  * for computing private/shared memory tiles.
- * shared_schedule_dim is the dimension of this schedule.
+ * The domain corresponds to the original statement instances, i.e.,
+ * those that appear in the leaves of the schedule tree.
+ * copy_schedule_dim is the dimension of this schedule.
  *
  * sync_writes contains write references that require synchronization.
  * Each reference is represented by a universe set in a space [S[i,j] -> R[]]
@@ -323,12 +381,16 @@ struct ppcg_kernel {
 	int block_dim[3];
 
 	isl_multi_pw_aff *grid_size;
+	isl_ast_expr *grid_size_expr;
 	isl_set *context;
 
 	int n_sync;
 	isl_union_set *core;
 	isl_union_set *arrays;
 
+	isl_union_pw_multi_aff *contraction;
+	isl_union_set *expanded_domain;
+
 	isl_space *space;
 
 	int n_array;
@@ -341,8 +403,8 @@ struct ppcg_kernel {
 
 	isl_union_set *block_filter;
 	isl_union_set *thread_filter;
-	isl_union_pw_multi_aff *shared_schedule;
-	int shared_schedule_dim;
+	isl_union_pw_multi_aff *copy_schedule;
+	int copy_schedule_dim;
 
 	isl_union_set *sync_writes;
 
@@ -353,6 +415,7 @@ int gpu_array_is_scalar(struct gpu_array
 int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
 int gpu_array_requires_device_allocation(struct gpu_array_info *array);
 __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
+isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
 
 struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
 void *gpu_prog_free(struct gpu_prog *prog);
@@ -365,13 +428,8 @@ int generate_gpu(isl_ctx *ctx, const cha
 		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 		struct gpu_types *types, void *user), void *user);
 
-__isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
-int has_any_permutable_node(__isl_keep isl_schedule *schedule);
-__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
-                                       __isl_take isl_schedule *schedule);
-__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
-                                       __isl_take isl_schedule *schedule);
+__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
+	__isl_take isl_schedule_node *node, int scale,
+	__isl_keep isl_multi_val *sizes);
 
-__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
-void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
 #endif

Modified: polly/trunk/lib/External/ppcg/gpu_array_tile.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_array_tile.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_array_tile.h (original)
+++ polly/trunk/lib/External/ppcg/gpu_array_tile.h Thu Jul 20 08:48:13 2017
@@ -9,7 +9,7 @@
  * if shift != NULL.
  * If so, they express that current index is such that if you add shift,
  * then the result is always a multiple of stride.
- * Let D represent the initial group->depth dimensions of the computed schedule.
+ * Let D represent the initial tile->depth dimensions of the computed schedule.
  * The spaces of "lb" and "shift" are of the form
  *
  *	D -> [b]
@@ -22,11 +22,14 @@ struct gpu_array_bound {
 	isl_aff *shift;
 };
 
-/* A tile of an array.
+/* A tile of an outer array.
  *
  * requires_unroll is set if the schedule dimensions that are mapped
  * to threads need to be unrolled for this (private) tile to be used.
  *
+ * "depth" reflects the number of schedule dimensions that affect the tile.
+ * The copying into and/or out of the tile is performed at that depth.
+ *
  * n is the dimension of the array.
  * bound is an array of size "n" representing the lower bound
  *	and size for each index.
@@ -36,12 +39,13 @@ struct gpu_array_bound {
  *
  *	{ [D[i] -> A[a]] -> T[(a + shift(i))/stride - lb(i)] }
  *
- * where D represents the initial group->depth dimensions
+ * where D represents the initial "depth" dimensions
  * of the computed schedule.
  */
 struct gpu_array_tile {
 	isl_ctx *ctx;
 	int requires_unroll;
+	int depth;
 	int n;
 	struct gpu_array_bound *bound;
 	isl_multi_aff *tiling;
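
For instance, with depth 1, lb(i) = i and unit stride, the tiling takes
the following form (a hypothetical example):

#include <isl/aff.h>

/* Hypothetical instance of the tiling map described above:
 * element A[a] is placed at T[a - lb(i)] with lb(i) = i
 * and no stride.
 */
static __isl_give isl_multi_aff *toy_tiling(isl_ctx *ctx)
{
	return isl_multi_aff_read_from_str(ctx,
		"{ [D[i] -> A[a]] -> T[(a - i)] }");
}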

Modified: polly/trunk/lib/External/ppcg/gpu_group.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_group.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_group.c (original)
+++ polly/trunk/lib/External/ppcg/gpu_group.c Thu Jul 20 08:48:13 2017
@@ -1,3 +1,16 @@
+/*
+ * Copyright 2010-2011 INRIA Saclay
+ * Copyright 2012-2014 Ecole Normale Superieure
+ * Copyright 2015      Sven Verdoolaege
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
+ * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
+ * 91893 Orsay, France
+ * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
+ */
+
 #include <isl/constraint.h>
 #include <isl/ilp.h>
 
@@ -12,10 +25,12 @@ __isl_give isl_printer *gpu_array_ref_gr
 	struct gpu_array_ref_group *group, __isl_take isl_printer *p)
 {
 	int global = 0;
+	enum ppcg_group_access_type type;
 
-	if (group->private_tile)
+	type = gpu_array_ref_group_type(group);
+	if (type == ppcg_access_private)
 		p = isl_printer_print_str(p, "private_");
-	else if (group->shared_tile)
+	else if (type == ppcg_access_shared)
 		p = isl_printer_print_str(p, "shared_");
 	else
 		global = 1;
@@ -52,19 +67,40 @@ __isl_give isl_union_map *gpu_array_ref_
 	return access;
 }
 
-/* Return the effective gpu_array_tile associated to "group" or
- * NULL if there is no such gpu_array_tile.
+/* Should this array reference group be mapped to private, shared or global
+ * memory?
  * If we have computed both a private and a shared tile, then
- * the private tile is used.
+ * the tile with the smallest depth is used.  If both have the same depth,
+ * then the private tile is used.
  */
-struct gpu_array_tile *gpu_array_ref_group_tile(
+enum ppcg_group_access_type gpu_array_ref_group_type(
 	struct gpu_array_ref_group *group)
 {
+	if (group->private_tile && group->shared_tile &&
+	    group->shared_tile->depth < group->private_tile->depth)
+		return ppcg_access_shared;
 	if (group->private_tile)
-		return group->private_tile;
+		return ppcg_access_private;
 	if (group->shared_tile)
+		return ppcg_access_shared;
+	return ppcg_access_global;
+}
+
+
+/* Return the effective gpu_array_tile associated to "group" or
+ * NULL if there is no such gpu_array_tile.
+ */
+struct gpu_array_tile *gpu_array_ref_group_tile(
+	struct gpu_array_ref_group *group)
+{
+	switch (gpu_array_ref_group_type(group)) {
+	case ppcg_access_global:
+		return NULL;
+	case ppcg_access_shared:
 		return group->shared_tile;
-	return NULL;
+	case ppcg_access_private:
+		return group->private_tile;
+	}
 }
 
 /* Does the tile associated to "group" require unrolling of the schedule
@@ -371,11 +407,15 @@ static int compute_array_dim_size(struct
  *
  * We project the accesses on each index in turn and look for a parametric
  * offset such that the size is constant.
+ *
+ * tile->depth is initialized to the input dimension of the computed bounds.
  */
 static int can_tile(__isl_keep isl_map *access, struct gpu_array_tile *tile)
 {
 	int i;
 
+	tile->depth = isl_map_dim(access, isl_dim_in);
+
 	for (i = 0; i < tile->n; ++i) {
 		isl_map *access_i;
 		isl_basic_map *hull;
@@ -399,9 +439,12 @@ static int can_tile(__isl_keep isl_map *
  * kernel_depth is the schedule depth where the kernel launch will
  * be introduced, i.e., it is the depth of the band that is mapped
  * to blocks.
+ * shared_depth is the schedule depth at which the copying to/from
+ * shared memory is computed.  The copy operation may then
+ * later be hoisted to a higher level.
  * thread_depth is the schedule depth where the thread mark is located,
  * i.e., it is the depth of the band that is mapped to threads and also
- * the schedule depth at which the copying to/from shared/private memory
+ * the schedule depth at which the copying to/from private memory
  * is computed.  The copy operation may then later be hoisted to
  * a higher level.
  * n_thread is the number of schedule dimensions in the band that
@@ -410,20 +453,27 @@ static int can_tile(__isl_keep isl_map *
  * of dimension thread_depth + n_thread) and encodes the mapping
  * to thread identifiers (as parameters).
  * host_sched contains the kernel_depth dimensions of the host schedule.
- * shared_sched contains the first thread_depth dimensions of the
+ * shared_sched contains the first shared_depth dimensions of the
+ * kernel schedule.
+ * copy_sched contains the first thread_depth dimensions of the
  * kernel schedule.
  * thread_sched contains the first (thread_depth + n_thread) dimensions
  * of the kernel schedule.
  * full_sched is a union_map representation of the entire kernel schedule.
+ * The schedules are all formulated in terms of the original statement
+ * instances, i.e., those that appear in the domains of the access
+ * relations.
  */
 struct gpu_group_data {
 	struct ppcg_scop *scop;
 	int kernel_depth;
+	int shared_depth;
 	int thread_depth;
 	int n_thread;
 	isl_set *privatization;
 	isl_union_map *host_sched;
 	isl_union_map *shared_sched;
+	isl_union_map *copy_sched;
 	isl_union_map *thread_sched;
 	isl_union_map *full_sched;
 };
@@ -466,6 +516,7 @@ static __isl_give isl_map *next(__isl_ta
 static int access_is_coalesced(struct gpu_group_data *data,
 	__isl_keep isl_union_map *access)
 {
+	int dim;
 	isl_space *space;
 	isl_set *accessed;
 	isl_map *access_map;
@@ -481,7 +532,11 @@ static int access_is_coalesced(struct gp
 
 	space = isl_map_get_space(access_map);
 	space = isl_space_range(space);
-	next_element = next(space, isl_space_dim(space, isl_dim_set) - 1);
+	dim = isl_space_dim(space, isl_dim_set);
+	if (dim == 0)
+		next_element = isl_map_empty(isl_space_map_from_set(space));
+	else
+		next_element = next(space, dim - 1);
 
 	accessed = isl_map_range(isl_map_copy(access_map));
 	map = isl_map_copy(next_element);
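
The underlying idea, sketched on invented relations: consecutive values
of the innermost (thread) dimension should access consecutive array
elements.

#include <isl/map.h>

/* Sketch of the coalescing idea on invented relations: map the
 * elements accessed by thread t to those accessed by thread t + 1
 * and check that this only ever advances to the next element.
 */
static isl_bool toy_coalesced(isl_ctx *ctx)
{
	isl_map *access, *next_thread, *next_element, *map;
	isl_bool coalesced;

	access = isl_map_read_from_str(ctx, "{ [i, t] -> A[i, t] }");
	next_thread = isl_map_read_from_str(ctx,
		"{ [i, t] -> [i, t + 1] }");
	next_element = isl_map_read_from_str(ctx,
		"{ A[i, j] -> A[i, j + 1] }");

	map = isl_map_apply_range(next_thread, isl_map_copy(access));
	map = isl_map_apply_range(isl_map_reverse(access), map);
	coalesced = isl_map_is_subset(map, next_element);

	isl_map_free(map);
	isl_map_free(next_element);

	return coalesced;
}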
@@ -576,7 +631,7 @@ static int compute_tile_depth(struct gpu
 {
 	int i, j;
 
-	for (j = data->thread_depth - 1; j >= data->kernel_depth; --j) {
+	for (j = tile->depth - 1; j >= data->kernel_depth; --j) {
 		for (i = 0; i < tile->n; ++i) {
 			isl_aff *lb;
 			isl_aff *shift;
@@ -598,57 +653,156 @@ static int compute_tile_depth(struct gpu
 	return ++j;
 }
 
-/* Adjust the fields of "tile" to reflect the new input dimension "new_dim",
- * where "old_dim" is the old dimension.
- * The dimension beyond "new_dim" are assumed not to affect the tile,
+/* Return the lowest depth between data->kernel_depth and data->thread_depth
+ * at which every array element accessed through "acc" is accessed
+ * by a single thread.  The input dimension of "acc" is
+ * data->thread_depth + data->n_thread, where the final data->n_thread
+ * dimensions are those that will be mapped to threads.
+ * If the values for these dimensions are uniquely determined
+ * by the array index and a given number of outer dimensions, then
+ * there is only one thread accessing that array element within those
+ * outer dimensions.
+ *
+ * The input space of "acc" is first split up, such that it has the form
+ *
+ *	[O -> T] -> A
+ *
+ * with O the outer dimensions, T the dimensions that will be mapped to threads
+ * and A the array index.
+ *
+ * Then the positions of T and A are interchanged to simplify the test
+ * whether T uniquely depends on O and A.
+ * In particular, the above access relation is first combined with
+ *
+ *	[O -> T] -> T
+ *
+ * to form
+ *
+ *	[O -> T] -> [A -> T]
+ *
+ * from which
+ *
+ *	O -> [A -> T]
+ *
+ * is extracted, which is then uncurried to
+ *
+ *	[O -> A] -> T
+ *
+ * Finally, the final dimensions of O are projected out one by one
+ * until T is no longer uniquely determined by A and the remaining
+ * dimensions in O.  The value returned is that of the last dimension
+ * that was successfully projected out.
+ * Note that there is no need to test whether [O -> A] -> T itself
+ * is single-valued as that was already tested in access_is_bijective.
+ */
+static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data,
+	__isl_keep isl_map *acc)
+{
+	int i;
+	isl_space *space;
+	isl_map *map;
+	isl_bool sv;
+
+	if (data->thread_depth == data->kernel_depth)
+		return data->thread_depth;
+
+	acc = isl_map_copy(acc);
+
+	space = isl_map_get_space(acc);
+	space = isl_space_params(space);
+	space = isl_space_set_from_params(space);
+	space = isl_space_add_dims(space, isl_dim_set, data->thread_depth);
+	space = isl_space_from_domain(space);
+	space = isl_space_add_dims(space, isl_dim_out, data->n_thread);
+	space = isl_space_wrap(space);
+	map = isl_set_flatten_map(isl_set_universe(space));
+	acc = isl_map_apply_range(map, acc);
+
+	space = isl_space_domain(isl_map_get_space(acc));
+	map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
+	acc = isl_map_range_product(acc, map);
+	acc = isl_map_domain_factor_domain(acc);
+	acc = isl_map_uncurry(acc);
+
+	for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) {
+		acc = isl_map_project_out(acc, isl_dim_in, i, 1);
+		sv = isl_map_is_single_valued(acc);
+		if (sv < 0)
+			return -1;
+		if (!sv)
+			break;
+	}
+
+	isl_map_free(acc);
+
+	return ++i;
+}
+
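A tiny worked instance of the final projection loop, on an invented
relation:

#include <isl/map.h>

/* Sketch: in the [O -> A] -> T form, the access A[t] determines
 * the thread dimension t from the array index alone, so the outer
 * dimension o can be projected out while keeping the relation
 * single-valued.  The relation is invented for illustration.
 */
static void toy_unique_depth(isl_ctx *ctx)
{
	isl_map *acc;
	isl_bool sv;

	acc = isl_map_read_from_str(ctx, "{ [o, a] -> [t] : t = a }");
	acc = isl_map_project_out(acc, isl_dim_in, 0, 1);
	sv = isl_map_is_single_valued(acc);	/* isl_bool_true */
	isl_map_free(acc);
	(void) sv;
}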
+/* Adjust the fields of "tile" to reflect the new input dimension "depth".
+ * The dimensions beyond "depth" are assumed not to affect the tile,
  * so they can simply be dropped.
  */
-static int tile_adjust_depth(struct gpu_array_tile *tile,
-	int old_dim, int new_dim)
+static int tile_adjust_depth(struct gpu_array_tile *tile, int depth)
 {
 	int i;
 
-	if (old_dim == new_dim)
+	if (tile->depth == depth)
 		return 0;
 
 	for (i = 0; i < tile->n; ++i) {
 		tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb,
-					isl_dim_in, new_dim, old_dim - new_dim);
+					isl_dim_in, depth, tile->depth - depth);
 		if (!tile->bound[i].lb)
 			return -1;
 		if (!tile->bound[i].shift)
 			continue;
 		tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift,
-					isl_dim_in, new_dim, old_dim - new_dim);
+					isl_dim_in, depth, tile->depth - depth);
 		if (!tile->bound[i].shift)
 			return -1;
 	}
 
+	tile->depth = depth;
+
 	return 0;
 }
 
 /* Determine the number of schedule dimensions that affect the offset of the
- * shared or private tile and store the result in group->depth, with
+ * shared or private tile "tile" and store the result in tile->depth, with
  * a lower bound of data->kernel_depth.
- * If there is no tile defined on the array reference group,
- * then set group->depth to data->thread_depth.
- * Also adjust the fields of the tile to only refer to the group->depth
+ * Also adjust the fields of the tile to only refer to the tile->depth
  * outer schedule dimensions.
  */
-static int set_depth(struct gpu_group_data *data,
-	struct gpu_array_ref_group *group)
+static isl_stat tile_set_depth(struct gpu_group_data *data,
+	struct gpu_array_tile *tile)
 {
-	struct gpu_array_tile *tile;
+	if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0)
+		return isl_stat_error;
 
-	group->depth = data->thread_depth;
+	return isl_stat_ok;
+}
 
-	tile = gpu_array_ref_group_tile(group);
-	if (!tile)
-		return 0;
+/* Determine the number of schedule dimensions that affect the offset of the
+ * shared tile and store the minimum of the private and shared tile depth
+ * in group->min_depth, with a lower bound of data->kernel_depth.
+ * If there is no tile defined on the array reference group,
+ * then set group->min_depth to data->thread_depth.
+ */
+static int set_depth(struct gpu_group_data *data,
+	struct gpu_array_ref_group *group)
+{
+	group->min_depth = data->thread_depth;
 
-	group->depth = compute_tile_depth(data, tile);
-	if (tile_adjust_depth(tile, data->thread_depth, group->depth) < 0)
-		return -1;
+	if (group->private_tile) {
+		if (group->private_tile->depth < group->min_depth)
+			group->min_depth = group->private_tile->depth;
+	}
+	if (group->shared_tile) {
+		if (tile_set_depth(data, group->shared_tile) < 0)
+			return -1;
+		if (group->shared_tile->depth < group->min_depth)
+			group->min_depth = group->shared_tile->depth;
+	}
 
 	return 0;
 }
@@ -666,7 +820,7 @@ static int populate_array_references(str
 {
 	int i;
 	int n;
-	isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
+	isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched);
 
 	n = 0;
 	for (i = 0; i < local->array->n_ref; ++i) {
@@ -678,7 +832,7 @@ static int populate_array_references(str
 		map = isl_map_copy(access->access);
 		umap = isl_union_map_from_map(map);
 		umap = isl_union_map_apply_domain(umap,
-				isl_union_map_copy(data->shared_sched));
+				isl_union_map_copy(data->copy_sched));
 
 		if (isl_union_map_is_empty(umap)) {
 			isl_union_map_free(umap);
@@ -727,7 +881,7 @@ struct gpu_array_ref_group *gpu_array_re
 }
 
 /* Check if the access relations of group1 and group2 overlap within
- * shared_sched.
+ * copy_sched.
  */
 static int accesses_overlap(struct gpu_array_ref_group *group1,
 	struct gpu_array_ref_group *group2)
@@ -846,6 +1000,24 @@ static int check_requires_unroll(struct
 	return !bijective;
 }
 
+/* Map the domain of "access" to the outer data->shared_depth
+ * schedule dimensions.  When data->shared_depth is equal to
+ * data->thread_depth, this result is already available in group->access.
+ */
+static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group,
+	__isl_keep isl_union_map *access, struct gpu_group_data *data)
+{
+	isl_union_map *shared;
+
+	if (data->shared_depth == data->thread_depth)
+		return isl_map_copy(group->access);
+
+	shared = isl_union_map_copy(access);
+	shared = isl_union_map_apply_domain(shared,
+			isl_union_map_copy(data->shared_sched));
+	return isl_map_from_union_map(shared);
+}
+
 /* Compute the private and/or shared memory tiles for the array
  * reference group "group" of array "array".
  * Return 0 on success and -1 on error.
@@ -883,6 +1055,15 @@ static int check_requires_unroll(struct
  * and then they could be allowed to access the same memory elements,
  * but our check does not allow this situation.
  *
+ * For private memory tiles, the number of schedule dimensions that
+ * affect the offset is computed and stored in tile->depth, with
+ * a lower bound of data->kernel_depth.  If this depth is smaller
+ * than the minimal depth that still ensures that every element
+ * is accessed by a single thread, then the depth is raised
+ * to this minimal depth.
+ * The fields of the tile are then adjusted to only refer to the tile->depth
+ * outer schedule dimensions.
+ *
  * We also check that the index expression only depends on parallel
  * loops.  That way, we can move those loops innermost and unroll them.
  * Again, we use a test that is stricter than necessary.
@@ -901,7 +1082,7 @@ static int check_requires_unroll(struct
  * that are forcibly mapped to private memory.
  *
  * If the array is marked force_private, then we bypass all checks
- * and assume we can (and should) use registers.
+ * and assume we can (and should) use registers only.
  *
  * If it turns out we can (or have to) use registers, we compute
  * the private memory tile size using can_tile, after introducing a dependence
@@ -916,11 +1097,12 @@ static int compute_group_bounds_core(str
 	int no_reuse, coalesced;
 	isl_map *acc;
 	int force_private = group->local_array->force_private;
-	int use_shared = kernel->options->use_shared_memory &&
+	int use_shared = !force_private && kernel->options->use_shared_memory &&
 				data->n_thread > 0;
 	int use_private = force_private || kernel->options->use_private_memory;
 	int r = 0;
 	int requires_unroll;
+	int unique_depth;
 
 	if (!use_shared && !use_private)
 		return 0;
@@ -947,11 +1129,13 @@ static int compute_group_bounds_core(str
 	if (use_shared && (!no_reuse || !coalesced)) {
 		group->shared_tile = gpu_array_tile_create(ctx,
 							group->array->n_index);
+		acc = shared_access(group, access, data);
 		if (!group->shared_tile)
 			r = -1;
-		else if (!can_tile(group->access, group->shared_tile))
+		else if (!can_tile(acc, group->shared_tile))
 			group->shared_tile =
 					gpu_array_tile_free(group->shared_tile);
+		isl_map_free(acc);
 	}
 
 	if (r < 0 || (!force_private && (!use_private || no_reuse))) {
@@ -969,11 +1153,13 @@ static int compute_group_bounds_core(str
 		return 0;
 	}
 
+	unique_depth = compute_accessed_by_single_thread_depth(data, acc);
+
 	acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization));
 	acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth,
 								data->n_thread);
 	requires_unroll = check_requires_unroll(data, acc, force_private);
-	if (requires_unroll < 0 ||
+	if (unique_depth < 0 || requires_unroll < 0 ||
 	    (requires_unroll && kernel->any_force_private)) {
 		isl_map_free(acc);
 		return requires_unroll < 0 ? -1 : 0;
@@ -990,6 +1176,15 @@ static int compute_group_bounds_core(str
 
 	isl_map_free(acc);
 
+	if (group->private_tile) {
+		struct gpu_array_tile *tile = group->private_tile;
+		int tile_depth = compute_tile_depth(data, tile);
+		if (tile_depth < unique_depth)
+			tile_depth = unique_depth;
+		if (tile_adjust_depth(tile, tile_depth) < 0)
+			return -1;
+	}
+
 	if (force_private && !group->private_tile)
 		isl_die(ctx, isl_error_internal,
 			"unable to map array reference group to registers",
@@ -1071,7 +1266,7 @@ static int group_overlapping_writes(stru
 }
 
 /* Check if the access relations of group1 and group2 overlap within
- * the outermost min(group1->depth, group2->depth) loops.
+ * the outermost min(group1->min_depth, group2->min_depth) loops.
  */
 static int depth_accesses_overlap(struct gpu_array_ref_group *group1,
 	struct gpu_array_ref_group *group2)
@@ -1081,9 +1276,9 @@ static int depth_accesses_overlap(struct
 	int empty;
 	isl_map *map_i, *map_j, *map;
 
-	depth = group1->depth;
-	if (group2->depth < depth)
-		depth = group2->depth;
+	depth = group1->min_depth;
+	if (group2->min_depth < depth)
+		depth = group2->min_depth;
 	map_i = isl_map_copy(group1->access);
 	dim = isl_map_dim(map_i, isl_dim_in);
 	map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth);
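
On invented accesses, the test looks as follows:

#include <isl/map.h>

/* Sketch of the overlap test on invented accesses: beyond the
 * common depth (here 1), the schedule dimensions are eliminated
 * before intersecting the access relations.
 */
static isl_bool toy_depth_overlap(isl_ctx *ctx)
{
	int depth = 1, dim;
	isl_map *map_i, *map_j, *map;
	isl_bool empty;

	map_i = isl_map_read_from_str(ctx, "{ [i, j] -> A[i] }");
	map_j = isl_map_read_from_str(ctx, "{ [i, j] -> A[i + j] }");
	dim = isl_map_dim(map_i, isl_dim_in);
	map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth);
	map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth);
	map = isl_map_intersect(map_i, map_j);
	empty = isl_map_is_empty(map);	/* false: the accesses overlap */
	isl_map_free(map);

	return empty;
}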
@@ -1150,25 +1345,17 @@ static int group_common_shared_memory_ti
 {
 	int i, j;
 	int recompute_overlap = 0;
-	isl_ctx *ctx = isl_space_get_ctx(array->space);
 
 	for (i = 0; i < n; ++i) {
 		if (!groups[i]->shared_tile)
 			continue;
 		for (j = n - 1; j > i; --j) {
-			isl_map *map;
-			int empty;
 			struct gpu_array_ref_group *group;
 
 			if (!groups[j]->shared_tile)
 				continue;
 
-			map = isl_map_intersect(isl_map_copy(groups[i]->access),
-					    isl_map_copy(groups[j]->access));
-			empty = isl_map_is_empty(map);
-			isl_map_free(map);
-
-			if (empty)
+			if (!depth_accesses_overlap(groups[i], groups[j]))
 				continue;
 
 			group = join_groups(groups[i], groups[j]);
@@ -1184,8 +1371,8 @@ static int group_common_shared_memory_ti
 				continue;
 			}
 
-			if (group->depth < groups[i]->depth ||
-			    group->depth < groups[j]->depth)
+			if (group->min_depth < groups[i]->min_depth ||
+			    group->min_depth < groups[j]->min_depth)
 				recompute_overlap = 1;
 			gpu_array_ref_group_free(groups[i]);
 			gpu_array_ref_group_free(groups[j]);
@@ -1208,7 +1395,7 @@ static int group_common_shared_memory_ti
 static void set_array_groups(struct gpu_local_array_info *array,
 	int n, struct gpu_array_ref_group **groups)
 {
-	int i, j;
+	int i;
 
 	array->n_group = n;
 	array->groups = groups;
@@ -1251,7 +1438,8 @@ static int join_all_groups(int n, struct
  * If the array contains structures, then we compute a single
  * reference group without trying to find any tiles
  * since we do not map such arrays to private or shared
- * memory.
+ * memory.  The only exception is when those arrays of structures
+ * are required to be mapped to private memory.
  */
 static int group_array_references(struct ppcg_kernel *kernel,
 	struct gpu_local_array_info *local, struct gpu_group_data *data)
@@ -1268,7 +1456,7 @@ static int group_array_references(struct
 
 	n = populate_array_references(local, groups, data);
 
-	if (local->array->has_compound_element) {
+	if (local->array->has_compound_element && !local->force_private) {
 		n = join_all_groups(n, groups);
 		set_array_groups(local, n, groups);
 		return 0;
@@ -1295,42 +1483,51 @@ static int group_array_references(struct
 	return -1;
 }
 
-/* For each scalar in the input program, check if there are any
- * order dependences active inside the current kernel, within
- * the same iteration of "host_schedule".
- * If so, mark the scalar as force_private so that it will be
- * mapped to a register.
+/* For each array in the input program that can be mapped to private memory,
+ * check if there are any order dependences active inside the current kernel,
+ * within the same iteration of the host schedule, i.e., the prefix
+ * schedule at "node".
+ * If so, mark the array as force_private so that its reference groups will be
+ * mapped to registers.
+ *
+ * Note that the arrays that cannot be mapped to private memory have
+ * had their order dependences added to prog->array_order and
+ * subsequently to the coincidence constraints.
  */
-static void check_scalar_live_ranges_in_host(struct ppcg_kernel *kernel,
-	__isl_take isl_union_map *host_schedule)
+static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel,
+	__isl_keep isl_schedule_node *node)
 {
 	int i;
-	isl_union_map *sched;
 	isl_union_set *domain;
-	isl_union_map *same_host_iteration;
+	isl_multi_union_pw_aff *prefix;
+	isl_union_pw_multi_aff *contraction;
 
-	kernel->any_force_private = 0;
+	if (!kernel->options->live_range_reordering)
+		return;
 
-	sched = isl_union_map_universe(isl_union_map_copy(host_schedule));
-	domain = isl_union_map_domain(sched);
+	kernel->any_force_private = 0;
 
-	same_host_iteration = isl_union_map_apply_range(host_schedule,
-		    isl_union_map_reverse(isl_union_map_copy(host_schedule)));
+	prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
+	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
+	prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix,
+								contraction);
+	domain = isl_union_set_copy(kernel->expanded_domain);
+	domain = isl_union_set_universe(domain);
 
 	for (i = 0; i < kernel->n_array; ++i) {
 		struct gpu_local_array_info *local = &kernel->array[i];
 		isl_union_map *order;
 
 		local->force_private = 0;
-		if (local->array->n_index != 0)
+		if (!gpu_array_can_be_private(local->array))
 			continue;
 		order = isl_union_map_copy(local->array->dep_order);
 		order = isl_union_map_intersect_domain(order,
 						    isl_union_set_copy(domain));
 		order = isl_union_map_intersect_range(order,
 						    isl_union_set_copy(domain));
-		order = isl_union_map_intersect(order,
-				    isl_union_map_copy(same_host_iteration));
+		order = isl_union_map_eq_at_multi_union_pw_aff(order,
+					isl_multi_union_pw_aff_copy(prefix));
 		if (!isl_union_map_is_empty(order)) {
 			local->force_private = 1;
 			kernel->any_force_private = 1;
@@ -1338,45 +1535,40 @@ static void check_scalar_live_ranges_in_
 		isl_union_map_free(order);
 	}
 
-	isl_union_map_free(same_host_iteration);
+	isl_multi_union_pw_aff_free(prefix);
 	isl_union_set_free(domain);
 }
 
-/* For each scalar in the input program, check if there are any
- * order dependences active inside the current kernel, within
- * the same iteration of the host schedule, i.e., the prefix
- * schedule at "node".
- * If so, mark the scalar as force_private so that it will be
- * mapped to a register.
+/* Expand the domain of the schedule "s" by plugging in
+ * the contraction "contraction" and return the result.
  */
-static void check_scalar_live_ranges(struct ppcg_kernel *kernel,
-	__isl_keep isl_schedule_node *node)
+static __isl_give isl_union_map *expand(__isl_take isl_union_map *s,
+	__isl_keep isl_union_pw_multi_aff *contraction)
 {
-	isl_union_map *sched;
-
-	if (!kernel->options->live_range_reordering)
-		return;
-
-	sched = isl_schedule_node_get_prefix_schedule_union_map(node);
-
-	check_scalar_live_ranges_in_host(kernel, sched);
+	contraction = isl_union_pw_multi_aff_copy(contraction);
+	s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction);
+	return s;
 }
 
 /* Create a set of dimension data->thread_depth + data->n_thread
  * that equates the residue of the final data->n_thread dimensions
- * modulo the "sizes" to the thread identifiers.
- * "space" is a parameter space containing the thread identifiers.
+ * modulo the kernel->block_dim sizes to the thread identifiers.
  * Store the computed set in data->privatization.
+ *
+ * The construction starts with the space of kernel->thread_filter,
+ * which is known to reference all thread identifiers.
  */
 static void compute_privatization(struct gpu_group_data *data,
-	__isl_take isl_space *space, int *sizes)
+	struct ppcg_kernel *kernel)
 {
 	int i;
 	isl_ctx *ctx;
+	isl_space *space;
 	isl_local_space *ls;
 	isl_set *set;
 
 	ctx = isl_union_map_get_ctx(data->shared_sched);
+	space = isl_union_set_get_space(kernel->thread_filter);
 	space = isl_space_set_from_params(space);
 	space = isl_space_add_dims(space, isl_dim_set,
 				    data->thread_depth + data->n_thread);
@@ -1388,15 +1580,16 @@ static void compute_privatization(struct
 		isl_aff *aff, *aff2;
 		isl_constraint *c;
 		isl_val *v;
-		char name[20];
+		isl_id *id;
 		int pos;
 
 		aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
 					isl_dim_set, data->thread_depth + i);
-		v = isl_val_int_from_si(ctx, sizes[i]);
+		v = isl_val_int_from_si(ctx, kernel->block_dim[i]);
 		aff = isl_aff_mod_val(aff, v);
-		snprintf(name, sizeof(name), "t%d", i);
-		pos = isl_set_find_dim_by_name(set, isl_dim_param, name);
+		id = isl_id_list_get_id(kernel->thread_ids, i);
+		pos = isl_set_find_dim_by_id(set, isl_dim_param, id);
+		isl_id_free(id);
 		aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls),
 					isl_dim_param, pos);
 		aff = isl_aff_sub(aff, aff2);
@@ -1408,8 +1601,24 @@ static void compute_privatization(struct
 	data->privatization = set;
 }
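
For a single thread dimension of size 32, the computed set has the
following shape (thread identifier name hypothetical):

#include <isl/set.h>

/* Hypothetical shape of data->privatization for thread_depth 1,
 * n_thread 1 and block_dim[0] == 32, with thread identifier t0.
 */
static __isl_give isl_set *toy_privatization(isl_ctx *ctx)
{
	return isl_set_read_from_str(ctx,
		"[t0] -> { [i0, i1] : i1 mod 32 = t0 }");
}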
 
+/* Return the prefix schedule at "node" as a relation
+ * between domain elements and schedule dimensions after detecting
+ * equalities in this relation.
+ */
+static __isl_give isl_union_map *prefix_with_equalities(
+	__isl_keep isl_schedule_node *node)
+{
+	isl_union_map *schedule;
+
+	schedule = isl_schedule_node_get_prefix_schedule_relation(node);
+	schedule = isl_union_map_detect_equalities(schedule);
+
+	return schedule;
+}
+
 /* Group references of all arrays in "kernel".
  * "node" points to the kernel mark.
+ * The mapping to shared memory is computed at the "shared" mark.
  *
  * We first extract all required schedule information into
  * a gpu_group_data structure and then consider each array
@@ -1420,10 +1629,10 @@ int gpu_group_references(struct ppcg_ker
 {
 	int i;
 	int r = 0;
-	isl_space *space;
+	isl_union_pw_multi_aff *contraction;
 	struct gpu_group_data data;
 
-	check_scalar_live_ranges(kernel, node);
+	check_can_be_private_live_ranges(kernel, node);
 
 	data.scop = kernel->prog->scop;
 
@@ -1431,26 +1640,42 @@ int gpu_group_references(struct ppcg_ker
 	data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node);
 
 	node = isl_schedule_node_copy(node);
-	node = gpu_tree_move_down_to_thread(node, kernel->core);
-	data.shared_sched =
-		isl_schedule_node_get_prefix_schedule_relation(node);
-	data.shared_sched = isl_union_map_detect_equalities(data.shared_sched);
+	node = gpu_tree_move_down_to_shared(node, kernel->core);
+	data.shared_depth = isl_schedule_node_get_schedule_depth(node);
+	data.shared_sched = prefix_with_equalities(node);
 
+	node = gpu_tree_move_down_to_thread(node, kernel->core);
 	node = isl_schedule_node_child(node, 0);
 	data.thread_depth = isl_schedule_node_get_schedule_depth(node);
 	data.n_thread = isl_schedule_node_band_n_member(node);
-	data.thread_sched = isl_union_map_copy(data.shared_sched);
+	if (data.thread_depth == data.shared_depth)
+		data.copy_sched = isl_union_map_copy(data.shared_sched);
+	else
+		data.copy_sched = prefix_with_equalities(node);
+	data.thread_sched = isl_union_map_copy(data.copy_sched);
 	data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
 		isl_schedule_node_band_get_partial_schedule_union_map(node));
 	data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);
+
+	contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
+	data.host_sched = expand(data.host_sched, contraction);
+	data.shared_sched = expand(data.shared_sched, contraction);
+	if (data.thread_depth == data.shared_depth) {
+		isl_union_map_free(data.copy_sched);
+		data.copy_sched = isl_union_map_copy(data.shared_sched);
+	} else {
+		data.copy_sched = expand(data.copy_sched, contraction);
+	}
+	data.thread_sched = expand(data.thread_sched, contraction);
+	isl_union_pw_multi_aff_free(contraction);
+
 	node = isl_schedule_node_child(node, 0);
 	data.full_sched = isl_union_map_copy(data.thread_sched);
 	data.full_sched = isl_union_map_flat_range_product(data.full_sched,
 		isl_schedule_node_get_subtree_schedule_union_map(node));
 	isl_schedule_node_free(node);
 
-	space = isl_union_set_get_space(kernel->thread_filter);
-	compute_privatization(&data, space, kernel->block_dim);
+	compute_privatization(&data, kernel);
 
 	for (i = 0; i < kernel->n_array; ++i) {
 		r = group_array_references(kernel, &kernel->array[i], &data);
@@ -1460,6 +1685,7 @@ int gpu_group_references(struct ppcg_ker
 
 	isl_union_map_free(data.host_sched);
 	isl_union_map_free(data.shared_sched);
+	isl_union_map_free(data.copy_sched);
 	isl_union_map_free(data.thread_sched);
 	isl_union_map_free(data.full_sched);
 	isl_set_free(data.privatization);
@@ -1471,7 +1697,7 @@ int gpu_group_references(struct ppcg_ker
  *
  *	{ D -> A }
  *
- * where D represents the first group->depth schedule dimensions
+ * where D represents the first tile->depth schedule dimensions
  * and A represents the array, construct an isl_multi_aff
  *
  *	{ [D[i] -> A[a]] -> A'[a'] }
@@ -1542,7 +1768,7 @@ static __isl_give isl_multi_aff *strided
  *
  *	{ [D[i] -> A[a]] -> T[t] }
  *
- * where D represents the first group->depth schedule dimensions,
+ * where D represents the first tile->depth schedule dimensions,
  * A represents the global array and T represents the shared or
  * private memory tile.  The name of T is the name of the local
  * array.
@@ -1558,24 +1784,19 @@ static __isl_give isl_multi_aff *strided
 void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group)
 {
 	int i;
-	int dim;
 	struct gpu_array_tile *tile;
-	struct gpu_array_info *array = group->array;
 	isl_space *space;
 	isl_multi_aff *tiling, *lb, *insert_array;
 	isl_printer *p;
 	char *local_name;
 
-	tile = group->private_tile;
-	if (!tile)
-		tile = group->shared_tile;
+	tile = gpu_array_ref_group_tile(group);
 	if (!tile)
 		return;
 
 	space = isl_map_get_space(group->access);
-	dim = isl_space_dim(space, isl_dim_in);
-	space = isl_space_drop_dims(space, isl_dim_in, group->depth,
-							dim - group->depth);
+	space = isl_space_from_range(isl_space_range(space));
+	space = isl_space_add_dims(space, isl_dim_in, tile->depth);
 	insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
 
 	for (i = 0; i < tile->n; ++i)

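For readers unfamiliar with the isl calls above, the following standalone
sketch (not part of the patch; the relation string is invented) shows the
effect of the detect-equalities step wrapped by prefix_with_equalities:
equalities that are only implied by a relation's description are made
explicit, so that later steps can exploit them.

#include <isl/ctx.h>
#include <isl/union_map.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_map *prefix;

	/* A toy prefix schedule relation in which the second output
	 * dimension is implicitly equal to the first input dimension. */
	prefix = isl_union_map_read_from_str(ctx,
		"{ S[i, j] -> [i, o] : o >= i and o <= i and 0 <= j < 4 }");
	/* Make such implicit equalities explicit, as
	 * prefix_with_equalities does for the real prefix schedule. */
	prefix = isl_union_map_detect_equalities(prefix);
	isl_union_map_dump(prefix);
	isl_union_map_free(prefix);
	isl_ctx_free(ctx);
	return 0;
}
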
Modified: polly/trunk/lib/External/ppcg/gpu_group.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_group.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_group.h (original)
+++ polly/trunk/lib/External/ppcg/gpu_group.h Thu Jul 20 08:48:13 2017
@@ -10,10 +10,6 @@
  * Otherwise, it is accessed from global memory.
  * Note that if both private_tile and shared_tile are set, then shared_tile
  * is only used inside group_common_shared_memory_tile.
- * "depth" reflects the number of schedule dimensions that affect the tile
- * (private_tile if set; shared_tile if shared_tile is set and private_tile
- * is not).  The copying into and/or out of the tile is performed at that
- * depth.
  */
 struct gpu_array_ref_group {
 	/* The references in this group access this local array. */
@@ -24,18 +20,20 @@ struct gpu_array_ref_group {
 	int nr;
 
 	/* The following fields are used during the construction of the groups.
-	 * access is the combined access relation relative to the shared
+	 * access is the combined access relation relative to the private
 	 * memory tiling.  In particular, the domain of the map corresponds
-	 * to the first shared_schedule_dim dimensions of the kernel schedule.
+	 * to the first thread_depth dimensions of the kernel schedule.
 	 * write is set if any access in the group is a write.
 	 * exact_write is set if all writes are definite writes.
 	 * slice is set if there is at least one access in the group
 	 * that refers to more than one element.
+	 * "min_depth" is the minimum of the tile depths and thread_depth.
 	 */
 	isl_map *access;
 	int write;
 	int exact_write;
 	int slice;
+	int min_depth;
 
 	/* The shared memory tile, NULL if none. */
 	struct gpu_array_tile *shared_tile;
@@ -43,8 +41,6 @@ struct gpu_array_ref_group {
 	/* The private memory tile, NULL if none. */
 	struct gpu_array_tile *private_tile;
 
-	int depth;
-
 	/* References in this group; point to elements of a linked list. */
 	int n_ref;
 	struct gpu_stmt_access **refs;
@@ -59,6 +55,8 @@ void gpu_array_ref_group_compute_tiling(
 __isl_give isl_union_map *gpu_array_ref_group_access_relation(
 	struct gpu_array_ref_group *group, int read, int write);
 int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group);
+enum ppcg_group_access_type gpu_array_ref_group_type(
+	struct gpu_array_ref_group *group);
 struct gpu_array_tile *gpu_array_ref_group_tile(
 	struct gpu_array_ref_group *group);
 struct gpu_array_ref_group *gpu_array_ref_group_free(

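Since the per-group "depth" field is gone, the tile now carries its own
depth and is obtained through gpu_array_ref_group_tile.  A plausible
sketch of that selection, mirroring the private-then-shared fallback
dropped from gpu_array_ref_group_compute_tiling above (sketch_group_tile
is a hypothetical name for illustration, not the real implementation,
which also has to agree with gpu_array_ref_group_type):

#include "gpu_group.h"

/* Prefer the private tile and fall back to the shared tile, if any,
 * in line with the fallback removed from
 * gpu_array_ref_group_compute_tiling. */
static struct gpu_array_tile *sketch_group_tile(
	struct gpu_array_ref_group *group)
{
	if (group->private_tile)
		return group->private_tile;
	return group->shared_tile;
}
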
Added: polly/trunk/lib/External/ppcg/gpu_hybrid.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_hybrid.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_hybrid.c (added)
+++ polly/trunk/lib/External/ppcg/gpu_hybrid.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2013      Ecole Normale Superieure
+ * Copyright 2015      Sven Verdoolaege
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege,
+ * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
+ */
+
+#include <string.h>
+
+#include <isl/val.h>
+#include <isl/space.h>
+#include <isl/union_set.h>
+#include <isl/schedule_node.h>
+
+#include "hybrid.h"
+#include "gpu_hybrid.h"
+#include "gpu_tree.h"
+#include "schedule.h"
+#include "util.h"
+
+/* Have all domain elements been filtered out before reaching
+ * the "node" position in the schedule tree?
+ */
+static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node)
+{
+	isl_union_set *domain;
+	isl_bool empty;
+
+	domain = isl_schedule_node_get_domain(node);
+	empty = isl_union_set_is_empty(domain);
+	isl_union_set_free(domain);
+
+	return empty;
+}
+
+/* Given a pointer to a phase in the result of hybrid tiling,
+ * map the phase to the device, provided the phase is non-empty.
+ * Empty phases can occur if the input schedule domain can be
+ * covered by a small number of hexagons that all belong to the same phase.
+ *
+ * The input has the following form:
+ *
+ *	M - CT - P - C - ...
+ *
+ * with M the phase marker, CT the space tiling, P the original
+ * parent band and C the original child band.
+ * The (outer dimensions of the) C band need to be mapped to threads.
+ * The (outer dimension of the) CT band needs to be mapped to blocks.
+ * The mapping to shared memory needs to be computed between the CT and
+ * the P band.
+ *
+ * The C band is first shifted to start at zero.
+ * Then the appropriate markers are introduced and a kernel is
+ * created for the tree rooted at CT.
+ * If the "unroll_gpu_tile" option is set, then the AST generator
+ * is instructed to unroll the P and C bands.
+ */
+static __isl_give isl_schedule_node *update_phase(
+	__isl_take isl_schedule_node *node, void *user)
+{
+	struct gpu_gen *gen = user;
+	int depth0, depth;
+	isl_ctx *ctx;
+	isl_id *id;
+	isl_bool empty_domain;
+	ppcg_ht_phase *phase;
+
+	empty_domain = has_empty_domain(node);
+	if (empty_domain < 0)
+		return isl_schedule_node_free(node);
+	if (empty_domain)
+		return node;
+
+	if (!node)
+		return NULL;
+	ctx = isl_schedule_node_get_ctx(node);
+
+	phase = ppcg_ht_phase_extract_from_mark(node);
+
+	depth0 = isl_schedule_node_get_tree_depth(node);
+
+	node = isl_schedule_node_child(node, 0);
+
+	node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_child(node, 0);
+	node = ppcg_ht_phase_shift_space_point(phase, node);
+	if (gen->options->unroll_gpu_tile)
+		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
+	id = isl_id_alloc(ctx, "thread", NULL);
+	node = isl_schedule_node_insert_mark(node, id);
+	node = isl_schedule_node_parent(node);
+	if (gen->options->unroll_gpu_tile)
+		node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll);
+	id = isl_id_alloc(ctx, "shared", NULL);
+	node = isl_schedule_node_insert_mark(node, id);
+	node = isl_schedule_node_parent(node);
+
+	node = gpu_create_kernel(gen, node, 0, NULL);
+
+	depth = isl_schedule_node_get_tree_depth(node);
+	node = isl_schedule_node_ancestor(node, depth - depth0);
+
+	return node;
+}
+
+/* Apply hybrid tiling on "node" and its parent based on the (valid)
+ * bounds on the relative dependence distances "bounds" and
+ * the tile sizes in "tile_sizes".
+ * The number of elements in "tile_sizes" is at least as large
+ * as the sum of the dimensions of the parent and the child node.
+ *
+ * Convert the tile_sizes to an isl_multi_val in the right space,
+ * insert the hybrid tiling and then create a kernel inside each phase.
+ * Finally, remove the phase marks.
+ */
+__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
+	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
+	int *tile_sizes)
+{
+	isl_multi_val *mv;
+	isl_space *space, *space2;
+
+	if (!node || !bounds)
+		goto error;
+
+	space2 = isl_schedule_node_band_get_space(node);
+	node = isl_schedule_node_parent(node);
+	space = isl_schedule_node_band_get_space(node);
+	space = isl_space_product(space, space2);
+	mv = ppcg_multi_val_from_int_list(space, tile_sizes);
+
+	node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options);
+
+	node = hybrid_tile_foreach_phase(node, &update_phase, gen);
+
+	node = hybrid_tile_drop_phase_marks(node);
+
+	return node;
+error:
+	isl_schedule_node_free(node);
+	ppcg_ht_bounds_free(bounds);
+	return NULL;
+}

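The mark-insertion pattern used by update_phase for the "thread" and
"shared" markers can be tried in isolation.  A minimal standalone sketch
on an invented one-statement schedule (not part of the patch):

#include <isl/ctx.h>
#include <isl/id.h>
#include <isl/schedule.h>
#include <isl/schedule_node.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_schedule *schedule;
	isl_schedule_node *node;
	isl_id *id;

	schedule = isl_schedule_read_from_str(ctx,
		"{ domain: \"{ S[i] : 0 <= i < 16 }\" }");
	node = isl_schedule_get_root(schedule);
	isl_schedule_free(schedule);
	node = isl_schedule_node_child(node, 0);
	/* Insert a mark node in front of the current position,
	 * just like the "thread" and "shared" marks above. */
	id = isl_id_alloc(ctx, "thread", NULL);
	node = isl_schedule_node_insert_mark(node, id);
	schedule = isl_schedule_node_get_schedule(node);
	isl_schedule_node_free(node);
	isl_schedule_dump(schedule);
	isl_schedule_free(schedule);
	isl_ctx_free(ctx);
	return 0;
}
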
Added: polly/trunk/lib/External/ppcg/gpu_hybrid.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_hybrid.h?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_hybrid.h (added)
+++ polly/trunk/lib/External/ppcg/gpu_hybrid.h Thu Jul 20 08:48:13 2017
@@ -0,0 +1,13 @@
+#ifndef GPU_HYBRID_H
+#define GPU_HYBRID_H
+
+#include <isl/schedule_node.h>
+
+#include "gpu.h"
+#include "hybrid.h"
+
+__isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen,
+	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds,
+	int *tile_sizes);
+
+#endif

Modified: polly/trunk/lib/External/ppcg/gpu_print.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_print.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_print.c (original)
+++ polly/trunk/lib/External/ppcg/gpu_print.c Thu Jul 20 08:48:13 2017
@@ -22,19 +22,19 @@ __isl_give isl_printer *gpu_print_local_
 	struct gpu_prog *prog)
 {
 	int i;
-	isl_ast_build *build;
 
 	if (!prog)
 		return isl_printer_free(p);
 
-	build = isl_ast_build_from_context(isl_set_copy(prog->scop->context));
 	for (i = 0; i < prog->n_array; ++i) {
-		if (!prog->array[i].declare_local)
+		struct gpu_array_info *array = &prog->array[i];
+		isl_ast_expr *size;
+
+		if (!array->declare_local)
 			continue;
-		p = ppcg_print_declaration(p, prog->scop->pet->arrays[i],
-					    build);
+		size = array->declared_size;
+		p = ppcg_print_declaration_with_size(p, array->type, size);
 	}
-	isl_ast_build_free(build);
 
 	return p;
 }
@@ -47,8 +47,12 @@ __isl_give isl_printer *gpu_array_info_p
 	int i;
 
 	for (i = 0; i < array->n_index; ++i) {
+		isl_ast_expr *bound;
+
 		prn = isl_printer_print_str(prn, "(");
-		prn = isl_printer_print_pw_aff(prn, array->bound[i]);
+		bound = isl_ast_expr_get_op_arg(array->bound_expr, 1 + i);
+		prn = isl_printer_print_ast_expr(prn, bound);
+		isl_ast_expr_free(bound);
 		prn = isl_printer_print_str(prn, ") * ");
 	}
 	prn = isl_printer_print_str(prn, "sizeof(");
@@ -63,18 +67,10 @@ __isl_give isl_printer *gpu_array_info_p
 static __isl_give isl_printer *print_non_linearized_declaration_argument(
 	__isl_take isl_printer *p, struct gpu_array_info *array)
 {
-	int i;
-
 	p = isl_printer_print_str(p, array->type);
 	p = isl_printer_print_str(p, " ");
 
-	p = isl_printer_print_str(p, array->name);
-
-	for (i = 0; i < array->n_index; i++) {
-		p = isl_printer_print_str(p, "[");
-		p = isl_printer_print_pw_aff(p, array->bound[i]);
-		p = isl_printer_print_str(p, "]");
-	}
+	p = isl_printer_print_ast_expr(p, array->bound_expr);
 
 	return p;
 }
@@ -136,16 +132,11 @@ static __isl_give isl_printer *stmt_prin
 /* Print an access to the element in the global memory copy
  * described by "stmt".  The index of the copy is recorded in
  * stmt->index as an access to the array.
- *
- * The copy in global memory has been linearized, so we need to take
- * the array size into account.
  */
 static __isl_give isl_printer *stmt_print_global_index(
 	__isl_take isl_printer *p, struct ppcg_kernel_stmt *stmt)
 {
-	int i;
 	struct gpu_array_info *array = stmt->u.c.array;
-	struct gpu_local_array_info *local = stmt->u.c.local_array;
 	isl_ast_expr *index;
 
 	if (gpu_array_is_scalar(array)) {
@@ -156,8 +147,6 @@ static __isl_give isl_printer *stmt_prin
 	}
 
 	index = isl_ast_expr_copy(stmt->u.c.index);
-	if (array->linearize)
-		index = gpu_local_array_info_linearize_index(local, index);
 
 	p = isl_printer_print_ast_expr(p, index);
 	isl_ast_expr_free(index);
@@ -200,6 +189,72 @@ __isl_give isl_printer *ppcg_kernel_prin
 	return pet_stmt_print_body(stmt->u.d.stmt->stmt, p, stmt->u.d.ref2expr);
 }
 
+/* This function is called for each node in a GPU AST.
+ * In case of a user node, print the macro definitions required
+ * for printing the AST expressions in the annotation, if any.
+ * For other nodes, return true such that descendants are also
+ * visited.
+ *
+ * In particular, for a kernel launch, print the macro definitions
+ * needed for the grid size.
+ * For a copy statement, print the macro definitions needed
+ * for the two index expressions.
+ * For an original user statement, print the macro definitions
+ * needed for the substitutions.
+ */
+static isl_bool at_node(__isl_keep isl_ast_node *node, void *user)
+{
+	const char *name;
+	isl_id *id;
+	int is_kernel;
+	struct ppcg_kernel *kernel;
+	struct ppcg_kernel_stmt *stmt;
+	isl_printer **p = user;
+
+	if (isl_ast_node_get_type(node) != isl_ast_node_user)
+		return isl_bool_true;
+
+	id = isl_ast_node_get_annotation(node);
+	if (!id)
+		return isl_bool_false;
+
+	name = isl_id_get_name(id);
+	if (!name)
+		return isl_bool_error;
+	is_kernel = !strcmp(name, "kernel");
+	kernel = is_kernel ? isl_id_get_user(id) : NULL;
+	stmt = is_kernel ? NULL : isl_id_get_user(id);
+	isl_id_free(id);
+
+	if ((is_kernel && !kernel) || (!is_kernel && !stmt))
+		return isl_bool_error;
+
+	if (is_kernel) {
+		*p = ppcg_ast_expr_print_macros(kernel->grid_size_expr, *p);
+	} else if (stmt->type == ppcg_kernel_copy) {
+		*p = ppcg_ast_expr_print_macros(stmt->u.c.index, *p);
+		*p = ppcg_ast_expr_print_macros(stmt->u.c.local_index, *p);
+	} else if (stmt->type == ppcg_kernel_domain) {
+		*p = ppcg_print_body_macros(*p, stmt->u.d.ref2expr);
+	}
+	if (!*p)
+		return isl_bool_error;
+
+	return isl_bool_false;
+}
+
+/* Print the required macros for the GPU AST "node" to "p",
+ * including those needed for the user statements inside the AST.
+ */
+__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
+	__isl_keep isl_ast_node *node)
+{
+	if (isl_ast_node_foreach_descendant_top_down(node, &at_node, &p) < 0)
+		return isl_printer_free(p);
+	p = ppcg_print_macros(p, node);
+	return p;
+}
+
 /* Was the definition of "type" printed before?
  * That is, does its name appear in the list of printed types "types"?
  */

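The at_node/gpu_print_macros pair above follows a generic isl traversal
pattern that can be exercised on its own.  A minimal standalone sketch
(the schedule string and the counting callback are invented for
illustration):

#include <stdio.h>
#include <isl/ctx.h>
#include <isl/schedule.h>
#include <isl/ast_build.h>
#include <isl/ast.h>

static isl_bool count_users(__isl_keep isl_ast_node *node, void *user)
{
	int *n = user;

	if (isl_ast_node_get_type(node) != isl_ast_node_user)
		return isl_bool_true;	/* keep descending */
	++*n;
	return isl_bool_false;	/* handled; skip the subtree */
}

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_schedule *schedule;
	isl_ast_build *build;
	isl_ast_node *ast;
	int n = 0;

	schedule = isl_schedule_read_from_str(ctx,
		"{ domain: \"{ S[i] : 0 <= i < 4 }\", "
		"child: { schedule: \"[{ S[i] -> [(i)] }]\" } }");
	build = isl_ast_build_alloc(ctx);
	ast = isl_ast_build_node_from_schedule(build, schedule);
	isl_ast_build_free(build);
	if (isl_ast_node_foreach_descendant_top_down(ast,
			&count_users, &n) < 0)
		n = -1;
	printf("user nodes: %d\n", n);
	isl_ast_node_free(ast);
	isl_ctx_free(ctx);
	return 0;
}
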
Modified: polly/trunk/lib/External/ppcg/gpu_print.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_print.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_print.h (original)
+++ polly/trunk/lib/External/ppcg/gpu_print.h Thu Jul 20 08:48:13 2017
@@ -9,6 +9,9 @@ __isl_give isl_printer *gpu_print_local_
 __isl_give isl_printer *gpu_print_types(__isl_take isl_printer *p,
 	struct gpu_types *types, struct gpu_prog *prog);
 
+__isl_give isl_printer *gpu_print_macros(__isl_take isl_printer *p,
+	__isl_keep isl_ast_node *node);
+
 __isl_give isl_printer *gpu_array_info_print_size(__isl_take isl_printer *prn,
 	struct gpu_array_info *array);
 __isl_give isl_printer *gpu_array_info_print_declaration_argument(

Modified: polly/trunk/lib/External/ppcg/gpu_tree.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_tree.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_tree.c (original)
+++ polly/trunk/lib/External/ppcg/gpu_tree.c Thu Jul 20 08:48:13 2017
@@ -63,6 +63,13 @@ int gpu_tree_node_is_kernel(__isl_keep i
 	return is_marked(node, "kernel");
 }
 
+/* Is "node" a mark node with an identifier called "shared"?
+ */
+static int node_is_shared(__isl_keep isl_schedule_node *node)
+{
+	return is_marked(node, "shared");
+}
+
 /* Is "node" a mark node with an identifier called "thread"?
  */
 static int node_is_thread(__isl_keep isl_schedule_node *node)
@@ -70,6 +77,77 @@ static int node_is_thread(__isl_keep isl
 	return is_marked(node, "thread");
 }
 
+/* Insert a mark node with identifier "shared" in front of "node".
+ */
+static __isl_give isl_schedule_node *insert_shared(
+	__isl_take isl_schedule_node *node)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+
+	ctx = isl_schedule_node_get_ctx(node);
+	id = isl_id_alloc(ctx, "shared", NULL);
+	node = isl_schedule_node_insert_mark(node, id);
+
+	return node;
+}
+
+/* Insert a "shared" mark in front of the "thread" mark
+ * provided the linear branch between "node" and the "thread" mark
+ * does not contain such a "shared" mark already.
+ *
+ * As a side effect, this function checks that the subtree at "node"
+ * actually contains a "thread" mark and that there is no branching
+ * in between "node" and this "thread" mark.
+ */
+__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
+	__isl_take isl_schedule_node *node)
+{
+	int depth0, depth;
+	int any_shared = 0;
+
+	if (!node)
+		return NULL;
+
+	depth0 = isl_schedule_node_get_tree_depth(node);
+
+	for (;;) {
+		int is_thread;
+		int n;
+
+		if (!any_shared) {
+			any_shared = node_is_shared(node);
+			if (any_shared < 0)
+				return isl_schedule_node_free(node);
+		}
+		is_thread = node_is_thread(node);
+		if (is_thread < 0)
+			return isl_schedule_node_free(node);
+		if (is_thread)
+			break;
+		n = isl_schedule_node_n_children(node);
+		if (n == 0)
+			isl_die(isl_schedule_node_get_ctx(node),
+				isl_error_invalid,
+				"no thread marker found",
+				return isl_schedule_node_free(node));
+		if (n > 1)
+			isl_die(isl_schedule_node_get_ctx(node),
+				isl_error_invalid,
+				"expecting single thread marker",
+				return isl_schedule_node_free(node));
+
+		node = isl_schedule_node_child(node, 0);
+	}
+
+	if (!any_shared)
+		node = insert_shared(node);
+	depth = isl_schedule_node_get_tree_depth(node);
+	node = isl_schedule_node_ancestor(node, depth - depth0);
+
+	return node;
+}
+
 /* Assuming "node" is a filter node, does it correspond to the branch
  * that contains the "thread" mark, i.e., does it contain any elements
  * in "core"?
@@ -127,6 +205,23 @@ static __isl_give isl_schedule_node *cor
 }
 
 /* Move down the branch between "kernel" and "thread" until
+ * the "shared" mark is reached, where the branch containing the "shared"
+ * mark is identified by the domain elements in "core".
+ */
+__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
+	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core)
+{
+	int is_shared;
+
+	while ((is_shared = node_is_shared(node)) == 0)
+		node = core_child(node, core);
+	if (is_shared < 0)
+		node = isl_schedule_node_free(node);
+
+	return node;
+}
+
+/* Move down the branch between "kernel" and "thread" until
  * the "thread" mark is reached, where the branch containing the "thread"
  * mark is identified by the domain elements in "core".
  */
@@ -189,7 +284,8 @@ __isl_give isl_schedule_node *gpu_tree_m
 	__isl_take isl_schedule_node *node, int depth,
 	__isl_keep isl_union_set *core)
 {
-	int is_thread;
+	int is_shared;
+	int is_thread = 0;
 
 	while (node && isl_schedule_node_get_schedule_depth(node) < depth) {
 		if (isl_schedule_node_get_type(node) ==
@@ -203,10 +299,11 @@ __isl_give isl_schedule_node *gpu_tree_m
 		}
 		node = core_child(node, core);
 	}
-	while ((is_thread = node_is_thread(node)) == 0 &&
+	while ((is_shared = node_is_shared(node)) == 0 &&
+	    (is_thread = node_is_thread(node)) == 0 &&
 	    isl_schedule_node_get_type(node) != isl_schedule_node_band)
 		node = core_child(node, core);
-	if (is_thread < 0)
+	if (is_shared < 0 || is_thread < 0)
 		node = isl_schedule_node_free(node);
 
 	return node;

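Both node_is_shared and node_is_thread are thin wrappers around the
pre-existing is_marked helper.  A sketch of such a test in terms of the
public isl API (sketch_is_marked is a hypothetical name for illustration,
not the helper from this file):

#include <string.h>
#include <isl/id.h>
#include <isl/schedule_node.h>

/* Is "node" a mark node whose identifier is called "name"?
 * Return -1 on error. */
static int sketch_is_marked(__isl_keep isl_schedule_node *node,
	const char *name)
{
	isl_id *id;
	int has_name;

	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
		return 0;
	id = isl_schedule_node_mark_get_id(node);
	if (!id)
		return -1;
	has_name = !strcmp(isl_id_get_name(id), name);
	isl_id_free(id);
	return has_name;
}
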
Modified: polly/trunk/lib/External/ppcg/gpu_tree.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/gpu_tree.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/gpu_tree.h (original)
+++ polly/trunk/lib/External/ppcg/gpu_tree.h Thu Jul 20 08:48:13 2017
@@ -5,7 +5,11 @@
 
 #include "gpu.h"
 
+__isl_give isl_schedule_node *gpu_tree_insert_shared_before_thread(
+	__isl_take isl_schedule_node *node);
 int gpu_tree_node_is_kernel(__isl_keep isl_schedule_node *node);
+__isl_give isl_schedule_node *gpu_tree_move_down_to_shared(
+	__isl_take isl_schedule_node *node, __isl_keep isl_union_set *core);
 __isl_give isl_schedule_node *gpu_tree_move_up_to_thread(
 	__isl_take isl_schedule_node *node);
 __isl_give isl_schedule_node *gpu_tree_move_down_to_thread(

Added: polly/trunk/lib/External/ppcg/grouping.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/grouping.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/grouping.c (added)
+++ polly/trunk/lib/External/ppcg/grouping.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,684 @@
+/*
+ * Copyright 2016      Sven Verdoolaege
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege.
+ */
+
+#include <isl/ctx.h>
+#include <isl/id.h>
+#include <isl/val.h>
+#include <isl/space.h>
+#include <isl/aff.h>
+#include <isl/set.h>
+#include <isl/map.h>
+#include <isl/union_set.h>
+#include <isl/union_map.h>
+#include <isl/schedule.h>
+#include <isl/schedule_node.h>
+
+#include "ppcg.h"
+
+/* Internal data structure for use during the detection of statements
+ * that can be grouped.
+ *
+ * "sc" contains the original schedule constraints (not a copy).
+ * "dep" contains the intersection of the validity and the proximity
+ * constraints in "sc".  It may be NULL if it has not been computed yet.
+ * "group_id" is the identifier for the next group that is extracted.
+ *
+ * "domain" is the set of statement instances that belong to any of the groups.
+ * "contraction" maps the elements of "domain" to the corresponding group
+ * instances.
+ * "schedule" schedules the statements in each group relatively to each other.
+ * These last three fields are NULL if no groups have been found so far.
+ */
+struct ppcg_grouping {
+	isl_schedule_constraints *sc;
+
+	isl_union_map *dep;
+	int group_id;
+
+	isl_union_set *domain;
+	isl_union_pw_multi_aff *contraction;
+	isl_schedule *schedule;
+};
+
+/* Clear all memory allocated by "grouping".
+ */
+static void ppcg_grouping_clear(struct ppcg_grouping *grouping)
+{
+	isl_union_map_free(grouping->dep);
+	isl_union_set_free(grouping->domain);
+	isl_union_pw_multi_aff_free(grouping->contraction);
+	isl_schedule_free(grouping->schedule);
+}
+
+/* Compute the intersection of the proximity and validity dependences
+ * in grouping->sc and store the result in grouping->dep, unless
+ * this intersection has been computed before.
+ */
+static isl_stat ppcg_grouping_compute_dep(struct ppcg_grouping *grouping)
+{
+	isl_union_map *validity, *proximity;
+
+	if (grouping->dep)
+		return isl_stat_ok;
+
+	validity = isl_schedule_constraints_get_validity(grouping->sc);
+	proximity = isl_schedule_constraints_get_proximity(grouping->sc);
+	grouping->dep = isl_union_map_intersect(validity, proximity);
+
+	if (!grouping->dep)
+		return isl_stat_error;
+
+	return isl_stat_ok;
+}
+
+/* Information extracted from one or more consecutive leaves
+ * in the input schedule.
+ *
+ * "list" contains the sets of statement instances in the leaves,
+ * one element in the list for each original leaf.
+ * "domain" contains the union of the sets in "list".
+ * "prefix" contains the prefix schedule of these elements.
+ */
+struct ppcg_grouping_leaf {
+	isl_union_set *domain;
+	isl_union_set_list *list;
+	isl_multi_union_pw_aff *prefix;
+};
+
+/* Free all memory allocated for "leaves".
+ */
+static void ppcg_grouping_leaf_free(int n, struct ppcg_grouping_leaf leaves[n])
+{
+	int i;
+
+	if (!leaves)
+		return;
+
+	for (i = 0; i < n; ++i) {
+		isl_union_set_free(leaves[i].domain);
+		isl_union_set_list_free(leaves[i].list);
+		isl_multi_union_pw_aff_free(leaves[i].prefix);
+	}
+
+	free(leaves);
+}
+
+/* Short-hand for retrieving the prefix schedule at "node"
+ * in the form of an isl_multi_union_pw_aff.
+ */
+static __isl_give isl_multi_union_pw_aff *get_prefix(
+	__isl_keep isl_schedule_node *node)
+{
+	return isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
+}
+
+/* Return an array of "n" elements with information extracted from
+ * the "n" children of "node" starting at "first", all of which
+ * are known to be filtered leaves.
+ */
+struct ppcg_grouping_leaf *extract_leaves(__isl_keep isl_schedule_node *node,
+	int first, int n)
+{
+	int i;
+	isl_ctx *ctx;
+	struct ppcg_grouping_leaf *leaves;
+
+	if (!node)
+		return NULL;
+
+	ctx = isl_schedule_node_get_ctx(node);
+	leaves = isl_calloc_array(ctx, struct ppcg_grouping_leaf, n);
+	if (!leaves)
+		return NULL;
+
+	for (i = 0; i < n; ++i) {
+		isl_schedule_node *child;
+		isl_union_set *domain;
+
+		child = isl_schedule_node_get_child(node, first + i);
+		child = isl_schedule_node_child(child, 0);
+		domain = isl_schedule_node_get_domain(child);
+		leaves[i].domain = isl_union_set_copy(domain);
+		leaves[i].list = isl_union_set_list_from_union_set(domain);
+		leaves[i].prefix = get_prefix(child);
+		isl_schedule_node_free(child);
+	}
+
+	return leaves;
+}
+
+/* Internal data structure used by merge_leaves.
+ *
+ * "src" and "dst" point to the two consecutive leaves that are
+ * under investigation for being merged.
+ * "merge" is initially set to 0 and is set to 1 as soon as
+ * it turns out that it is useful to merge the two leaves.
+ */
+struct ppcg_merge_leaves_data {
+	int merge;
+	struct ppcg_grouping_leaf *src;
+	struct ppcg_grouping_leaf *dst;
+};
+
+/* Given a relation "map" between instances of two statements A and B,
+ * does it relate every instance of A (according to the domain of "src")
+ * to every instance of B (according to the domain of "dst")?
+ */
+static isl_bool covers_src_and_dst(__isl_keep isl_map *map,
+	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
+{
+	isl_space *space;
+	isl_set *set1, *set2;
+	isl_bool is_subset;
+
+	space = isl_space_domain(isl_map_get_space(map));
+	set1 = isl_union_set_extract_set(src->domain, space);
+	set2 = isl_map_domain(isl_map_copy(map));
+	is_subset = isl_set_is_subset(set1, set2);
+	isl_set_free(set1);
+	isl_set_free(set2);
+	if (is_subset < 0 || !is_subset)
+		return is_subset;
+
+	space = isl_space_range(isl_map_get_space(map));
+	set1 = isl_union_set_extract_set(dst->domain, space);
+	set2 = isl_map_range(isl_map_copy(map));
+	is_subset = isl_set_is_subset(set1, set2);
+	isl_set_free(set1);
+	isl_set_free(set2);
+
+	return is_subset;
+}
+
+/* Given a relation "map" between instances of two statements A and B,
+ * are pairs of related instances executed together in the input schedule?
+ * That is, is each pair of instances assigned the same value
+ * by the corresponding prefix schedules?
+ *
+ * In particular, select the subset of "map" that has pairs of elements
+ * with the same value for the prefix schedules and then check
+ * if "map" is still a subset of the result.
+ */
+static isl_bool matches_prefix(__isl_keep isl_map *map,
+	struct ppcg_grouping_leaf *src, struct ppcg_grouping_leaf *dst)
+{
+	isl_union_map *umap, *equal;
+	isl_multi_union_pw_aff *src_prefix, *dst_prefix, *prefix;
+	isl_bool is_subset;
+
+	src_prefix = isl_multi_union_pw_aff_copy(src->prefix);
+	dst_prefix = isl_multi_union_pw_aff_copy(dst->prefix);
+	prefix = isl_multi_union_pw_aff_union_add(src_prefix, dst_prefix);
+
+	umap = isl_union_map_from_map(isl_map_copy(map));
+	equal = isl_union_map_copy(umap);
+	equal = isl_union_map_eq_at_multi_union_pw_aff(equal, prefix);
+
+	is_subset = isl_union_map_is_subset(umap, equal);
+
+	isl_union_map_free(umap);
+	isl_union_map_free(equal);
+
+	return is_subset;
+}
+
+/* Given a set of validity and proximity schedule constraints "map"
+ * between statements in consecutive leaves in a valid schedule,
+ * should the two leaves be merged into one?
+ *
+ * In particular, the two are merged if the constraints form
+ * a bijection between every instance of the first statement and
+ * every instance of the second statement.  Moreover, each
+ * pair of such dependent instances needs to be executed consecutively
+ * in the input schedule.  That is, they need to be assigned
+ * the same value by their prefix schedules.
+ *
+ * What this means is that for each instance of the first statement
+ * there is exactly one instance of the second statement that
+ * is executed immediately after the instance of the first statement and
+ * that, moreover, both depends on this statement instance and
+ * should be brought as close as possible to this statement instance.
+ * In other words, it is both possible to execute the two instances
+ * together (according to the input schedule) and desirable to do so
+ * (according to the validity and proximity schedule constraints).
+ */
+static isl_stat check_merge(__isl_take isl_map *map, void *user)
+{
+	struct ppcg_merge_leaves_data *data = user;
+	isl_bool ok;
+
+	ok = covers_src_and_dst(map, data->src, data->dst);
+	if (ok >= 0 && ok)
+		ok = isl_map_is_bijective(map);
+	if (ok >= 0 && ok)
+		ok = matches_prefix(map, data->src, data->dst);
+
+	isl_map_free(map);
+
+	if (ok < 0)
+		return isl_stat_error;
+	if (!ok)
+		return isl_stat_ok;
+
+	data->merge = 1;
+	return isl_stat_error;
+}
+
+/* Merge the leaves at position "pos" and "pos + 1" in "leaves".
+ */
+static isl_stat merge_pair(int n, struct ppcg_grouping_leaf leaves[n], int pos)
+{
+	int i;
+
+	leaves[pos].domain = isl_union_set_union(leaves[pos].domain,
+						leaves[pos + 1].domain);
+	leaves[pos].list = isl_union_set_list_concat(leaves[pos].list,
+						leaves[pos + 1].list);
+	leaves[pos].prefix = isl_multi_union_pw_aff_union_add(
+				leaves[pos].prefix, leaves[pos + 1].prefix);
+	for (i = pos + 1; i + 1 < n; ++i)
+		leaves[i] = leaves[i + 1];
+	leaves[n - 1].domain = NULL;
+	leaves[n - 1].list = NULL;
+	leaves[n - 1].prefix = NULL;
+
+	if (!leaves[pos].domain || !leaves[pos].list || !leaves[pos].prefix)
+		return isl_stat_error;
+
+	return isl_stat_ok;
+}
+
+/* Merge pairs of consecutive leaves in "leaves" taking into account
+ * the intersection of validity and proximity schedule constraints "dep".
+ *
+ * If a leaf has been merged with the next leaf, then the combination
+ * is checked again for merging with the next leaf.
+ * That is, if the leaves are A, B and C, then B may not have been
+ * merged with C, but after merging A and B, it could still be useful
+ * to merge the combination AB with C.
+ *
+ * Two leaves A and B are merged if there are instances of at least
+ * one pair of statements, one statement in A and one in B, such that
+ * the validity and proximity schedule constraints between them
+ * make them suitable for merging according to check_merge.
+ *
+ * Return the final number of leaves in the sequence, or -1 on error.
+ */
+static int merge_leaves(int n, struct ppcg_grouping_leaf leaves[n],
+	__isl_keep isl_union_map *dep)
+{
+	int i;
+	struct ppcg_merge_leaves_data data;
+
+	for (i = n - 1; i >= 0; --i) {
+		isl_union_map *dep_i;
+		isl_stat ok;
+
+		if (i + 1 >= n)
+			continue;
+
+		dep_i = isl_union_map_copy(dep);
+		dep_i = isl_union_map_intersect_domain(dep_i,
+				isl_union_set_copy(leaves[i].domain));
+		dep_i = isl_union_map_intersect_range(dep_i,
+				isl_union_set_copy(leaves[i + 1].domain));
+		data.merge = 0;
+		data.src = &leaves[i];
+		data.dst = &leaves[i + 1];
+		ok = isl_union_map_foreach_map(dep_i, &check_merge, &data);
+		isl_union_map_free(dep_i);
+		if (ok < 0 && !data.merge)
+			return -1;
+		if (!data.merge)
+			continue;
+		if (merge_pair(n, leaves, i) < 0)
+			return -1;
+		--n;
+		++i;
+	}
+
+	return n;
+}
+
+/* Construct a schedule with "domain" as domain, that executes
+ * the elements of "list" in order (as a sequence).
+ */
+static __isl_give isl_schedule *schedule_from_domain_and_list(
+	__isl_keep isl_union_set *domain, __isl_keep isl_union_set_list *list)
+{
+	isl_schedule *schedule;
+	isl_schedule_node *node;
+
+	schedule = isl_schedule_from_domain(isl_union_set_copy(domain));
+	node = isl_schedule_get_root(schedule);
+	isl_schedule_free(schedule);
+	node = isl_schedule_node_child(node, 0);
+	list = isl_union_set_list_copy(list);
+	node = isl_schedule_node_insert_sequence(node, list);
+	schedule = isl_schedule_node_get_schedule(node);
+	isl_schedule_node_free(node);
+
+	return schedule;
+}
+
+/* Construct a unique identifier for a group in "grouping".
+ *
+ * The name is of the form G_n, with n the first value starting at
+ * grouping->group_id that does not result in an identifier
+ * that is already in use in the domain of the original schedule
+ * constraints.
+ */
+static isl_id *construct_group_id(struct ppcg_grouping *grouping,
+	__isl_take isl_space *space)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+	isl_bool empty;
+	isl_union_set *domain;
+
+	if (!space)
+		return NULL;
+
+	ctx = isl_space_get_ctx(space);
+	domain = isl_schedule_constraints_get_domain(grouping->sc);
+
+	do {
+		char buffer[20];
+		isl_id *id;
+		isl_set *set;
+
+		snprintf(buffer, sizeof(buffer), "G_%d", grouping->group_id);
+		grouping->group_id++;
+		id = isl_id_alloc(ctx, buffer, NULL);
+		space = isl_space_set_tuple_id(space, isl_dim_set, id);
+		set = isl_union_set_extract_set(domain, isl_space_copy(space));
+		empty = isl_set_plain_is_empty(set);
+		isl_set_free(set);
+	} while (empty >= 0 && !empty);
+
+	if (empty < 0)
+		space = isl_space_free(space);
+
+	id = isl_space_get_tuple_id(space, isl_dim_set);
+
+	isl_space_free(space);
+	isl_union_set_free(domain);
+
+	return id;
+}
+
+/* Construct a contraction from "prefix" and "domain" for a new group
+ * in "grouping".
+ *
+ * The values of the prefix schedule "prefix" are used as instances
+ * of the new group.  The identifier of the group is constructed
+ * in such a way that it does not conflict with those of earlier
+ * groups nor with statements in the domain of the original
+ * schedule constraints.
+ * The isl_multi_union_pw_aff "prefix" then simply needs to be
+ * converted to an isl_union_pw_multi_aff.  However, this is not
+ * possible if "prefix" is zero-dimensional, so in this case,
+ * a contraction is constructed from "domain" instead.
+ */
+static isl_union_pw_multi_aff *group_contraction_from_prefix_and_domain(
+	struct ppcg_grouping *grouping,
+	__isl_keep isl_multi_union_pw_aff *prefix,
+	__isl_keep isl_union_set *domain)
+{
+	isl_id *id;
+	isl_space *space;
+	int dim;
+
+	space = isl_multi_union_pw_aff_get_space(prefix);
+	if (!space)
+		return NULL;
+	dim = isl_space_dim(space, isl_dim_set);
+	id = construct_group_id(grouping, space);
+	if (dim == 0) {
+		isl_multi_val *mv;
+
+		space = isl_multi_union_pw_aff_get_space(prefix);
+		space = isl_space_set_tuple_id(space, isl_dim_set, id);
+		mv = isl_multi_val_zero(space);
+		domain = isl_union_set_copy(domain);
+		return isl_union_pw_multi_aff_multi_val_on_domain(domain, mv);
+	}
+	prefix = isl_multi_union_pw_aff_copy(prefix);
+	prefix = isl_multi_union_pw_aff_set_tuple_id(prefix, isl_dim_out, id);
+	return isl_union_pw_multi_aff_from_multi_union_pw_aff(prefix);
+}
+
+/* Extend "grouping" with groups corresponding to merged
+ * leaves in the list of potentially merged leaves "leaves".
+ *
+ * The "list" field of each element in "leaves" contains a list
+ * of the instances sets of the original leaves that have been
+ * merged into this element.  If at least two of the original leaves
+ * have been merged into a given element, then add the corresponding
+ * group to "grouping".
+ * In particular, the domain is extended with the statement instances
+ * of the merged leaves, the contraction is extended with a mapping
+ * of these statement instances to instances of a new group and
+ * the schedule is extended with a schedule that executes
+ * the statement instances according to the order of the leaves
+ * in which they appear.
+ * Since the instances of the groups should already be scheduled apart
+ * in the schedule into which this schedule will be plugged,
+ * the schedules of the individual groups are combined independently
+ * of each other (as a set).
+ */
+static isl_stat add_groups(struct ppcg_grouping *grouping,
+	int n, struct ppcg_grouping_leaf leaves[n])
+{
+	int i;
+
+	for (i = 0; i < n; ++i) {
+		int n_leaf;
+		isl_schedule *schedule;
+		isl_union_set *domain;
+		isl_union_pw_multi_aff *upma;
+
+		n_leaf = isl_union_set_list_n_union_set(leaves[i].list);
+		if (n_leaf < 0)
+			return isl_stat_error;
+		if (n_leaf <= 1)
+			continue;
+		schedule = schedule_from_domain_and_list(leaves[i].domain,
+							leaves[i].list);
+		upma = group_contraction_from_prefix_and_domain(grouping,
+					leaves[i].prefix, leaves[i].domain);
+
+		domain = isl_union_set_copy(leaves[i].domain);
+		if (grouping->domain) {
+			domain = isl_union_set_union(domain, grouping->domain);
+			upma = isl_union_pw_multi_aff_union_add(upma,
+						grouping->contraction);
+			schedule = isl_schedule_set(schedule,
+						grouping->schedule);
+		}
+		grouping->domain = domain;
+		grouping->contraction = upma;
+		grouping->schedule = schedule;
+
+		if (!grouping->domain || !grouping->contraction ||
+		    !grouping->schedule)
+			return isl_stat_error;
+	}
+
+	return isl_stat_ok;
+}
+
+/* Look for any pairs of consecutive leaves among the "n" children of "node"
+ * starting at "first" that should be merged together.
+ * Store the results in "grouping".
+ *
+ * First make sure the intersection of validity and proximity
+ * schedule constraints is available and extract the required
+ * information from the "n" leaves.
+ * Then try and merge consecutive leaves based on the validity
+ * and proximity constraints.
+ * If any pairs were successfully merged, then add groups
+ * corresponding to the merged leaves to "grouping".
+ */
+static isl_stat group_subsequence(__isl_keep isl_schedule_node *node,
+	int first, int n, struct ppcg_grouping *grouping)
+{
+	int n_merge;
+	struct ppcg_grouping_leaf *leaves;
+
+	if (ppcg_grouping_compute_dep(grouping) < 0)
+		return isl_stat_error;
+
+	leaves = extract_leaves(node, first, n);
+	if (!leaves)
+		return isl_stat_error;
+
+	n_merge = merge_leaves(n, leaves, grouping->dep);
+	if (n_merge >= 0 && n_merge < n &&
+	    add_groups(grouping, n_merge, leaves) < 0)
+		return isl_stat_error;
+
+	ppcg_grouping_leaf_free(n, leaves);
+
+	return isl_stat_ok;
+}
+
+/* If "node" is a sequence, then check if it has any consecutive
+ * leaves that should be merged together and store the results
+ * in "grouping".
+ *
+ * In particular, call group_subsequence on each consecutive
+ * sequence of (filtered) leaves among the children of "node".
+ */
+static isl_bool detect_groups(__isl_keep isl_schedule_node *node, void *user)
+{
+	int i, n, first;
+	struct ppcg_grouping *grouping = user;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
+		return isl_bool_true;
+
+	n = isl_schedule_node_n_children(node);
+	if (n < 0)
+		return isl_bool_error;
+
+	first = -1;
+	for (i = 0; i < n; ++i) {
+		isl_schedule_node *child;
+		enum isl_schedule_node_type type;
+
+		child = isl_schedule_node_get_child(node, i);
+		child = isl_schedule_node_child(child, 0);
+		type = isl_schedule_node_get_type(child);
+		isl_schedule_node_free(child);
+
+		if (first >= 0 && type != isl_schedule_node_leaf) {
+			if (group_subsequence(node, first, i - first,
+						grouping) < 0)
+				return isl_bool_error;
+			first = -1;
+		}
+		if (first < 0 && type == isl_schedule_node_leaf)
+			first = i;
+	}
+	if (first >= 0) {
+		if (group_subsequence(node, first, n - first, grouping) < 0)
+			return isl_bool_error;
+	}
+
+	return isl_bool_true;
+}
+
+/* Complete "grouping" to cover all statement instances in the domain
+ * of grouping->sc.
+ *
+ * In particular, grouping->domain is set to the full set of statement
+ * instances; grouping->contraction is extended with an identity
+ * contraction on the additional instances and grouping->schedule
+ * is extended with an independent schedule on those additional instances.
+ * In the extension of grouping->contraction, the additional instances
+ * are split into those that belong to different statements and those
+ * that belong to some of the same statements.  The first group
+ * is replaced by its universe in order to simplify the contraction extension.
+ */
+static void complete_grouping(struct ppcg_grouping *grouping)
+{
+	isl_union_set *domain, *left, *overlap;
+	isl_union_pw_multi_aff *upma;
+	isl_schedule *schedule;
+
+	domain = isl_schedule_constraints_get_domain(grouping->sc);
+	left = isl_union_set_subtract(isl_union_set_copy(domain),
+				    isl_union_set_copy(grouping->domain));
+	schedule = isl_schedule_from_domain(isl_union_set_copy(left));
+	schedule = isl_schedule_set(schedule, grouping->schedule);
+	grouping->schedule = schedule;
+
+	overlap = isl_union_set_universe(grouping->domain);
+	grouping->domain = domain;
+	overlap = isl_union_set_intersect(isl_union_set_copy(left), overlap);
+	left = isl_union_set_subtract(left, isl_union_set_copy(overlap));
+	left = isl_union_set_universe(left);
+	left = isl_union_set_union(left, overlap);
+	upma = isl_union_set_identity_union_pw_multi_aff(left);
+	upma = isl_union_pw_multi_aff_union_add(upma, grouping->contraction);
+	grouping->contraction = upma;
+}
+
+/* Compute a schedule on the domain of "sc" that respects the schedule
+ * constraints in "sc".
+ *
+ * "schedule" is a known correct schedule that is used to combine
+ * groups of statements if options->group_chains is set.
+ * In particular, statements that are executed consecutively in a sequence
+ * in this schedule and where all instances of the second depend on
+ * the instance of the first that is executed in the same iteration
+ * of outer band nodes are grouped together into a single statement.
+ * The schedule constraints are then mapped to these groups of statements
+ * and the resulting schedule is expanded again to refer to the original
+ * statements.
+ */
+__isl_give isl_schedule *ppcg_compute_schedule(
+	__isl_take isl_schedule_constraints *sc,
+	__isl_keep isl_schedule *schedule, struct ppcg_options *options)
+{
+	struct ppcg_grouping grouping = { sc };
+	isl_union_pw_multi_aff *contraction;
+	isl_union_map *umap;
+	isl_schedule *res, *expansion;
+
+	if (!options->group_chains)
+		return isl_schedule_constraints_compute_schedule(sc);
+
+	grouping.group_id = 0;
+	if (isl_schedule_foreach_schedule_node_top_down(schedule,
+			&detect_groups, &grouping) < 0)
+		goto error;
+	if (!grouping.contraction) {
+		ppcg_grouping_clear(&grouping);
+		return isl_schedule_constraints_compute_schedule(sc);
+	}
+	complete_grouping(&grouping);
+	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
+	umap = isl_union_map_from_union_pw_multi_aff(contraction);
+
+	sc = isl_schedule_constraints_apply(sc, umap);
+
+	res = isl_schedule_constraints_compute_schedule(sc);
+
+	contraction = isl_union_pw_multi_aff_copy(grouping.contraction);
+	expansion = isl_schedule_copy(grouping.schedule);
+	res = isl_schedule_expand(res, contraction, expansion);
+
+	ppcg_grouping_clear(&grouping);
+	return res;
+error:
+	ppcg_grouping_clear(&grouping);
+	isl_schedule_constraints_free(sc);
+	return NULL;
+}

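The fallback path of ppcg_compute_schedule, taken when
options->group_chains is not set, is plain isl schedule computation.
A minimal standalone sketch of that path (the domain and dependence
strings are invented):

#include <isl/ctx.h>
#include <isl/union_set.h>
#include <isl/union_map.h>
#include <isl/schedule.h>

int main(void)
{
	isl_ctx *ctx = isl_ctx_alloc();
	isl_union_set *domain;
	isl_union_map *dep;
	isl_schedule_constraints *sc;
	isl_schedule *schedule;

	domain = isl_union_set_read_from_str(ctx,
		"{ A[i] : 0 <= i < 100; B[i] : 0 <= i < 100 }");
	dep = isl_union_map_read_from_str(ctx, "{ A[i] -> B[i] }");
	/* Use the dependences both as validity and as proximity
	 * constraints, as the grouping code does for its "dep" field. */
	sc = isl_schedule_constraints_on_domain(domain);
	sc = isl_schedule_constraints_set_validity(sc,
		isl_union_map_copy(dep));
	sc = isl_schedule_constraints_set_proximity(sc, dep);
	schedule = isl_schedule_constraints_compute_schedule(sc);
	isl_schedule_dump(schedule);
	isl_schedule_free(schedule);
	isl_ctx_free(ctx);
	return 0;
}
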
Added: polly/trunk/lib/External/ppcg/hybrid.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/hybrid.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/hybrid.c (added)
+++ polly/trunk/lib/External/ppcg/hybrid.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,2242 @@
+/*
+ * Copyright 2013      Ecole Normale Superieure
+ * Copyright 2015      Sven Verdoolaege
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege,
+ * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
+ */
+
+#include <string.h>
+
+#include <isl/space.h>
+#include <isl/constraint.h>
+#include <isl/val.h>
+#include <isl/aff.h>
+#include <isl/set.h>
+#include <isl/map.h>
+#include <isl/union_set.h>
+#include <isl/union_map.h>
+
+#include "hybrid.h"
+#include "schedule.h"
+
+/* The hybrid tiling implemented in this file is based on
+ * Grosser et al., "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+
+/* Bounds on relative dependence distances in input to hybrid tiling.
+ * upper is an upper bound on the relative dependence distances
+ * in the first space dimension
+ * -lower is a lower bound on the relative dependence distances
+ * in all space dimensions.
+ *
+ * In particular,
+ *
+ *	d_i >= -lower_i d_0
+ * and
+ *	d_1 <= upper d_0
+ *
+ * for each dependence distance vector d, where d_1 is the component
+ * corresponding to the first space dimension.
+ *
+ * upper and lower are always non-negative.
+ * Some of the values may be NaN if no bound could be found.
+ */
+struct ppcg_ht_bounds {
+	isl_val *upper;
+	isl_multi_val *lower;
+};
+
+/* Free "bounds" along with all its fields.
+ */
+__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
+	__isl_take ppcg_ht_bounds *bounds)
+{
+	if (!bounds)
+		return NULL;
+	isl_val_free(bounds->upper);
+	isl_multi_val_free(bounds->lower);
+	free(bounds);
+
+	return NULL;
+}
+
+/* Create a ppcg_ht_bounds object for a band living in "space".
+ * The bounds are initialized to NaN.
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_bounds_alloc(__isl_take isl_space *space)
+{
+	int i, n;
+	isl_ctx *ctx;
+	ppcg_ht_bounds *bounds;
+
+	if (!space)
+		return NULL;
+
+	ctx = isl_space_get_ctx(space);
+	bounds = isl_alloc_type(ctx, struct ppcg_ht_bounds);
+	if (!bounds)
+		goto error;
+	bounds->upper = isl_val_nan(ctx);
+	bounds->lower = isl_multi_val_zero(space);
+	n = isl_multi_val_dim(bounds->lower, isl_dim_set);
+	for (i = 0; i < n; ++i) {
+		isl_val *v = isl_val_copy(bounds->upper);
+		bounds->lower = isl_multi_val_set_val(bounds->lower, i, v);
+	}
+
+	if (!bounds->lower || !bounds->upper)
+		return ppcg_ht_bounds_free(bounds);
+
+	return bounds;
+error:
+	isl_space_free(space);
+	return NULL;
+}
+
+void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds)
+{
+	if (!bounds)
+		return;
+
+	fprintf(stderr, "lower: ");
+	isl_multi_val_dump(bounds->lower);
+	fprintf(stderr, "upper: ");
+	isl_val_dump(bounds->upper);
+}
+
+/* Return the upper bound on the relative dependence distances
+ * in the first space dimension.
+ */
+__isl_give isl_val *ppcg_ht_bounds_get_upper(__isl_keep ppcg_ht_bounds *bounds)
+{
+	if (!bounds)
+		return NULL;
+	return isl_val_copy(bounds->upper);
+}
+
+/* Replace the upper bound on the relative dependence distances
+ * in the first space dimension by "upper".
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_upper(
+	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_val *upper)
+{
+	if (!bounds || !upper)
+		goto error;
+	isl_val_free(bounds->upper);
+	bounds->upper = upper;
+	return bounds;
+error:
+	ppcg_ht_bounds_free(bounds);
+	isl_val_free(upper);
+	return NULL;
+}
+
+/* Return the lower bound on the relative dependence distances
+ * in space dimension "pos".
+ */
+__isl_give isl_val *ppcg_ht_bounds_get_lower(__isl_keep ppcg_ht_bounds *bounds,
+	int pos)
+{
+	if (!bounds)
+		return NULL;
+	return isl_multi_val_get_val(bounds->lower, pos);
+}
+
+/* Replace the lower bound on the relative dependence distances
+ * in space dimension "pos" by "lower".
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_bounds_set_lower(
+	__isl_take ppcg_ht_bounds *bounds, int pos, __isl_take isl_val *lower)
+{
+	if (!bounds || !lower)
+		goto error;
+	bounds->lower = isl_multi_val_set_val(bounds->lower, pos, lower);
+	if (!bounds->lower)
+		return ppcg_ht_bounds_free(bounds);
+	return bounds;
+error:
+	ppcg_ht_bounds_free(bounds);
+	isl_val_free(lower);
+	return NULL;
+}
+
+/* Can the bounds on relative dependence distances recorded in "bounds"
+ * be used to perform hybrid tiling?
+ * In particular, have appropriate lower and upper bounds been found?
+ * Any NaN indicates that no corresponding bound was found.
+ */
+isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds)
+{
+	isl_bool is_nan;
+	int i, n;
+
+	if (!bounds)
+		return isl_bool_error;
+	is_nan = isl_val_is_nan(bounds->upper);
+	if (is_nan < 0)
+		return isl_bool_error;
+	if (is_nan)
+		return isl_bool_false;
+
+	n = isl_multi_val_dim(bounds->lower, isl_dim_set);
+	for (i = 0; i < n; ++i) {
+		isl_val *v;
+
+		v = isl_multi_val_get_val(bounds->lower, i);
+		is_nan = isl_val_is_nan(v);
+		if (is_nan < 0)
+			return isl_bool_error;
+		if (is_nan)
+			return isl_bool_false;
+		isl_val_free(v);
+	}
+
+	return isl_bool_true;
+}
+
+/* Structure that represents the basic hexagonal tiling,
+ * along with information that is needed to perform the hybrid tiling.
+ *
+ * "bounds" are the bounds on the dependence distances that
+ * define the hexagonal shape and the required skewing in the remaining
+ * space dimensions.
+ *
+ * "input_node" points to the input pair of band nodes.
+ * "input_schedule" is the partial schedule of this input pair of band nodes.
+ * The space of this schedule is [P -> C], where P is the space
+ * of the parent node and C is the space of the child node.
+ *
+ * "space_sizes" represent the total size of a tile for the space
+ * dimensions, i.e., those corresponding to the child node.
+ * The space of "space_sizes" is C.
+ * If S_0 is the original tile size in the first space dimension,
+ * then the first entry of "space_sizes" is equal to
+ * W = 2*S_0 + floor(d_l h) + floor(d_u h).
+ * The remaining entries are the same as in the original tile sizes.
+ *
+ * The basic hexagonal tiling "hex" is defined
+ * in a "ts" (time-space) space and corresponds to the phase-1 tiles.
+ * "time_tile" maps the "ts" space to outer time tile.
+ * It is equal to ts[t, s] -> floor(t/(2 * S_t)), with S_t the original tile
+ * size corresponding to the parent node.
+ * "local_time" maps the "ts" space to the time dimension inside each tile.
+ * It is equal to ts[t, s] -> t mod (2 S_t), with S_t the original tile
+ * size corresponding to the parent node.
+ * "shift_space" shifts the tiles at time tile T = floor(t/(2 S_t))
+ * in the space dimension such that they align to a multiple of W.
+ * It is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W,
+ * with shift_s = S_0 + floor(d_u h).
+ * "shift_phase" is the shift taken to go from phase 0 to phase 1.
+ * It is equal to ts[t, s] -> ts[t + S_t, s + shift_s],
+ * with shift_s = S_0 + floor(d_u h).
+ *
+ * "project_ts" projects the space of the input schedule to the ts-space.
+ * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0].
+ */
+struct ppcg_ht_tiling {
+	int ref;
+
+	ppcg_ht_bounds *bounds;
+	isl_schedule_node *input_node;
+	isl_multi_union_pw_aff *input_schedule;
+
+	isl_multi_val *space_sizes;
+
+	isl_aff *time_tile;
+	isl_aff *local_time;
+	isl_aff *shift_space;
+	isl_multi_aff *shift_phase;
+	isl_set *hex;
+
+	isl_multi_aff *project_ts;
+};
+typedef struct ppcg_ht_tiling ppcg_ht_tiling;
+
+/* Return the space of the pair of band nodes that form the input
+ * to the hybrid tiling.
+ * In particular, return the space [P -> C], where P is the space
+ * of the parent node and C is the space of the child node.
+ */
+__isl_give isl_space *ppcg_ht_tiling_get_input_space(
+	__isl_keep ppcg_ht_tiling *tile)
+{
+	if (!tile)
+		return NULL;
+
+	return isl_multi_union_pw_aff_get_space(tile->input_schedule);
+}
+
+/* Remove a reference to "tile" and free "tile" along with all its fields
+ * as soon as the reference count drops to zero.
+ */
+static __isl_null ppcg_ht_tiling *ppcg_ht_tiling_free(
+	__isl_take ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+	if (--tiling->ref > 0)
+		return NULL;
+
+	ppcg_ht_bounds_free(tiling->bounds);
+	isl_schedule_node_free(tiling->input_node);
+	isl_multi_union_pw_aff_free(tiling->input_schedule);
+	isl_multi_val_free(tiling->space_sizes);
+	isl_aff_free(tiling->time_tile);
+	isl_aff_free(tiling->local_time);
+	isl_aff_free(tiling->shift_space);
+	isl_multi_aff_free(tiling->shift_phase);
+	isl_set_free(tiling->hex);
+	isl_multi_aff_free(tiling->project_ts);
+	free(tiling);
+
+	return NULL;
+}
+
+/* Return a new reference to "tiling".
+ */
+__isl_give ppcg_ht_tiling *ppcg_ht_tiling_copy(
+	__isl_keep ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+
+	tiling->ref++;
+	return tiling;
+}
+
+/* Return the isl_ctx to which "tiling" belongs.
+ */
+isl_ctx *ppcg_ht_tiling_get_ctx(__isl_keep ppcg_ht_tiling *tiling)
+{
+	if (!tiling)
+		return NULL;
+
+	return isl_multi_union_pw_aff_get_ctx(tiling->input_schedule);
+}
+
+/* Representation of one of the two phases of hybrid tiling.
+ *
+ * "tiling" points to the shared tiling data.
+ *
+ * "time_tile", "local_time" and "shift_space" are equal to the corresponding
+ * fields of "tiling", pulled back to the input space.
+ * In case of phase 0, these expressions have also been moved
+ * from phase 1 to phase 0.
+ *
+ * "domain" contains the hexagonal tiling of this phase.
+ *
+ * "space_shift" is the shift that should be added to the space band
+ * in order to be able to apply rectangular tiling to the space.
+ * For phase 1, it is equal to
+ *
+ *	[P[t] -> C[s_0, s_i]] -> C[(-(2 * shift_s)*T) % W, dl_i * u]
+ *
+ * with shift_s = S_0 + floor(d_u h),
+ * T equal to "time_tile" and u equal to "local_time".
+ * For phase 0, it is equal to
+ *
+ *	[P[t] -> C[s_0, s_i]] -> C[shift_s + (-(2 * shift_s)*T) % W, dl_i * u]
+ *
+ * "space_tile" is the space tiling.  It is equal to
+ *
+ *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
+ */
+struct ppcg_ht_phase {
+	ppcg_ht_tiling *tiling;
+
+	isl_aff *time_tile;
+	isl_aff *local_time;
+	isl_aff *shift_space;
+	isl_set *domain;
+
+	isl_multi_aff *space_shift;
+	isl_multi_aff *space_tile;
+};
+
+/* Free "phase" along with all its fields.
+ */
+static __isl_null ppcg_ht_phase *ppcg_ht_phase_free(
+	__isl_take ppcg_ht_phase *phase)
+{
+	if (!phase)
+		return NULL;
+
+	ppcg_ht_tiling_free(phase->tiling);
+	isl_aff_free(phase->time_tile);
+	isl_aff_free(phase->local_time);
+	isl_aff_free(phase->shift_space);
+	isl_set_free(phase->domain);
+	isl_multi_aff_free(phase->space_shift);
+	isl_multi_aff_free(phase->space_tile);
+	free(phase);
+
+	return NULL;
+}
+
+/* Wrapper around ppcg_ht_phase_free for use as an argument
+ * to isl_id_set_free_user.
+ */
+static void ppcg_ht_phase_free_wrap(void *user)
+{
+	ppcg_ht_phase *phase = user;
+
+	ppcg_ht_phase_free(phase);
+}
+
+/* Return the domain of hybrid tiling phase "phase".
+ */
+static __isl_give isl_set *ppcg_ht_phase_get_domain(ppcg_ht_phase *phase)
+{
+	if (!phase)
+		return NULL;
+
+	return isl_set_copy(phase->domain);
+}
+
+/* Return the space of the pair of band nodes that form the input
+ * to the hybrid tiling of which "phase" is a phase.
+ * In particular, return the space [P -> C], where P is the space
+ * of the parent node and C is the space of the child node.
+ */
+static __isl_give isl_space *ppcg_ht_phase_get_input_space(
+	__isl_keep ppcg_ht_phase *phase)
+{
+	if (!phase)
+		return NULL;
+
+	return ppcg_ht_tiling_get_input_space(phase->tiling);
+}
+
+/* Construct the lower left constraint of the hexagonal tile, i.e.,
+ *
+ *	du a - b <= (2h+1) du - duh
+ *	-du a + b + (2h+1) du - duh >= 0
+ *
+ * where duh = floor(du * h).
+ *
+ * This constraint corresponds to (6) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_lower_left(__isl_take isl_local_space *ls,
+	__isl_keep isl_val *h, __isl_keep isl_val *du, __isl_keep isl_val *duh)
+{
+	isl_val *v;
+	isl_aff *aff;
+
+	v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1);
+	v = isl_val_mul(v, isl_val_copy(du));
+	v = isl_val_sub(v, isl_val_copy(duh));
+	aff = isl_aff_val_on_domain(ls, v);
+	v = isl_val_neg(isl_val_copy(du));
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v);
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the lower constraint of the hexagonal tile, i.e.,
+ *
+ *	a <= 2h+1
+ *	-a + 2h+1 >= 0
+ *
+ * This constraint corresponds to (7) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_lower(__isl_take isl_local_space *ls,
+	__isl_keep isl_val *h)
+{
+	isl_val *v;
+	isl_aff *aff;
+
+	v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1);
+	aff = isl_aff_val_on_domain(ls, v);
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 0, -1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the lower right constraint of the hexagonal tile, i.e.,
+ *
+ *	dl a + b <= (2h+1) dl + duh + (s0-1)
+ *	-dl a - b + (2h+1) dl + duh + (s0-1) >= 0
+ *
+ * where duh = floor(du * h).
+ *
+ * This constraint corresponds to (8) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_lower_right(
+	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
+	__isl_keep isl_val *s0, __isl_keep isl_val *dl, __isl_keep isl_val *duh)
+{
+	isl_val *v;
+	isl_aff *aff;
+
+	v = isl_val_add_ui(isl_val_mul_ui(isl_val_copy(h), 2), 1);
+	v = isl_val_mul(v, isl_val_copy(dl));
+	v = isl_val_add(v, isl_val_copy(duh));
+	v = isl_val_add(v, isl_val_copy(s0));
+	v = isl_val_sub_ui(v, 1);
+	aff = isl_aff_val_on_domain(ls, v);
+	v = isl_val_neg(isl_val_copy(dl));
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, v);
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper left constraint of the hexagonal tile, i.e.,
+ *
+ *	dl a + b >= h dl - (d - 1)/d				with d = den(dl)
+ *	dl a + b - h dl + (d - 1)/d >= 0
+ *
+ * This constraint corresponds to (10) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper_left(__isl_take isl_local_space *ls,
+	__isl_keep isl_val *h, __isl_keep isl_val *dl)
+{
+	isl_val *v, *d;
+	isl_aff *aff;
+
+	d = isl_val_get_den_val(dl);
+	v = isl_val_sub_ui(isl_val_copy(d), 1);
+	v = isl_val_div(v, d);
+	v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(dl)));
+	aff = isl_aff_val_on_domain(ls, v);
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(dl));
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, 1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper right constraint of the hexagonal tile, i.e.,
+ *
+ *	du a - b >= du h - duh - (s0-1) - dlh - (d - 1)/d	with d = den(du)
+ *	du a - b - du h + duh + (s0-1) + dlh + (d - 1)/d >= 0
+ *
+ * where dlh = floor(dl * h) and duh = floor(du * h).
+ *
+ * This constraint corresponds to (12) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper_right(
+	__isl_take isl_local_space *ls, __isl_keep isl_val *h,
+	__isl_keep isl_val *s0, __isl_keep isl_val *du,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_val *v, *d;
+	isl_aff *aff;
+
+	d = isl_val_get_den_val(du);
+	v = isl_val_sub_ui(isl_val_copy(d), 1);
+	v = isl_val_div(v, d);
+	v = isl_val_sub(v, isl_val_mul(isl_val_copy(h), isl_val_copy(du)));
+	v = isl_val_add(v, isl_val_copy(duh));
+	v = isl_val_add(v, isl_val_copy(dlh));
+	v = isl_val_add(v, isl_val_copy(s0));
+	v = isl_val_sub_ui(v, 1);
+	aff = isl_aff_val_on_domain(ls, v);
+	aff = isl_aff_set_coefficient_val(aff, isl_dim_in, 0, isl_val_copy(du));
+	aff = isl_aff_set_coefficient_si(aff, isl_dim_in, 1, -1);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the upper constraint of the hexagonal tile, i.e.,
+ *
+ *	a >= 0
+ *
+ * This constraint corresponds to (13) in
+ * "Hybrid Hexagonal/Classical Tiling for GPUs".
+ */
+static __isl_give isl_constraint *hex_upper(__isl_take isl_local_space *ls)
+{
+	isl_aff *aff;
+
+	aff = isl_aff_var_on_domain(ls, isl_dim_set, 0);
+
+	return isl_inequality_from_aff(aff);
+}
+
+/* Construct the basic hexagonal tile shape.
+ * "space" is the 2D space in which the hexagon should be constructed.
+ * h is st-1, with st the tile size in the time dimension
+ * s0 is the tile size in the space dimension
+ * dl is a bound on the negative relative dependence distances, i.e.,
+ *
+ *	d_s >= -dl d_t
+ *
+ * du is a bound on the positive relative dependence distances, i.e.,
+ *
+ *	d_s <= du d_t
+ *
+ * with (d_t,d_s) any dependence distance vector.
+ * dlh = floor(dl * h)
+ * duh = floor(du * h)
+ *
+ * The shape of the hexagon is as follows:
+ *
+ *		0 dlh   dlh+s0-1
+ *		   ______                __
+ * 0		  /      \_             /
+ *		 /         \_          /
+ * h		/            \ ______ /
+ * h+1		\_           //      \\_
+ *		  \_        //         \\_
+ * 2h+1		    \______//            \\
+ *		0   duh   duh+s0-1
+ *		             duh+s0-1+dlh
+ *		                  duh+s0-1+dlh+1+s0+1
+ *
+ * The next hexagon is shifted by duh + dlh + 2 * s0.
+ *
+ * The slope of the "/" constraints is dl.
+ * The slope of the "\_" constraints is du.
+ */
+static __isl_give isl_set *compute_hexagon(__isl_take isl_space *space,
+	__isl_keep isl_val *h, __isl_keep isl_val *s0,
+	__isl_keep isl_val *dl, __isl_keep isl_val *du,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_local_space *ls;
+	isl_constraint *c;
+	isl_basic_set *bset;
+
+	ls = isl_local_space_from_space(space);
+
+	c = hex_lower_left(isl_local_space_copy(ls), h, du, duh);
+	bset = isl_basic_set_from_constraint(c);
+
+	c = hex_lower(isl_local_space_copy(ls), h);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_lower_right(isl_local_space_copy(ls), h, s0, dl, duh);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper_left(isl_local_space_copy(ls), h, dl);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper_right(isl_local_space_copy(ls), h, s0, du, dlh, duh);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	c = hex_upper(ls);
+	bset = isl_basic_set_add_constraint(bset, c);
+
+	return isl_set_from_basic_set(bset);
+}
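+
+/* Worked example (hypothetical parameters, not taken from any input
+ * program): for st = 2 (so h = 1), s0 = 2 and dl = du = 1
+ * (so dlh = duh = 1), the six constraints above specialize to
+ *
+ *	0 <= a <= 3,  a - 2 <= b,  1 - a <= b,  b <= 5 - a,  b <= a + 2
+ *
+ * a hexagon containing 2 + 4 + 4 + 2 = 12 lattice points, half of
+ * W * 2 * st = 6 * 4 = 24, as expected for one of the two phases.
+ */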
+
+/* Name of the ts-space.
+ */
+static const char *ts_space_name = "ts";
+
+/* Construct and return the space ts[t, s].
+ */
+static __isl_give isl_space *construct_ts_space(isl_ctx *ctx)
+{
+	isl_space *s;
+
+	s = isl_space_set_alloc(ctx, 0, 2);
+	s = isl_space_set_tuple_name(s, isl_dim_set, ts_space_name);
+
+	return s;
+}
+
+/* Name of the local ts-space.
+ */
+static const char *local_ts_space_name = "local_ts";
+
+/* Construct and return the space local_ts[t, s].
+ */
+static __isl_give isl_space *construct_local_ts_space(isl_ctx *ctx)
+{
+	isl_space *s;
+
+	s = isl_space_set_alloc(ctx, 0, 2);
+	s = isl_space_set_tuple_name(s, isl_dim_set, local_ts_space_name);
+
+	return s;
+}
+
+/* Compute the total size of a tile for the space dimensions,
+ * i.e., those corresponding to the child node
+ * of the input pattern.
+ * If S_0 is the original tile size in the first space dimension,
+ * then the first entry of "space_sizes" is equal to
+ * W = 2*S_0 + floor(d_l h) + floor(d_u h).
+ * The remaining entries are the same as in the original tile sizes.
+ * "tile_sizes" contains the original tile sizes, including
+ * the tile size corresponding to the parent node.
+ * "dlh" is equal to floor(d_l h).
+ * "duh" is equal to floor(d_u h).
+ */
+static __isl_give isl_multi_val *compute_space_sizes(
+	__isl_keep isl_multi_val *tile_sizes,
+	__isl_keep isl_val *dlh, __isl_keep isl_val *duh)
+{
+	isl_val *size;
+	isl_multi_val *space_sizes;
+
+	space_sizes = isl_multi_val_copy(tile_sizes);
+	space_sizes = isl_multi_val_factor_range(space_sizes);
+	size = isl_multi_val_get_val(space_sizes, 0);
+	size = isl_val_mul_ui(size, 2);
+	size = isl_val_add(size, isl_val_copy(duh));
+	size = isl_val_add(size, isl_val_copy(dlh));
+	space_sizes = isl_multi_val_set_val(space_sizes, 0, size);
+
+	return space_sizes;
+}
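+
+/* For instance (hypothetical sizes): with tile_sizes = [2, 2, 8],
+ * i.e., st = 2, s0 = 2 and a second space size of 8, and with
+ * dlh = duh = 1, the result is [2*2 + 1 + 1, 8] = [6, 8].
+ */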
+
+/* Compute the offset of phase 1 with respect to phase 0
+ * in the ts-space ("space").
+ * In particular, return
+ *
+ *	ts[st, s0 + duh]
+ */
+static __isl_give isl_multi_val *compute_phase_shift(
+	__isl_keep isl_space *space, __isl_keep isl_val *st,
+	__isl_keep isl_val *s0, __isl_keep isl_val *duh)
+{
+	isl_val *v;
+	isl_multi_val *phase_shift;
+
+	phase_shift = isl_multi_val_zero(isl_space_copy(space));
+	phase_shift = isl_multi_val_set_val(phase_shift, 0, isl_val_copy(st));
+	v = isl_val_add(isl_val_copy(duh), isl_val_copy(s0));
+	phase_shift = isl_multi_val_set_val(phase_shift, 1, v);
+
+	return phase_shift;
+}
+
+/* Return the function
+ *
+ *	ts[t, s] -> floor(t/(2 * st))
+ *
+ * representing the time tile.
+ * "space" is the space ts[t, s].
+ */
+static __isl_give isl_aff *compute_time_tile(__isl_keep isl_space *space,
+	__isl_keep isl_val *st)
+{
+	isl_val *v;
+	isl_aff *t;
+	isl_local_space *ls;
+
+	ls = isl_local_space_from_space(isl_space_copy(space));
+	t = isl_aff_var_on_domain(ls, isl_dim_set, 0);
+	v = isl_val_mul_ui(isl_val_copy(st), 2);
+	t = isl_aff_floor(isl_aff_scale_down_val(t, v));
+
+	return t;
+}
+
+/* Compute a shift in the space dimension for tiles
+ * at time tile T = floor(t/(2 * S_t))
+ * such that they align to a multiple of the total space tile size W.
+ * In particular, compute
+ *
+ *	ts[t, s] -> s + (-(2 * shift_s)*T) % W
+ *
+ * where shift_s is the shift of phase 1 with respect to phase 0
+ * in the space dimension (the second element of "phase_shift").
+ * W is stored in the first element of "space_sizes".
+ * "time_tile" is the function
+ *
+ *	ts[t, s] -> floor(t/(2 * S_t))
+ *
+ * Since phase 1 is shifted by shift_s with respect to phase 0,
+ * the next line of phase 0 (at T+1) is shifted by 2*shift_s
+ * with respect to the previous line (at T).
+ * A shift of -(2 * shift_s)*T therefore allows the basic pattern
+ * (which starts at 0) to be applied.
+ * However, this shift will be used to obtain the tile coordinate
+ * in the first space dimension and if the original values
+ * in the space dimension are non-negative, then the shift should
+ * not make them negative.  Moreover, the shift should be as small
+ * as possible.
+ * Since the pattern repeats itself with a period of W in the space
+ * dimension, the shift can be replaced by (-(2 * shift_s)*T) % W.
+ */
+static __isl_give isl_aff *compute_shift_space(__isl_keep isl_aff *time_tile,
+	__isl_keep isl_multi_val *space_sizes,
+	__isl_keep isl_multi_val *phase_shift)
+{
+	isl_val *v;
+	isl_aff *s, *t;
+	isl_local_space *ls;
+
+	ls = isl_local_space_from_space(isl_aff_get_domain_space(time_tile));
+	t = isl_aff_copy(time_tile);
+	v = isl_val_mul_ui(isl_multi_val_get_val(phase_shift, 1), 2);
+	v = isl_val_neg(v);
+	t = isl_aff_scale_val(t, v);
+	v = isl_multi_val_get_val(space_sizes, 0);
+	t = isl_aff_mod_val(t, v);
+	s = isl_aff_var_on_domain(ls, isl_dim_set, 1);
+	s = isl_aff_add(s, t);
+
+	return s;
+}
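+
+/* For instance (hypothetical values: st = 2, s0 = 2, dl = 1, du = 0,
+ * hence dlh = 1 and duh = 0): shift_s = s0 + duh = 2 and
+ * W = 2*2 + 1 + 0 = 5, so the shift added to s is (-4*T) % 5,
+ * i.e., 0, 1, 2, 3, 4 for T = 0, 1, 2, 3, 4, repeating with period 5.
+ */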
+
+/* Given the phase_shift ts[S_t, S_0 + floor(d_u h)],
+ * compute a function that applies the shift, i.e.,
+ *
+ *	ts[t, s] -> ts[t + S_t, s + S_0 + floor(d_u h)]
+ */
+static __isl_give isl_multi_aff *compute_shift_phase(
+	__isl_keep isl_multi_val *phase_shift)
+{
+	isl_space *space;
+	isl_multi_aff *shift;
+
+	space = isl_multi_val_get_space(phase_shift);
+	shift = isl_multi_aff_multi_val_on_space(space,
+					isl_multi_val_copy(phase_shift));
+	space = isl_multi_aff_get_space(shift);
+	shift = isl_multi_aff_add(shift, isl_multi_aff_identity(space));
+
+	return shift;
+}
+
+/* Compute a mapping from the ts-space to the local coordinates
+ * within each tile.  In particular, compute
+ *
+ *	ts[t, s] -> local_ts[t % (2 S_t), (s + (-(2 * shift_s)*T) % W) % W]
+ *
+ * "ts" is the space ts[t, s]
+ * "local_ts" is the space local_ts[t, s]
+ * "shift_space" is equal to ts[t, s] -> s + (-(2 * shift_s)*T) % W
+ * "st" is the tile size in the time dimension S_t.
+ * The first element of "space_sizes" is equal to W.
+ */
+static __isl_give isl_multi_aff *compute_localize(
+	__isl_keep isl_space *local_ts, __isl_keep isl_aff *shift_space,
+	__isl_keep isl_val *st, __isl_keep isl_multi_val *space_sizes)
+{
+	isl_val *v;
+	isl_space *space;
+	isl_aff *s, *t;
+	isl_multi_aff *localize;
+
+	space = isl_aff_get_domain_space(shift_space);
+	local_ts = isl_space_copy(local_ts);
+	space = isl_space_map_from_domain_and_range(space, local_ts);
+	localize = isl_multi_aff_identity(space);
+	t = isl_multi_aff_get_aff(localize, 0);
+	v = isl_val_mul_ui(isl_val_copy(st), 2);
+	t = isl_aff_mod_val(t, v);
+	localize = isl_multi_aff_set_aff(localize, 0, t);
+	s = isl_aff_copy(shift_space);
+	v = isl_multi_val_get_val(space_sizes, 0);
+	s = isl_aff_mod_val(s, v);
+	localize = isl_multi_aff_set_aff(localize, 1, s);
+
+	return localize;
+}
+
+/* Set the project_ts field of "tiling".
+ *
+ * This field projects the space of the input schedule to the ts-space.
+ * It is equal to [P[t] -> C[s_0, ...]] -> ts[t, s_0].
+ */
+static __isl_give ppcg_ht_tiling *ppcg_ht_tiling_set_project_ts(
+	__isl_take ppcg_ht_tiling *tiling)
+{
+	int n;
+	isl_space *space;
+	isl_multi_aff *project;
+
+	if (!tiling)
+		return NULL;
+
+	space = ppcg_ht_tiling_get_input_space(tiling);
+	n = isl_space_dim(space, isl_dim_set);
+	project = isl_multi_aff_project_out_map(space, isl_dim_set, 2, n - 2);
+	project = isl_multi_aff_set_tuple_name(project,
+						isl_dim_out, ts_space_name);
+	if (!project)
+		return ppcg_ht_tiling_free(tiling);
+
+	tiling->project_ts = project;
+
+	return tiling;
+}
+
+/* Construct a hybrid tiling description from bounds on the dependence
+ * distances "bounds".
+ * "input_node" points to the original parent node.
+ * "input_schedule" is the combined schedule of the parent and child
+ * node in the input.
+ * "tile_sizes" are the original, user specified tile sizes.
+ */
+static __isl_give ppcg_ht_tiling *ppcg_ht_bounds_construct_tiling(
+	__isl_take ppcg_ht_bounds *bounds,
+	__isl_keep isl_schedule_node *input_node,
+	__isl_keep isl_multi_union_pw_aff *input_schedule,
+	__isl_keep isl_multi_val *tile_sizes)
+{
+	isl_ctx *ctx;
+	ppcg_ht_tiling *tiling;
+	isl_multi_val *space_sizes, *phase_shift;
+	isl_aff *time_tile, *shift_space;
+	isl_multi_aff *localize;
+	isl_val *h, *duh, *dlh;
+	isl_val *st, *s0, *du, *dl;
+	isl_space *ts, *local_ts;
+
+	if (!bounds || !input_node || !input_schedule || !tile_sizes)
+		goto error;
+
+	ctx = isl_multi_union_pw_aff_get_ctx(input_schedule);
+	tiling = isl_calloc_type(ctx, struct ppcg_ht_tiling);
+	if (!tiling)
+		goto error;
+	tiling->ref = 1;
+
+	st = isl_multi_val_get_val(tile_sizes, 0);
+	h = isl_val_sub_ui(isl_val_copy(st), 1);
+	s0 = isl_multi_val_get_val(tile_sizes, 1);
+	du = ppcg_ht_bounds_get_upper(bounds);
+	dl = ppcg_ht_bounds_get_lower(bounds, 0);
+
+	duh = isl_val_floor(isl_val_mul(isl_val_copy(du), isl_val_copy(h)));
+	dlh = isl_val_floor(isl_val_mul(isl_val_copy(dl), isl_val_copy(h)));
+
+	ts = construct_ts_space(ctx);
+	local_ts = construct_local_ts_space(ctx);
+
+	space_sizes = compute_space_sizes(tile_sizes, dlh, duh);
+	phase_shift = compute_phase_shift(ts, st, s0, duh);
+	time_tile = compute_time_tile(ts, st);
+	shift_space = compute_shift_space(time_tile, space_sizes, phase_shift);
+	localize = compute_localize(local_ts, shift_space, st, space_sizes);
+	isl_space_free(ts);
+
+	tiling->input_node = isl_schedule_node_copy(input_node);
+	tiling->input_schedule = isl_multi_union_pw_aff_copy(input_schedule);
+	tiling->space_sizes = space_sizes;
+	tiling->bounds = bounds;
+	tiling->local_time = isl_multi_aff_get_aff(localize, 0);
+	tiling->hex = compute_hexagon(local_ts, h, s0, dl, du, dlh, duh);
+	tiling->hex = isl_set_preimage_multi_aff(tiling->hex, localize);
+	tiling->time_tile = time_tile;
+	tiling->shift_space = shift_space;
+	tiling->shift_phase = compute_shift_phase(phase_shift);
+	isl_multi_val_free(phase_shift);
+
+	isl_val_free(duh);
+	isl_val_free(dlh);
+	isl_val_free(du);
+	isl_val_free(dl);
+	isl_val_free(s0);
+	isl_val_free(st);
+	isl_val_free(h);
+
+	if (!tiling->input_schedule || !tiling->local_time || !tiling->hex ||
+	    !tiling->shift_space || !tiling->shift_phase)
+		return ppcg_ht_tiling_free(tiling);
+
+	tiling = ppcg_ht_tiling_set_project_ts(tiling);
+
+	return tiling;
+error:
+	ppcg_ht_bounds_free(bounds);
+	return NULL;
+}
+
+/* Are all members of the band node "node" coincident?
+ */
+static isl_bool all_coincident(__isl_keep isl_schedule_node *node)
+{
+	int i, n;
+
+	n = isl_schedule_node_band_n_member(node);
+	for (i = 0; i < n; ++i) {
+		isl_bool c;
+
+		c = isl_schedule_node_band_member_get_coincident(node, i);
+		if (c < 0 || !c)
+			return c;
+	}
+
+	return isl_bool_true;
+}
+
+/* Does "node" satisfy the properties of the inner node in the input
+ * pattern for hybrid tiling?
+ * That is, is it a band node with only coincident members, of which
+ * there is at least one?
+ */
+static isl_bool has_child_properties(__isl_keep isl_schedule_node *node)
+{
+	if (!node)
+		return isl_bool_error;
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return isl_bool_false;
+	if (isl_schedule_node_band_n_member(node) < 1)
+		return isl_bool_false;
+	return all_coincident(node);
+}
+
+/* Does "node" satisfy the properties of the outer node in the input
+ * pattern for hybrid tiling?
+ * That is, is it a band node with a single member?
+ */
+static isl_bool has_parent_properties(__isl_keep isl_schedule_node *node)
+{
+	if (!node)
+		return isl_bool_error;
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_band)
+		return isl_bool_false;
+	if (isl_schedule_node_band_n_member(node) != 1)
+		return isl_bool_false;
+	return isl_bool_true;
+}
+
+/* Does the parent of "node" satisfy the input patttern for hybrid tiling?
+ * That is, does "node" satisfy the properties of the inner node and
+ * does the parent of "node" satisfy the properties of the outer node?
+ */
+isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = has_child_properties(node);
+	if (has_pattern < 0 || !has_pattern)
+		return has_pattern;
+
+	node = isl_schedule_node_copy(node);
+	node = isl_schedule_node_parent(node);
+	has_pattern = has_parent_properties(node);
+	isl_schedule_node_free(node);
+
+	return has_pattern;
+}
+
+/* Does "node" satisfy the input patttern for hybrid tiling?
+ * That is, does "node" satisfy the properties of the outer node and
+ * does the child of "node" satisfy the properties of the inner node?
+ */
+isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = has_parent_properties(node);
+	if (has_pattern < 0 || !has_pattern)
+		return has_pattern;
+
+	node = isl_schedule_node_get_child(node, 0);
+	has_pattern = has_child_properties(node);
+	isl_schedule_node_free(node);
+
+	return has_pattern;
+}
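+
+/* Schematically (illustration): a subtree of the form
+ *
+ *	band P[i]		(single member)
+ *	  band C[j, k]		(all members coincident)
+ *	    ...
+ *
+ * satisfies the pattern, whereas a parent band with two members or
+ * a child band with a non-coincident member does not.
+ */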
+
+/* Check that "node" satisfies the input pattern for hybrid tiling.
+ * Error out if it does not.
+ */
+static isl_stat check_input_pattern(__isl_keep isl_schedule_node *node)
+{
+	isl_bool has_pattern;
+
+	has_pattern = ppcg_ht_has_input_pattern(node);
+	if (has_pattern < 0)
+		return isl_stat_error;
+	if (!has_pattern)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_invalid,
+			"invalid input pattern for hybrid tiling",
+			return isl_stat_error);
+
+	return isl_stat_ok;
+}
+
+/* Extract the input schedule from "node", i.e., the product
+ * of the partial schedules of the parent and child nodes
+ * in the input pattern.
+ */
+static __isl_give isl_multi_union_pw_aff *extract_input_schedule(
+	__isl_keep isl_schedule_node *node)
+{
+	isl_multi_union_pw_aff *partial, *partial2;
+
+	partial = isl_schedule_node_band_get_partial_schedule(node);
+	node = isl_schedule_node_get_child(node, 0);
+	partial2 = isl_schedule_node_band_get_partial_schedule(node);
+	isl_schedule_node_free(node);
+
+	return isl_multi_union_pw_aff_range_product(partial, partial2);
+}
+
+/* Collect all dependences from "scop" that are relevant for performing
+ * hybrid tiling on "node" and its child and map them to the schedule
+ * space of this pair of nodes.
+ *
+ * In case live range reordering is not used,
+ * the flow and the false dependences are collected.
+ * In case live range reordering is used,
+ * the flow and the forced dependences are collected, as well
+ * as the order dependences that are adjacent to non-local
+ * flow dependences.
+ *
+ * In all cases, only dependences that map to the same instance
+ * of the outer part of the schedule are considered.
+ */
+static __isl_give isl_map *collect_deps(struct ppcg_scop *scop,
+	__isl_keep isl_schedule_node *node)
+{
+	isl_space *space;
+	isl_multi_union_pw_aff *prefix, *partial;
+	isl_union_map *flow, *other, *dep, *umap;
+	isl_map *map;
+
+	prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
+	partial = extract_input_schedule(node);
+	space = isl_multi_union_pw_aff_get_space(partial);
+
+	flow = isl_union_map_copy(scop->dep_flow);
+	flow = isl_union_map_eq_at_multi_union_pw_aff(flow,
+					isl_multi_union_pw_aff_copy(prefix));
+	if (!scop->options->live_range_reordering) {
+		other = isl_union_map_copy(scop->dep_false);
+		other = isl_union_map_eq_at_multi_union_pw_aff(other, prefix);
+	} else {
+		isl_union_map *local, *non_local, *order, *adj;
+		isl_union_set *domain, *range;
+
+		other = isl_union_map_copy(scop->dep_forced);
+		other = isl_union_map_eq_at_multi_union_pw_aff(other,
+					isl_multi_union_pw_aff_copy(prefix));
+		local = isl_union_map_copy(flow);
+		local = isl_union_map_eq_at_multi_union_pw_aff(local,
+					isl_multi_union_pw_aff_copy(partial));
+		non_local = isl_union_map_copy(flow);
+		non_local = isl_union_map_subtract(non_local, local);
+
+		order = isl_union_map_copy(scop->dep_order);
+		order = isl_union_map_eq_at_multi_union_pw_aff(order, prefix);
+		adj = isl_union_map_copy(order);
+		domain = isl_union_map_domain(isl_union_map_copy(non_local));
+		domain = isl_union_set_coalesce(domain);
+		adj = isl_union_map_intersect_range(adj, domain);
+		other = isl_union_map_union(other, adj);
+
+		adj = order;
+		range = isl_union_map_range(non_local);
+		range = isl_union_set_coalesce(range);
+		adj = isl_union_map_intersect_domain(adj, range);
+		other = isl_union_map_union(other, adj);
+	}
+	dep = isl_union_map_union(flow, other);
+
+	umap = isl_union_map_from_multi_union_pw_aff(partial);
+	dep = isl_union_map_apply_domain(dep, isl_union_map_copy(umap));
+	dep = isl_union_map_apply_range(dep, umap);
+
+	space = isl_space_map_from_set(space);
+	map = isl_union_map_extract_map(dep, space);
+	isl_union_map_free(dep);
+
+	map = isl_map_coalesce(map);
+
+	return map;
+}
+
+/* Given a constraint of the form
+ *
+ *	a i_0 + b i_1 >= 0
+ * or
+ *	a i_0 + b i_1 = 0
+ *
+ * use it to update one or both of the non-negative bounds
+ * in "list" = (min, max) such that
+ *
+ *	i_1 >= -min i_0
+ * and
+ *	i_1 <= max i_0
+ *
+ * If b = 0, then the constraint cannot be used.
+ * Otherwise, the constraint is equivalent to
+ *
+ *	sgn(b) i_1 >= - a/abs(b) i_0
+ * i.e.,
+ *	i_1 >= - a/abs(b) i_0
+ * or
+ *	i_1 <= a/abs(b) i_0
+ *
+ * Set the first or second element of "list" to max(0, a/abs(b)),
+ * according to the sign of "b".  Or set both in case the constraint
+ * is an equality, taking into account the sign change.
+ */
+static __isl_give isl_val_list *list_set_min_max(__isl_take isl_val_list *list,
+	__isl_keep isl_constraint *c)
+{
+	isl_val *a, *b;
+	int sign;
+	int pos;
+	isl_bool eq, is_zero, is_neg;
+
+	eq = isl_constraint_is_equality(c);
+	if (eq < 0)
+		return isl_val_list_free(list);
+
+	b = isl_constraint_get_coefficient_val(c, isl_dim_set, 1);
+	is_zero = isl_val_is_zero(b);
+	if (is_zero == isl_bool_true) {
+		isl_val_free(b);
+		return list;
+	}
+	a = isl_constraint_get_coefficient_val(c, isl_dim_set, 0);
+	sign = isl_val_sgn(b);
+	b = isl_val_abs(b);
+	a = isl_val_div(a, b);
+
+	if (eq)
+		b = isl_val_copy(a);
+
+	pos = sign > 0 ? 0 : 1;
+	is_neg = isl_val_is_neg(a);
+	if (is_neg == isl_bool_true)
+		a = isl_val_set_si(a, 0);
+	list = isl_val_list_set_val(list, pos, a);
+
+	if (!eq)
+		return is_neg < 0 ? isl_val_list_free(list) : list;
+
+	pos = 1 - pos;
+	a = isl_val_neg(b);
+	is_neg = isl_val_is_neg(a);
+	if (is_neg == isl_bool_true)
+		a = isl_val_set_si(a, 0);
+	list = isl_val_list_set_val(list, pos, a);
+
+	return is_neg < 0 ? isl_val_list_free(list) : list;
+}
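+
+/* For example (an illustrative constraint, not tied to any program):
+ *
+ *	2 i_0 - 3 i_1 >= 0
+ *
+ * has a = 2 and b = -3, so pos = 1 and the second element becomes
+ * max = 2/3, recording i_1 <= (2/3) i_0.  The corresponding equality
+ * would additionally set the first element to max(0, -2/3) = 0.
+ */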
+
+/* If constraint "c" passes through the origin, then try and use it
+ * to update the non-negative bounds in "list" = (min, max) such that
+ *
+ *	i_1 >= -min i_0
+ * and
+ *	i_1 <= max i_0
+ */
+static isl_stat set_min_max(__isl_take isl_constraint *c, void *user)
+{
+	isl_val *v;
+	isl_val_list **list = user;
+	isl_bool is_zero;
+
+	v = isl_constraint_get_constant_val(c);
+	is_zero = isl_val_is_zero(v);
+	isl_val_free(v);
+
+	if (is_zero == isl_bool_true)
+		*list = list_set_min_max(*list, c);
+
+	isl_constraint_free(c);
+	return is_zero < 0 ? isl_stat_error : isl_stat_ok;
+}
+
+/* Given a set of dependence distance vectors "dist", compute
+ * a pair of non-negative bounds min and max such that
+ *
+ *	d_pos >= -min d_0
+ * and
+ *	d_pos <= max d_0
+ *
+ * and return the pair (min, max).
+ * If no bound can be found in either direction, then the bound
+ * is replaced by NaN.
+ *
+ * The dependence distances are first projected onto the (d_0, d_pos) space.
+ * Then the zero dependence distance is added and the convex hull is computed.
+ * Finally, the bounds are extracted from the constraints of the convex hull
+ * that pass through the origin.
+ */
+static __isl_give isl_val_list *min_max_dist(__isl_keep isl_set *dist, int pos)
+{
+	isl_space *space;
+	isl_basic_set *hull;
+	int dim;
+	isl_ctx *ctx;
+	isl_val *nan;
+	isl_val_list *list;
+
+	ctx = isl_set_get_ctx(dist);
+	nan = isl_val_nan(ctx);
+	list = isl_val_list_alloc(ctx, 2);
+	list = isl_val_list_add(list, isl_val_copy(nan));
+	list = isl_val_list_add(list, nan);
+
+	dist = isl_set_copy(dist);
+	dim = isl_set_dim(dist, isl_dim_set);
+	if (dist && pos >= dim)
+		isl_die(ctx, isl_error_internal, "position out of bounds",
+			dist = isl_set_free(dist));
+	dist = isl_set_project_out(dist, isl_dim_set, pos + 1, dim - (pos + 1));
+	dist = isl_set_project_out(dist, isl_dim_set, 1, pos - 1);
+
+	space = isl_set_get_space(dist);
+	dist = isl_set_union(dist, isl_set_from_point(isl_point_zero(space)));
+	dist = isl_set_remove_divs(dist);
+	hull = isl_set_convex_hull(dist);
+
+	if (isl_basic_set_foreach_constraint(hull, &set_min_max, &list) < 0)
+		list = isl_val_list_free(list);
+	isl_basic_set_free(hull);
+
+	return list;
+}
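+
+/* As an illustration (made-up distances): for dist = { [1, -1]; [1, 2] }
+ * and pos = 1, the convex hull of these points and the origin has
+ * the facets i_0 + i_1 >= 0 and 2 i_0 - i_1 >= 0 through the origin,
+ * so the returned pair is (1, 2), encoding -d_0 <= d_1 <= 2 d_0.
+ */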
+
+/* Given a schedule node "node" that, together with its child,
+ * satisfies the input pattern for hybrid tiling, compute bounds
+ * on the relative dependence distances of the child node with
+ * respect to the parent node.  These bounds are needed to
+ * construct a hybrid tiling.
+ *
+ * First all relevant dependences are collected and mapped
+ * to the schedule space of the pair of nodes.  Then, the
+ * dependence distances are computed in this space.
+ *
+ * These dependence distances are then projected onto a two-dimensional
+ * space consisting of the single schedule dimension of the outer node
+ * and one of the schedule dimensions of the inner node.
+ * The maximal and minimal relative dependence distances are extracted
+ * from these projections.
+ * This process is repeated for each of the schedule dimensions
+ * of the inner node.  For the first dimension, both minimal and
+ * maximal relative dependence distances are stored in the result.
+ * For the other dimensions, only the minimal relative dependence
+ * distance is stored.
+ */
+__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
+	__isl_keep isl_schedule_node *node)
+{
+	ppcg_ht_bounds *bnd;
+	isl_space *space;
+	isl_map *map;
+	isl_set *dist;
+	isl_val_list *pair;
+	isl_schedule_node *child;
+	int n;
+	int i, dim;
+
+	if (!scop || !node || check_input_pattern(node) < 0)
+		return NULL;
+
+	child = isl_schedule_node_get_child(node, 0);
+	space = isl_schedule_node_band_get_space(child);
+	dim = isl_schedule_node_band_n_member(child);
+	isl_schedule_node_free(child);
+	bnd = ppcg_ht_bounds_alloc(space);
+	if (!bnd)
+		return NULL;
+
+	map = collect_deps(scop, node);
+
+	dist = isl_map_deltas(map);
+	n = isl_set_dim(dist, isl_dim_param);
+	dist = isl_set_project_out(dist, isl_dim_param, 0, n);
+
+	pair = min_max_dist(dist, 1);
+	bnd = ppcg_ht_bounds_set_lower(bnd, 0, isl_val_list_get_val(pair, 0));
+	bnd = ppcg_ht_bounds_set_upper(bnd, isl_val_list_get_val(pair, 1));
+	isl_val_list_free(pair);
+
+	for (i = 1; i < dim; ++i) {
+		pair = min_max_dist(dist, 1 + i);
+		bnd = ppcg_ht_bounds_set_lower(bnd, i,
+						isl_val_list_get_val(pair, 0));
+		isl_val_list_free(pair);
+	}
+
+	isl_set_free(dist);
+
+	return bnd;
+}
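+
+/* For a three-point stencil (illustration) whose dependence distance
+ * vectors in the (parent, child) schedule space are
+ * {(1, -1), (1, 0), (1, 1)}, this yields dl = du = 1 for the first
+ * child dimension, i.e., -d_0 <= d_1 <= d_0.
+ */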
+
+/* Check if all the fields of "phase" are valid, freeing "phase"
+ * if they are not.
+ */
+static __isl_give ppcg_ht_phase *check_phase(__isl_take ppcg_ht_phase *phase)
+{
+	if (!phase)
+		return NULL;
+
+	if (!phase->tiling || !phase->local_time ||
+	    !phase->shift_space || !phase->domain)
+		return ppcg_ht_phase_free(phase);
+
+	return phase;
+}
+
+/* Construct a ppcg_ht_phase object that simply copies
+ * information from "tiling".
+ * That is, the result is defined over the "ts" space and
+ * corresponds to phase 1.
+ */
+static __isl_give ppcg_ht_phase *construct_phase(
+	__isl_keep ppcg_ht_tiling *tiling)
+{
+	isl_ctx *ctx;
+	ppcg_ht_phase *phase;
+
+	if (!tiling)
+		return NULL;
+
+	ctx = ppcg_ht_tiling_get_ctx(tiling);
+	phase = isl_calloc_type(ctx, struct ppcg_ht_phase);
+	if (!phase)
+		return NULL;
+	phase->tiling = ppcg_ht_tiling_copy(tiling);
+	phase->time_tile = isl_aff_copy(tiling->time_tile);
+	phase->local_time = isl_aff_copy(tiling->local_time);
+	phase->shift_space = isl_aff_copy(tiling->shift_space);
+	phase->domain = isl_set_copy(tiling->hex);
+
+	return check_phase(phase);
+}
+
+/* Align the parameters of the elements of "phase" to those of "space".
+ */
+static __isl_give ppcg_ht_phase *phase_align_params(
+	__isl_take ppcg_ht_phase *phase, __isl_take isl_space *space)
+{
+	if (!phase)
+		goto error;
+
+	phase->time_tile = isl_aff_align_params(phase->time_tile,
+							isl_space_copy(space));
+	phase->local_time = isl_aff_align_params(phase->local_time,
+							isl_space_copy(space));
+	phase->shift_space = isl_aff_align_params(phase->shift_space,
+							isl_space_copy(space));
+	phase->domain = isl_set_align_params(phase->domain, space);
+
+	return check_phase(phase);
+error:
+	isl_space_free(space);
+	return NULL;
+}
+
+/* Pull back "phase" over "ma".
+ * That is, take a phase defined over the range of "ma" and
+ * turn it into a phase defined over the domain of "ma".
+ */
+static __isl_give ppcg_ht_phase *pullback_phase(__isl_take ppcg_ht_phase *phase,
+	__isl_take isl_multi_aff *ma)
+{
+	phase = phase_align_params(phase, isl_multi_aff_get_space(ma));
+	if (!phase)
+		goto error;
+
+	phase->time_tile = isl_aff_pullback_multi_aff(phase->time_tile,
+							isl_multi_aff_copy(ma));
+	phase->local_time = isl_aff_pullback_multi_aff(phase->local_time,
+							isl_multi_aff_copy(ma));
+	phase->shift_space = isl_aff_pullback_multi_aff(phase->shift_space,
+							isl_multi_aff_copy(ma));
+	phase->domain = isl_set_preimage_multi_aff(phase->domain, ma);
+
+	return check_phase(phase);
+error:
+	isl_multi_aff_free(ma);
+	return NULL;
+}
+
+/* Pullback "phase" over phase->tiling->shift_phase, which shifts
+ * phase 0 to phase 1.  The pullback therefore takes a phase 1
+ * description and turns it into a phase 0 description.
+ */
+static __isl_give ppcg_ht_phase *shift_phase(__isl_take ppcg_ht_phase *phase)
+{
+	ppcg_ht_tiling *tiling;
+
+	if (!phase)
+		return NULL;
+
+	tiling = phase->tiling;
+	return pullback_phase(phase, isl_multi_aff_copy(tiling->shift_phase));
+}
+
+/* Take a "phase" defined over the ts-space and plug in the projection
+ * from the input schedule space to the ts-space.
+ * The result is then defined over this input schedule space.
+ */
+static __isl_give ppcg_ht_phase *lift_phase(__isl_take ppcg_ht_phase *phase)
+{
+	ppcg_ht_tiling *tiling;
+
+	if (!phase)
+		return NULL;
+
+	tiling = phase->tiling;
+	return pullback_phase(phase, isl_multi_aff_copy(tiling->project_ts));
+}
+
+/* Compute the shift that should be added to the space band
+ * in order to be able to apply rectangular tiling to the space.
+ * Store the shift in phase->space_shift.
+ *
+ * In the first dimension, it is equal to shift_space - s.
+ * For phase 1, this results in
+ *
+ *	(-(2 * shift_s)*T) % W
+ *
+ * In phase 0, the "s" in shift_space has been replaced by "s + shift_s",
+ * so the result is
+ *
+ *	shift_s + (-(2 * shift_s)*T) % W
+ *
+ * In the other dimensions, the shift is equal to
+ *
+ *	dl_i * local_time.
+ */
+static __isl_give ppcg_ht_phase *compute_space_shift(
+	__isl_take ppcg_ht_phase *phase)
+{
+	int i, n;
+	isl_space *space;
+	isl_local_space *ls;
+	isl_aff *aff, *s;
+	isl_multi_aff *space_shift;
+
+	if (!phase)
+		return NULL;
+
+	space = ppcg_ht_phase_get_input_space(phase);
+	space = isl_space_unwrap(space);
+	space = isl_space_range_map(space);
+
+	space_shift = isl_multi_aff_zero(space);
+	aff = isl_aff_copy(phase->shift_space);
+	ls = isl_local_space_from_space(isl_aff_get_domain_space(aff));
+	s = isl_aff_var_on_domain(ls, isl_dim_set, 1);
+	aff = isl_aff_sub(aff, s);
+	space_shift = isl_multi_aff_set_aff(space_shift, 0, aff);
+
+	n = isl_multi_aff_dim(space_shift, isl_dim_out);
+	for (i = 1; i < n; ++i) {
+		isl_val *v;
+		isl_aff *time;
+
+		v = ppcg_ht_bounds_get_lower(phase->tiling->bounds, i);
+		time = isl_aff_copy(phase->local_time);
+		time = isl_aff_scale_val(time, v);
+		space_shift = isl_multi_aff_set_aff(space_shift, i, time);
+	}
+
+	if (!space_shift)
+		return ppcg_ht_phase_free(phase);
+	phase->space_shift = space_shift;
+	return phase;
+}
+
+/* Compute the space tiling and store the result in phase->space_tile.
+ * The space tiling is of the form
+ *
+ *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
+ */
+static __isl_give ppcg_ht_phase *compute_space_tile(
+	__isl_take ppcg_ht_phase *phase)
+{
+	isl_space *space;
+	isl_multi_val *space_sizes;
+	isl_multi_aff *space_shift;
+	isl_multi_aff *tile;
+
+	if (!phase)
+		return NULL;
+
+	space = ppcg_ht_phase_get_input_space(phase);
+	space = isl_space_unwrap(space);
+	tile = isl_multi_aff_range_map(space);
+	space_shift = isl_multi_aff_copy(phase->space_shift);
+	tile = isl_multi_aff_add(space_shift, tile);
+	space_sizes = isl_multi_val_copy(phase->tiling->space_sizes);
+	tile = isl_multi_aff_scale_down_multi_val(tile, space_sizes);
+	tile = isl_multi_aff_floor(tile);
+
+	if (!tile)
+		return ppcg_ht_phase_free(phase);
+	phase->space_tile = tile;
+	return phase;
+}
+
+/* Construct a representation for one of the two phases of hybrid tiling
+ * "tiling".  If "shift" is not set, then the phase is constructed
+ * directly from the hexagonal tile shape in "tiling", which represents
+ * the phase-1 tiles.  If "shift" is set, then this tile shape is shifted
+ * back over tiling->shift_phase to obtain the phase-0 tiles.
+ *
+ * First copy data from "tiling", then optionally shift the phase and
+ * finally move the tiling from the "ts" space of "tiling" to
+ * the space of the input pattern.
+ *
+ * After the basic phase has been computed, also compute
+ * the corresponding space shift.
+ */
+static __isl_give ppcg_ht_phase *ppcg_ht_tiling_compute_phase(
+	__isl_keep ppcg_ht_tiling *tiling, int shift)
+{
+	ppcg_ht_phase *phase;
+
+	phase = construct_phase(tiling);
+	if (shift)
+		phase = shift_phase(phase);
+	phase = lift_phase(phase);
+
+	phase = compute_space_shift(phase);
+	phase = compute_space_tile(phase);
+
+	return phase;
+}
+
+/* Construct a function that is equal to the time tile of "phase0"
+ * on the domain of "phase0" and equal to the time tile of "phase1"
+ * on the domain of "phase1".
+ * The two domains are assumed to form a partition of the input
+ * schedule space.
+ */
+static __isl_give isl_pw_multi_aff *combine_time_tile(
+	__isl_keep ppcg_ht_phase *phase0, __isl_keep ppcg_ht_phase *phase1)
+{
+	isl_aff *T;
+	isl_pw_aff *time, *time1;
+
+	if (!phase0 || !phase1)
+		return NULL;
+
+	T = isl_aff_copy(phase0->time_tile);
+	time = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase0), T);
+
+	T = isl_aff_copy(phase1->time_tile);
+	time1 = isl_pw_aff_alloc(ppcg_ht_phase_get_domain(phase1), T);
+
+	time = isl_pw_aff_union_add(time, time1);
+
+	return isl_pw_multi_aff_from_pw_aff(time);
+}
+
+/* Name used in mark nodes that contain a pointer to a ppcg_ht_phase.
+ */
+static char *ppcg_phase_name = "phase";
+
+/* Does "id" contain a pointer to a ppcg_ht_phase?
+ * That is, is it called "phase"?
+ */
+static isl_bool is_phase_id(__isl_keep isl_id *id)
+{
+	const char *name;
+
+	name = isl_id_get_name(id);
+	if (!name)
+		return isl_bool_error;
+
+	return !strcmp(name, ppcg_phase_name);
+}
+
+/* Given a mark node with an identifier that points to a ppcg_ht_phase,
+ * extract this ppcg_ht_phase pointer.
+ */
+__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
+	__isl_keep isl_schedule_node *node)
+{
+	isl_bool is_phase;
+	isl_id *id;
+	void *p;
+
+	if (!node)
+		return NULL;
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
+			"not a phase mark", return NULL);
+
+	id = isl_schedule_node_mark_get_id(node);
+	is_phase = is_phase_id(id);
+	p = isl_id_get_user(id);
+	isl_id_free(id);
+
+	if (is_phase < 0)
+		return NULL;
+	if (!is_phase)
+		isl_die(isl_schedule_node_get_ctx(node), isl_error_internal,
+			"not a phase mark", return NULL);
+
+	return p;
+}
+
+/* Insert a mark node at "node" holding a pointer to "phase".
+ */
+static __isl_give isl_schedule_node *insert_phase(
+	__isl_take isl_schedule_node *node, __isl_take ppcg_ht_phase *phase)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+
+	if (!node)
+		goto error;
+	ctx = isl_schedule_node_get_ctx(node);
+	id = isl_id_alloc(ctx, ppcg_phase_name, phase);
+	if (!id)
+		goto error;
+	id = isl_id_set_free_user(id, &ppcg_ht_phase_free_wrap);
+	node = isl_schedule_node_insert_mark(node, id);
+
+	return node;
+error:
+	ppcg_ht_phase_free(phase);
+	isl_schedule_node_free(node);
+	return NULL;
+}
+
+/* Construct a mapping from the elements of the original pair of bands
+ * to which tiling was applied that belong to a tile of "phase"
+ * to that tile, preserving the values for the outer bands.
+ *
+ * The mapping is of the form
+ *
+ *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
+ *
+ * where tile is defined by a concatenation of the time_tile and
+ * the space_tile.
+ */
+static __isl_give isl_map *construct_tile_map(__isl_keep ppcg_ht_phase *phase)
+{
+	int depth;
+	isl_space *space;
+	isl_multi_aff *ma;
+	isl_multi_aff *tiling;
+	isl_map *el2tile;
+
+	depth = isl_schedule_node_get_schedule_depth(
+						phase->tiling->input_node);
+	space = isl_aff_get_space(phase->time_tile);
+	space = isl_space_params(space);
+	space = isl_space_set_from_params(space);
+	space = isl_space_add_dims(space, isl_dim_set, depth);
+	space = isl_space_map_from_set(space);
+	ma = isl_multi_aff_identity(space);
+
+	tiling = isl_multi_aff_flat_range_product(
+		isl_multi_aff_from_aff(isl_aff_copy(phase->time_tile)),
+		isl_multi_aff_copy(phase->space_tile));
+	el2tile = isl_map_from_multi_aff(tiling);
+	el2tile = isl_map_intersect_domain(el2tile,
+						isl_set_copy(phase->domain));
+	el2tile = isl_map_product(isl_map_from_multi_aff(ma), el2tile);
+
+	return el2tile;
+}
+
+/* Return a description of the full tiles of "phase" at the point
+ * in the original schedule tree where the tiling was applied.
+ *
+ * First construct a mapping from the input schedule dimensions,
+ * up to and including the original pair of bands to which hybrid tiling
+ * was applied, to the schedule dimensions in which this original pair
+ * has been replaced by the tiles.
+ * This mapping is of the form
+ *
+ *	[[outer] -> [P -> C]] -> [[outer] -> [tile]]
+ *
+ * Apply this mapping to the set of all values for the input
+ * schedule dimensions and then apply its inverse.
+ * The result is the set of values for the input schedule dimensions
+ * that would map to any of the tiles.  Subtracting from this set
+ * the set of values that are actually executed produces the set
+ * of values that belong to a tile but that are not executed.
+ * Mapping these back to the tiles produces a description of
+ * the partial tiles.  Subtracting these from the set of all tiles
+ * produces a description of the full tiles in the form
+ *
+ *	[[outer] -> [tile]]
+ */
+static __isl_give isl_set *compute_full_tile(__isl_keep ppcg_ht_phase *phase)
+{
+	isl_schedule_node *node;
+	isl_union_set *domain;
+	isl_union_map *prefix, *schedule;
+	isl_set *all, *partial, *all_el;
+	isl_map *tile2el, *el2tile;
+	isl_multi_union_pw_aff *mupa;
+
+	el2tile = construct_tile_map(phase);
+	tile2el = isl_map_reverse(isl_map_copy(el2tile));
+
+	node = phase->tiling->input_node;
+	prefix = isl_schedule_node_get_prefix_schedule_union_map(node);
+	domain = isl_schedule_node_get_domain(node);
+	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
+	schedule = isl_union_map_from_multi_union_pw_aff(mupa);
+	schedule = isl_union_map_range_product(prefix, schedule);
+	all_el = isl_set_from_union_set(isl_union_set_apply(domain, schedule));
+	all_el = isl_set_coalesce(all_el);
+
+	all = isl_set_apply(isl_set_copy(all_el), isl_map_copy(el2tile));
+
+	partial = isl_set_copy(all);
+	partial = isl_set_apply(partial, tile2el);
+	partial = isl_set_subtract(partial, all_el);
+	partial = isl_set_apply(partial, el2tile);
+
+	return isl_set_subtract(all, partial);
+}
+
+/* Copy the AST loop types of the non-isolated part to those
+ * of the isolated part.
+ */
+static __isl_give isl_schedule_node *set_isolate_loop_type(
+	__isl_take isl_schedule_node *node)
+{
+	int i, n;
+
+	n = isl_schedule_node_band_n_member(node);
+	for (i = 0; i < n; ++i) {
+		enum isl_ast_loop_type type;
+
+		type = isl_schedule_node_band_member_get_ast_loop_type(node, i);
+		node = isl_schedule_node_band_member_set_isolate_ast_loop_type(
+								node, i, type);
+	}
+
+	return node;
+}
+
+/* If options->isolate_full_tiles is set, then mark the full tiles
+ * in "node" for isolation.  The full tiles are derived from "phase".
+ * "node" may point to a part of the tiling, e.g., the space tiling.
+ *
+ * The full tiles are originally computed in the form
+ *
+ *	[[outer] -> [tile]]
+ *
+ * However, the band that "node" points to may only contain
+ * a subset of the tile dimensions.
+ * The description above is therefore treated as
+ *
+ *	[[outer] -> [before; this; after]]
+ *
+ * before is of size "pos"; this is of size "dim"; and
+ * after is of size "out - pos - dim".
+ * The after part is first projected out.  Then the range is split
+ * into a before and this part and finally the before part is moved
+ * to the domain, resulting in
+ *
+ *	[[outer; before] -> [this]]
+ *
+ * This description is then used as the isolate option.
+ *
+ * The AST loop type for the isolated part is set to be the same
+ * as that of the non-isolated part.
+ */
+static __isl_give isl_schedule_node *ppcg_ht_phase_isolate_full_tile_node(
+	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node,
+	struct ppcg_options *options)
+{
+	int in, out, pos, depth, dim;
+	isl_space *space;
+	isl_multi_aff *ma1, *ma2;
+	isl_set *tile;
+	isl_map *map;
+	isl_set *set;
+	isl_union_set *opt;
+
+	if (!options->isolate_full_tiles)
+		return node;
+
+	depth = isl_schedule_node_get_schedule_depth(node);
+	dim = isl_schedule_node_band_n_member(node);
+
+	tile = compute_full_tile(phase);
+	map = isl_set_unwrap(tile);
+	in = isl_map_dim(map, isl_dim_in);
+	out = isl_map_dim(map, isl_dim_out);
+	pos = depth - in;
+	map = isl_map_project_out(map, isl_dim_out, pos + dim,
+				out - (pos + dim));
+	space = isl_space_range(isl_map_get_space(map));
+	ma1 = isl_multi_aff_project_out_map(isl_space_copy(space),
+					   isl_dim_set, pos, dim);
+	ma2 = isl_multi_aff_project_out_map(space, isl_dim_set, 0, pos);
+	ma1 = isl_multi_aff_range_product(ma1, ma2);
+	map = isl_map_apply_range(map, isl_map_from_multi_aff(ma1));
+	map = isl_map_uncurry(map);
+	map = isl_map_flatten_domain(map);
+	set = isl_map_wrap(map);
+	set = isl_set_set_tuple_name(set, "isolate");
+
+	opt = isl_schedule_node_band_get_ast_build_options(node);
+	opt = isl_union_set_add_set(opt, set);
+	node = isl_schedule_node_band_set_ast_build_options(node, opt);
+	node = set_isolate_loop_type(node);
+
+	return node;
+}
+
+/* Insert a band node for performing the space tiling for "phase" at "node".
+ * In particular, insert a band node with partial schedule
+ *
+ *	[P[t] -> C[s]] -> C[floor((s + space_shift)/space_size)]
+ *
+ * pulled back over the input schedule.
+ * "options" determines whether full tiles should be separated
+ * from partial tiles.
+ *
+ * The first tile dimension iterates over the hexagons in the same
+ * phase, which are independent by construction.  The first dimension
+ * is therefore marked coincident.
+ * All dimensions are also marked for being generated as atomic loops
+ * because separation is usually not desirable on tile loops.
+ */
+static __isl_give isl_schedule_node *insert_space_tiling(
+	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node,
+	struct ppcg_options *options)
+{
+	isl_multi_aff *space_tile;
+	isl_multi_union_pw_aff *mupa;
+
+	if (!phase)
+		return isl_schedule_node_free(node);
+
+	space_tile = isl_multi_aff_copy(phase->space_tile);
+	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
+	mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_tile);
+	node = isl_schedule_node_insert_partial_schedule(node, mupa);
+	node = ppcg_set_schedule_node_type(node, isl_ast_loop_atomic);
+	node = ppcg_ht_phase_isolate_full_tile_node(phase, node, options);
+	node = isl_schedule_node_band_member_set_coincident(node, 0, 1);
+
+	return node;
+}
+
+/* Given a pointer "node" to (a copy of) the original child node
+ * in the input pattern, adjust its partial schedule such that
+ * it starts at zero within each tile.
+ *
+ * That is, replace "s" by (s + space_shift) % space_sizes.
+ */
+__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
+	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node)
+{
+	isl_multi_val *space_sizes;
+	isl_multi_aff *space_shift;
+	isl_multi_union_pw_aff *mupa;
+
+	space_shift = isl_multi_aff_copy(phase->space_shift);
+	mupa = isl_multi_union_pw_aff_copy(phase->tiling->input_schedule);
+	mupa = isl_multi_union_pw_aff_apply_multi_aff(mupa, space_shift);
+	node = isl_schedule_node_band_shift(node, mupa);
+	space_sizes = isl_multi_val_copy(phase->tiling->space_sizes);
+	node = isl_schedule_node_band_mod(node, space_sizes);
+
+	return node;
+}
+
+/* Does
+ *
+ *	s0 > delta + 2 * {delta * h} - 1
+ *
+ * hold?
+ */
+static isl_bool wide_enough(__isl_keep isl_val *s0, __isl_keep isl_val *delta,
+	__isl_keep isl_val *h)
+{
+	isl_val *v, *v2;
+	isl_bool ok;
+
+	v = isl_val_mul(isl_val_copy(delta), isl_val_copy(h));
+	v2 = isl_val_floor(isl_val_copy(v));
+	v = isl_val_sub(v, v2);
+	v = isl_val_mul_ui(v, 2);
+	v = isl_val_add(v, isl_val_copy(delta));
+	v = isl_val_sub_ui(v, 1);
+	ok = isl_val_gt(s0, v);
+	isl_val_free(v);
+
+	return ok;
+}
+
+/* Is the tile size specified by "sizes" wide enough in the first space
+ * dimension, i.e., the base of the hexagon?  This ensures that,
+ * after hybrid tiling using "bounds" and these sizes,
+ * neighboring hexagons in the same phase are far enough apart
+ * that they do not depend on each other.
+ * The test is only meaningful if the bounds are valid.
+ *
+ * Let st be (half) the size in the time dimension and s0 the base
+ * size in the first space dimension.  Let delta be the dependence
+ * distance in either positive or negative direction.  In principle,
+ * it should be enough to have s0 + 1 > delta, i.e., s0 >= delta.
+ * However, in case of fractional delta, the tile is not extended
+ * with delta * (st - 1), but instead with floor(delta * (st - 1)).
+ * The condition therefore needs to be adjusted to
+ *
+ *	s0 + 1 > delta + 2 {delta * (st - 1)}
+ *
+ * (with {} the fractional part) to account for the two slanted sides.
+ * The condition in the paper "Hybrid Hexagonal/Classical Tiling for GPUs"
+ * translates to
+ *
+ *	s0 >= delta + {delta * (st - 1)}
+ *
+ * Since 1 > {delta * (st - 1)}, this condition implies
+ * the condition above.
+ *
+ * The condition is checked for both directions.
+ */
+isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
+	__isl_keep isl_multi_val *sizes)
+{
+	isl_val *s0, *h;
+	isl_val *delta;
+	isl_bool ok;
+
+	ok = ppcg_ht_bounds_is_valid(bounds);
+	if (ok < 0 || !ok)
+		return ok;
+
+	h = isl_val_sub_ui(isl_multi_val_get_val(sizes, 0), 1);
+	s0 = isl_multi_val_get_val(sizes, 1);
+
+	delta = ppcg_ht_bounds_get_lower(bounds, 0);
+	ok = wide_enough(s0, delta, h);
+	isl_val_free(delta);
+
+	delta = ppcg_ht_bounds_get_upper(bounds);
+	if (ok == isl_bool_true)
+		ok = wide_enough(s0, delta, h);
+	isl_val_free(delta);
+
+	isl_val_free(s0);
+	isl_val_free(h);
+
+	return ok;
+}
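+
+/* Numerically (illustration only): for delta = 1/2 and st = 4,
+ * so h = 3 and {delta * h} = {3/2} = 1/2, wide_enough checks
+ * s0 > 1/2 + 2 * 1/2 - 1 = 1/2, so any s0 >= 1 passes, matching
+ * the paper's condition s0 >= 1/2 + 1/2 = 1.
+ */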
+
+/* Check that the tile will be wide enough in the first space
+ * dimension, i.e., the base of the hexagon.  This ensures that
+ * neighboring hexagons in the same phase are far enough apart
+ * that they do not depend on each other.
+ *
+ * Error out if the condition fails to hold.
+ */
+static isl_stat check_width(__isl_keep ppcg_ht_bounds *bounds,
+	__isl_keep isl_multi_val *sizes)
+{
+	isl_bool ok;
+
+	ok = ppcg_ht_bounds_supports_sizes(bounds, sizes);
+
+	if (ok < 0)
+		return isl_stat_error;
+	if (!ok)
+		isl_die(isl_multi_val_get_ctx(sizes), isl_error_invalid,
+			"base of hybrid tiling hexagon not sufficiently wide",
+			return isl_stat_error);
+
+	return isl_stat_ok;
+}
+
+/* Given valid bounds on the relative dependence distances for
+ * the pair of nested nodes that "node" point to, as well as sufficiently
+ * wide tile sizes "sizes", insert the corresponding time and space tiling
+ * at "node", along with a pair of phase nodes that can be used
+ * to make further changes.
+ * The space of "sizes" should be the product of the spaces
+ * of the schedules of the pair of parent and child nodes.
+ * "options" determines whether full tiles should be separated
+ * from partial tiles.
+ *
+ * In particular, given an input of the form
+ *
+ *	P - C - ...
+ *
+ * the output has the form
+ *
+ *	        /- F0 - M0 - CT0 - P - C - ...
+ *	PT - seq
+ *	        \- F1 - M1 - CT1 - P - C - ...
+ *
+ * PT is the global time tiling.  Within each of these tiles,
+ * two phases are executed in order.  Within each phase, the schedule
+ * space is further subdivided into tiles through CT0 and CT1.
+ * The first dimension of each of these iterates over the hexagons
+ * within a phase and these are independent by construction.
+ * The F0 and F1 filters filter the statement instances that belong
+ * to the corresponding phase.  The M0 and M1 marks contain a pointer
+ * to a ppcg_ht_phase object that can be used to perform further changes.
+ *
+ * After checking that input satisfies the requirements,
+ * a data structure is constructed that represents the tiling and
+ * two additional data structures are constructed for the two phases
+ * of the tiling.  These are then used to define the filters F0 and F1 and
+ * combined to construct the time tiling PT.
+ * Then the time tiling node PT is inserted, followed by
+ * the sequence with the two filters, the CT space tiling nodes and
+ * the phase markers M.
+ */
+__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
+	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
+	__isl_take isl_schedule_node *node, struct ppcg_options *options)
+{
+	isl_ctx *ctx;
+	isl_union_set *phase0;
+	isl_union_set *phase1;
+	isl_multi_union_pw_aff *input, *dom_time;
+	isl_union_pw_multi_aff *upma;
+	isl_pw_multi_aff *time;
+	isl_union_set_list *phases;
+	ppcg_ht_tiling *tiling;
+	ppcg_ht_phase *phase_0;
+	ppcg_ht_phase *phase_1;
+
+	if (!node || !sizes || !bounds)
+		goto error;
+	if (check_input_pattern(node) < 0 || check_width(bounds, sizes) < 0)
+		goto error;
+
+	ctx = isl_schedule_node_get_ctx(node);
+
+	input = extract_input_schedule(node);
+
+	tiling = ppcg_ht_bounds_construct_tiling(bounds, node, input, sizes);
+	phase_0 = ppcg_ht_tiling_compute_phase(tiling, 1);
+	phase_1 = ppcg_ht_tiling_compute_phase(tiling, 0);
+	time = combine_time_tile(phase_0, phase_1);
+	ppcg_ht_tiling_free(tiling);
+
+	upma = isl_union_pw_multi_aff_from_multi_union_pw_aff(
+					isl_multi_union_pw_aff_copy(input));
+	phase0 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_0));
+	phase0 = isl_union_set_preimage_union_pw_multi_aff(phase0,
+					isl_union_pw_multi_aff_copy(upma));
+	phase1 = isl_union_set_from_set(ppcg_ht_phase_get_domain(phase_1));
+	phase1 = isl_union_set_preimage_union_pw_multi_aff(phase1, upma);
+
+	phases = isl_union_set_list_alloc(ctx, 2);
+	phases = isl_union_set_list_add(phases, phase0);
+	phases = isl_union_set_list_add(phases, phase1);
+
+	dom_time = isl_multi_union_pw_aff_apply_pw_multi_aff(input, time);
+	node = isl_schedule_node_insert_partial_schedule(node, dom_time);
+
+	node = isl_schedule_node_child(node, 0);
+
+	node = isl_schedule_node_insert_sequence(node, phases);
+	node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_child(node, 0);
+	node = insert_space_tiling(phase_0, node, options);
+	node = insert_phase(node, phase_0);
+	node = isl_schedule_node_parent(node);
+	node = isl_schedule_node_next_sibling(node);
+	node = isl_schedule_node_child(node, 0);
+	node = insert_space_tiling(phase_1, node, options);
+	node = insert_phase(node, phase_1);
+	node = isl_schedule_node_parent(node);
+	node = isl_schedule_node_parent(node);
+
+	node = isl_schedule_node_parent(node);
+
+	isl_multi_val_free(sizes);
+	return node;
+error:
+	isl_multi_val_free(sizes);
+	isl_schedule_node_free(node);
+	ppcg_ht_bounds_free(bounds);
+	return NULL;
+}
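+
+/* A typical call sequence (sketch only; "scop", "node", "sizes" and
+ * "options" are assumed to be set up by the caller, and error handling
+ * is elided):
+ *
+ *	ppcg_ht_bounds *bounds = ppcg_ht_compute_bounds(scop, node);
+ *	if (ppcg_ht_bounds_supports_sizes(bounds, sizes) == isl_bool_true)
+ *		node = ppcg_ht_bounds_insert_tiling(bounds,
+ *				isl_multi_val_copy(sizes), node, options);
+ *	else
+ *		bounds = ppcg_ht_bounds_free(bounds);
+ */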
+
+/* Given a branch "node" that contains a sequence node with two phases
+ * of hybrid tiling as input, call "fn" on each of the two phase marker
+ * nodes.
+ *
+ * That is, the input is as follows
+ *
+ *	         /- F0 - M0 - ...
+ *	... - seq
+ *	         \- F1 - M1 - ...
+ *
+ * and "fn" is called on M0 and on M1.
+ */
+__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
+	__isl_take isl_schedule_node *node,
+	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
+		void *user), void *user)
+{
+	int depth0, depth;
+
+	depth0 = isl_schedule_node_get_tree_depth(node);
+
+	while (node &&
+	    isl_schedule_node_get_type(node) != isl_schedule_node_sequence)
+		node = isl_schedule_node_child(node, 0);
+
+	node = isl_schedule_node_child(node, 0);
+	node = isl_schedule_node_child(node, 0);
+	if (!node)
+		return NULL;
+	node = fn(node, user);
+	node = isl_schedule_node_parent(node);
+	node = isl_schedule_node_next_sibling(node);
+	node = isl_schedule_node_child(node, 0);
+	if (!node)
+		return NULL;
+	node = fn(node, user);
+	node = isl_schedule_node_parent(node);
+	node = isl_schedule_node_parent(node);
+
+	depth = isl_schedule_node_get_tree_depth(node);
+	node = isl_schedule_node_ancestor(node, depth - depth0);
+
+	return node;
+}
+
+/* This function is called on each of the two phase marks
+ * in a hybrid tiling tree.
+ * Drop the phase mark at "node".
+ */
+static __isl_give isl_schedule_node *drop_phase_mark(
+	__isl_take isl_schedule_node *node, void *user)
+{
+	isl_id *id;
+	isl_bool is_phase;
+
+	if (isl_schedule_node_get_type(node) != isl_schedule_node_mark)
+		return node;
+
+	id = isl_schedule_node_mark_get_id(node);
+	is_phase = is_phase_id(id);
+	isl_id_free(id);
+
+	if (is_phase < 0)
+		return isl_schedule_node_free(node);
+	if (is_phase)
+		node = isl_schedule_node_delete(node);
+
+	return node;
+}
+
+/* Given a branch "node" that contains a sequence node with two phases
+ * of hybrid tiling as input, remove the two phase marker nodes.
+ *
+ * That is, the input is as follows
+ *
+ *	         /- F0 - M0 - ...
+ *	... - seq
+ *	         \- F1 - M1 - ...
+ *
+ * and the output is
+ *
+ *	         /- F0 - ...
+ *	... - seq
+ *	         \- F1 - ...
+ */
+__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
+	__isl_take isl_schedule_node *node)
+{
+	return hybrid_tile_foreach_phase(node, &drop_phase_mark, NULL);
+}

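A note on the traversal above: hybrid_tile_foreach_phase is the extension
point for per-phase rewrites, and drop_phase_mark is one instance of the
callback signature, applied to both phase markers M0 and M1.  A minimal
sketch of another callback under the same signature (count_phase and
count_phases are hypothetical, not part of this commit):

/* Hypothetical callback: count the phase nodes that are visited.
 * "user" points to an int counter owned by the caller.
 */
static __isl_give isl_schedule_node *count_phase(
	__isl_take isl_schedule_node *node, void *user)
{
	int *n = user;

	++*n;
	return node;
}

/* Usage sketch: "*n" ends up as 2, one per phase of the hybrid tiling. */
static __isl_give isl_schedule_node *count_phases(
	__isl_take isl_schedule_node *node, int *n)
{
	*n = 0;
	return hybrid_tile_foreach_phase(node, &count_phase, n);
}
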
Added: polly/trunk/lib/External/ppcg/hybrid.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/hybrid.h?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/hybrid.h (added)
+++ polly/trunk/lib/External/ppcg/hybrid.h Thu Jul 20 08:48:13 2017
@@ -0,0 +1,41 @@
+#ifndef HYBRID_H
+#define HYBRID_H
+
+#include <isl/val.h>
+#include <isl/schedule_node.h>
+
+#include "ppcg.h"
+
+struct ppcg_ht_bounds;
+typedef struct ppcg_ht_bounds ppcg_ht_bounds;
+
+struct ppcg_ht_phase;
+typedef struct ppcg_ht_phase ppcg_ht_phase;
+
+isl_bool ppcg_ht_has_input_pattern(__isl_keep isl_schedule_node *node);
+isl_bool ppcg_ht_parent_has_input_pattern(__isl_keep isl_schedule_node *node);
+
+__isl_give ppcg_ht_bounds *ppcg_ht_compute_bounds(struct ppcg_scop *scop,
+	__isl_keep isl_schedule_node *node);
+void ppcg_ht_bounds_dump(__isl_keep ppcg_ht_bounds *bounds);
+isl_bool ppcg_ht_bounds_is_valid(__isl_keep ppcg_ht_bounds *bounds);
+isl_bool ppcg_ht_bounds_supports_sizes(__isl_keep ppcg_ht_bounds *bounds,
+	__isl_keep isl_multi_val *sizes);
+__isl_give isl_schedule_node *ppcg_ht_bounds_insert_tiling(
+	__isl_take ppcg_ht_bounds *bounds, __isl_take isl_multi_val *sizes,
+	__isl_take isl_schedule_node *node, struct ppcg_options *options);
+__isl_null ppcg_ht_bounds *ppcg_ht_bounds_free(
+	__isl_take ppcg_ht_bounds *bounds);
+
+__isl_keep ppcg_ht_phase *ppcg_ht_phase_extract_from_mark(
+	__isl_keep isl_schedule_node *node);
+__isl_give isl_schedule_node *ppcg_ht_phase_shift_space_point(
+	__isl_keep ppcg_ht_phase *phase, __isl_take isl_schedule_node *node);
+__isl_give isl_schedule_node *hybrid_tile_foreach_phase(
+	__isl_take isl_schedule_node *node,
+	__isl_give isl_schedule_node *(*fn)(__isl_take isl_schedule_node *node,
+		void *user), void *user);
+__isl_give isl_schedule_node *hybrid_tile_drop_phase_marks(
+	__isl_take isl_schedule_node *node);
+
+#endif

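Taken together, these declarations suggest the following call sequence for
applying hybrid tiling to a schedule node.  This is a sketch only
(try_hybrid_tile is hypothetical, and isl_bool_error results are folded
into the bail-out paths instead of being reported):

static __isl_give isl_schedule_node *try_hybrid_tile(
	struct ppcg_scop *scop, __isl_take isl_schedule_node *node,
	__isl_take isl_multi_val *sizes, struct ppcg_options *options)
{
	ppcg_ht_bounds *bounds;

	/* Bail out if the subtree does not match the input pattern. */
	if (ppcg_ht_has_input_pattern(node) != isl_bool_true) {
		isl_multi_val_free(sizes);
		return node;
	}
	bounds = ppcg_ht_compute_bounds(scop, node);
	if (ppcg_ht_bounds_is_valid(bounds) != isl_bool_true ||
	    ppcg_ht_bounds_supports_sizes(bounds, sizes) != isl_bool_true) {
		ppcg_ht_bounds_free(bounds);
		isl_multi_val_free(sizes);
		return node;
	}
	/* Consumes "bounds" and "sizes"; returns the tiled node. */
	return ppcg_ht_bounds_insert_tiling(bounds, sizes, node, options);
}
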
Modified: polly/trunk/lib/External/ppcg/opencl.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/opencl.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/opencl.c (original)
+++ polly/trunk/lib/External/ppcg/opencl.c Thu Jul 20 08:48:13 2017
@@ -216,8 +216,6 @@ static __isl_give isl_printer *opencl_pr
 	p = isl_printer_print_str(p, macros);
 	p = isl_printer_end_line(p);
 
-	p = isl_ast_op_type_print_macro(isl_ast_op_max, p);
-
 	return p;
 }
 
@@ -264,6 +262,11 @@ static __isl_give isl_printer *allocate_
 {
 	int need_lower_bound;
 
+	need_lower_bound = !is_array_positive_size_guard_trivial(array);
+	if (need_lower_bound)
+		p = ppcg_print_macro(isl_ast_op_max, p);
+
+	p = ppcg_ast_expr_print_macros(array->bound_expr, p);
 	p = ppcg_start_block(p);
 
 	p = isl_printer_start_line(p);
@@ -272,9 +275,9 @@ static __isl_give isl_printer *allocate_
 	p = isl_printer_print_str(p, " = clCreateBuffer(context, ");
 	p = isl_printer_print_str(p, "CL_MEM_READ_WRITE, ");
 
-	need_lower_bound = !is_array_positive_size_guard_trivial(array);
 	if (need_lower_bound) {
-		p = isl_printer_print_str(p, "max(sizeof(");
+		p = isl_printer_print_str(p, ppcg_max);
+		p = isl_printer_print_str(p, "(sizeof(");
 		p = isl_printer_print_str(p, array->type);
 		p = isl_printer_print_str(p, "), ");
 	}
@@ -313,6 +316,124 @@ static __isl_give isl_printer *opencl_al
 	return p;
 }
 
+/* Free the device array corresponding to "array".
+ */
+static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
+	struct gpu_array_info *array)
+{
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn("
+					"clReleaseMemObject(dev_");
+	p = isl_printer_print_str(p, array->name);
+	p = isl_printer_print_str(p, "));");
+	p = isl_printer_end_line(p);
+
+	return p;
+}
+
+/* Free the accessed device arrays.
+ */
+static __isl_give isl_printer *opencl_release_device_arrays(
+	__isl_take isl_printer *p, struct gpu_prog *prog)
+{
+	int i;
+
+	for (i = 0; i < prog->n_array; ++i) {
+		struct gpu_array_info *array = &prog->array[i];
+		if (!gpu_array_requires_device_allocation(array))
+			continue;
+
+		p = release_device_array(p, array);
+	}
+	return p;
+}
+
+/* Create an OpenCL device, context, command queue and build the kernel.
+ * input is the name of the input file provided to ppcg.
+ */
+static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
+	const char *input, struct opencl_info *info)
+{
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cl_device_id device;");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cl_context context;");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cl_program program;");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cl_command_queue queue;");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "cl_int err;");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "device = opencl_create_device(");
+	p = isl_printer_print_int(p, info->options->opencl_use_gpu);
+	p = isl_printer_print_str(p, ");");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1, "
+		"&device, NULL, NULL, &err);");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn(err);");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "queue = clCreateCommandQueue"
+					"(context, device, 0, &err);");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn(err);");
+	p = isl_printer_end_line(p);
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "program = ");
+
+	if (info->options->opencl_embed_kernel_code) {
+		p = isl_printer_print_str(p, "opencl_build_program_from_string("
+						"context, device, kernel_code, "
+						"sizeof(kernel_code), \"");
+	} else {
+		p = isl_printer_print_str(p, "opencl_build_program_from_file("
+						"context, device, \"");
+		p = isl_printer_print_str(p, info->kernel_c_name);
+		p = isl_printer_print_str(p, "\", \"");
+	}
+
+	if (info->options->opencl_compiler_options)
+		p = isl_printer_print_str(p,
+					info->options->opencl_compiler_options);
+
+	p = isl_printer_print_str(p, "\");");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_end_line(p);
+
+	return p;
+}
+
+static __isl_give isl_printer *opencl_release_cl_objects(
+	__isl_take isl_printer *p, struct opencl_info *info)
+{
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue"
+					"(queue));");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram"
+					"(program));");
+	p = isl_printer_end_line(p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext"
+					"(context));");
+	p = isl_printer_end_line(p);
+
+	return p;
+}
+
 /* Print a call to the OpenCL clSetKernelArg() function which sets
  * the arguments of the kernel.  arg_name and arg_index are the name and the
  * index of the kernel argument.  The index of the leftmost argument of
@@ -761,6 +882,26 @@ static __isl_give isl_printer *opencl_en
 	return p;
 }
 
+/* Macro definitions for ppcg_min and ppcg_max for use
+ * in OpenCL kernel code.
+ * These macro definitions essentially call the corresponding
+ * OpenCL macros/functions, but first ensure that the two arguments
+ * have the same type, since the OpenCL versions are only defined
+ * in case those arguments have the same type.
+ */
+static const char *opencl_min =
+	"(x,y)    min((__typeof__(x + y)) x, (__typeof__(x + y)) y)";
+static const char *opencl_max =
+	"(x,y)    max((__typeof__(x + y)) x, (__typeof__(x + y)) y)";
+
+/* Set the macro definitions for ppcg_min and ppcg_max to
+ * OpenCL specific versions.
+ */
+static __isl_give isl_printer *set_opencl_macros(__isl_take isl_printer *p)
+{
+	return ppcg_set_macros(p, opencl_min, opencl_max);
+}
+
 static __isl_give isl_printer *opencl_print_kernel(struct gpu_prog *prog,
 	struct ppcg_kernel *kernel, __isl_take isl_printer *p)
 {
@@ -779,8 +920,9 @@ static __isl_give isl_printer *opencl_pr
 	p = opencl_print_kernel_iterators(p, kernel);
 	p = opencl_print_kernel_vars(p, kernel);
 	p = isl_printer_end_line(p);
-	p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
-	p = ppcg_print_macros(p, kernel->tree);
+	p = ppcg_set_macro_names(p);
+	p = set_opencl_macros(p);
+	p = gpu_print_macros(p, kernel->tree);
 	p = isl_ast_node_print(kernel->tree, p, print_options);
 	p = isl_printer_indent(p, -4);
 	p = isl_printer_start_line(p);
@@ -813,24 +955,27 @@ static __isl_give isl_printer *opencl_pr
 	__isl_take isl_printer *p, struct ppcg_kernel *kernel, int i)
 {
 	int grid_dim, block_dim;
-	isl_pw_aff *bound_grid;
+	isl_ast_expr *grid_size_expr;
+	isl_ast_expr *bound_grid;
 
 	grid_dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 	block_dim = kernel->n_block;
 
 	if (i < min(grid_dim, block_dim)) {
-		bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
+		grid_size_expr = kernel->grid_size_expr;
+		bound_grid = isl_ast_expr_get_op_arg(grid_size_expr, 1 + i);
 		p = isl_printer_print_str(p, "(");
-		p = isl_printer_print_pw_aff(p, bound_grid);
+		p = isl_printer_print_ast_expr(p, bound_grid);
 		p = isl_printer_print_str(p, ") * ");
 		p = isl_printer_print_int(p, kernel->block_dim[i]);
-		isl_pw_aff_free(bound_grid);
-	} else if (i >= grid_dim)
+		isl_ast_expr_free(bound_grid);
+	} else if (i >= grid_dim) {
 		p = isl_printer_print_int(p, kernel->block_dim[i]);
-	else {
-		bound_grid = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
-		p = isl_printer_print_pw_aff(p, bound_grid);
-		isl_pw_aff_free(bound_grid);
+	} else {
+		grid_size_expr = kernel->grid_size_expr;
+		bound_grid = isl_ast_expr_get_op_arg(grid_size_expr, 1 + i);
+		p = isl_printer_print_ast_expr(p, bound_grid);
+		isl_ast_expr_free(bound_grid);
 	}
 
 	return p;
@@ -907,16 +1052,50 @@ static __isl_give isl_printer *copy_arra
 	return p;
 }
 
-/* Print a statement for copying an array to or from the device.
- * The statement identifier is called "to_device_<array name>" or
- * "from_device_<array name>" and its user pointer points
- * to the gpu_array_info of the array that needs to be copied.
+/* Print code for initializing the device for execution of the transformed
+ * code.  This includes declaring locally defined variables as well as
+ * declaring and allocating the required copies of arrays on the device.
+ */
+static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
+	struct gpu_prog *prog, struct opencl_info *opencl)
+{
+	p = opencl_print_host_macros(p);
+
+	p = gpu_print_local_declarations(p, prog);
+	p = opencl_declare_device_arrays(p, prog);
+	p = opencl_setup(p, opencl->input, opencl);
+	p = opencl_allocate_device_arrays(p, prog);
+
+	return p;
+}
+
+/* Print code for clearing the device after execution of the transformed code.
+ * In particular, free the memory that was allocated on the device.
+ */
+static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
+	struct gpu_prog *prog, struct opencl_info *opencl)
+{
+	p = opencl_release_device_arrays(p, prog);
+	p = opencl_release_cl_objects(p, opencl);
+
+	return p;
+}
+
+/* Print a statement for copying an array to or from the device,
+ * or for initializing or clearing the device.
+ * The statement identifier of a copying node is called
+ * "to_device_<array name>" or "from_device_<array name>" and
+ * its user pointer points to the gpu_array_info of the array
+ * that needs to be copied.
+ * The node for initializing the device is called "init_device".
+ * The node for clearing the device is called "clear_device".
  *
- * Extract the array from the identifier and call
- * copy_array_to_device or copy_array_from_device.
+ * Extract the array (if any) from the identifier and call
+ * init_device, clear_device, copy_array_to_device or copy_array_from_device.
  */
-static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
-	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
+static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
+	__isl_keep isl_ast_node *node, struct gpu_prog *prog,
+	struct opencl_info *opencl)
 {
 	isl_ast_expr *expr, *arg;
 	isl_id *id;
@@ -933,7 +1112,11 @@ static __isl_give isl_printer *print_to_
 	isl_ast_expr_free(expr);
 
 	if (!name)
-		array = NULL;
+		return isl_printer_free(p);
+	if (!strcmp(name, "init_device"))
+		return init_device(p, prog, opencl);
+	if (!strcmp(name, "clear_device"))
+		return clear_device(p, prog, opencl);
 	if (!array)
 		return isl_printer_free(p);
 
@@ -945,11 +1128,12 @@ static __isl_give isl_printer *print_to_
 
 /* Print the user statement of the host code to "p".
  *
- * The host code may contain original user statements, kernel launches and
- * statements that copy data to/from the device.
+ * The host code may contain original user statements, kernel launches,
+ * statements that copy data to/from the device and statements
+ * that initialize or clear the device.
  * The original user statements and the kernel launches have
- * an associated annotation, while the data copy statements do not.
- * The latter are handled by print_to_from_device.
+ * an associated annotation, while the other statements do not.
+ * The latter are handled by print_device_node.
  * The annotation on the user statements is called "user".
  *
  * In case of a kernel launch, print a block of statements that
@@ -989,7 +1173,7 @@ static __isl_give isl_printer *opencl_pr
 
 	id = isl_ast_node_get_annotation(node);
 	if (!id)
-		return print_to_from_device(p, node, data->prog);
+		return print_device_node(p, node, data->prog, data->opencl);
 
 	is_user = !strcmp(isl_id_get_name(id), "user");
 	kernel = is_user ? NULL : isl_id_get_user(id);
@@ -1092,130 +1276,12 @@ static __isl_give isl_printer *opencl_pr
 	print_options = isl_ast_print_options_set_print_user(print_options,
 				&opencl_print_host_user, &data);
 
-	p = ppcg_print_macros(p, tree);
+	p = gpu_print_macros(p, tree);
 	p = isl_ast_node_print(tree, p, print_options);
 
 	return p;
 }
 
-/* Create an OpenCL device, context, command queue and build the kernel.
- * input is the name of the input file provided to ppcg.
- */
-static __isl_give isl_printer *opencl_setup(__isl_take isl_printer *p,
-	const char *input, struct opencl_info *info)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cl_device_id device;");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cl_context context;");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cl_program program;");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cl_command_queue queue;");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "cl_int err;");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "device = opencl_create_device(");
-	p = isl_printer_print_int(p, info->options->opencl_use_gpu);
-	p = isl_printer_print_str(p, ");");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "context = clCreateContext(NULL, 1, "
-		"&device, NULL, NULL, &err);");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn(err);");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "queue = clCreateCommandQueue"
-					"(context, device, 0, &err);");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn(err);");
-	p = isl_printer_end_line(p);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "program = ");
-
-	if (info->options->opencl_embed_kernel_code) {
-		p = isl_printer_print_str(p, "opencl_build_program_from_string("
-						"context, device, kernel_code, "
-						"sizeof(kernel_code), \"");
-	} else {
-		p = isl_printer_print_str(p, "opencl_build_program_from_file("
-						"context, device, \"");
-		p = isl_printer_print_str(p, info->kernel_c_name);
-		p = isl_printer_print_str(p, "\", \"");
-	}
-
-	if (info->options->opencl_compiler_options)
-		p = isl_printer_print_str(p,
-					info->options->opencl_compiler_options);
-
-	p = isl_printer_print_str(p, "\");");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_end_line(p);
-
-	return p;
-}
-
-static __isl_give isl_printer *opencl_release_cl_objects(
-	__isl_take isl_printer *p, struct opencl_info *info)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseCommandQueue"
-					"(queue));");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseProgram"
-					"(program));");
-	p = isl_printer_end_line(p);
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn(clReleaseContext"
-					"(context));");
-	p = isl_printer_end_line(p);
-
-	return p;
-}
-
-/* Free the device array corresponding to "array"
- */
-static __isl_give isl_printer *release_device_array(__isl_take isl_printer *p,
-	struct gpu_array_info *array)
-{
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, "openclCheckReturn("
-					"clReleaseMemObject(dev_");
-	p = isl_printer_print_str(p, array->name);
-	p = isl_printer_print_str(p, "));");
-	p = isl_printer_end_line(p);
-
-	return p;
-}
-
-/* Free the accessed device arrays.
- */
-static __isl_give isl_printer *opencl_release_device_arrays(
-	__isl_take isl_printer *p, struct gpu_prog *prog)
-{
-	int i;
-
-	for (i = 0; i < prog->n_array; ++i) {
-		struct gpu_array_info *array = &prog->array[i];
-		if (!gpu_array_requires_device_allocation(array))
-			continue;
-
-		p = release_device_array(p, array);
-	}
-	return p;
-}
-
 /* Given a gpu_prog "prog" and the corresponding transformed AST
  * "tree", print the entire OpenCL code to "p".
  */
@@ -1237,22 +1303,8 @@ static __isl_give isl_printer *print_ope
 	if (!opencl->kprinter)
 		return isl_printer_free(p);
 
-	p = ppcg_start_block(p);
-
-	p = opencl_print_host_macros(p);
-
-	p = gpu_print_local_declarations(p, prog);
-	p = opencl_declare_device_arrays(p, prog);
-	p = opencl_setup(p, opencl->input, opencl);
-	p = opencl_allocate_device_arrays(p, prog);
-
 	p = opencl_print_host_code(p, prog, tree, opencl);
 
-	p = opencl_release_device_arrays(p, prog);
-	p = opencl_release_cl_objects(p, opencl);
-
-	p = ppcg_end_block(p);
-
 	return p;
 }
 

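For reference, the host-side boilerplate that opencl_setup prints as part
of init_device amounts to the following generated C, assuming
--opencl-use-gpu is set and the kernel code is not embedded; the quoted
placeholders stand for info->kernel_c_name and the
--opencl-compiler-options string:

cl_device_id device;
cl_context context;
cl_program program;
cl_command_queue queue;
cl_int err;
device = opencl_create_device(1);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
openclCheckReturn(err);
queue = clCreateCommandQueue(context, device, 0, &err);
openclCheckReturn(err);
program = opencl_build_program_from_file(context, device,
	"<kernel_c_name>", "<compiler options>");

The clear_device counterpart releases these objects again through
opencl_release_cl_objects after the device arrays have been freed.
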
Modified: polly/trunk/lib/External/ppcg/opencl_test.sh.in
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/opencl_test.sh.in?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/opencl_test.sh.in (original)
+++ polly/trunk/lib/External/ppcg/opencl_test.sh.in Thu Jul 20 08:48:13 2017
@@ -54,6 +54,25 @@ run_tests () {
 run_tests default
 run_tests embed --opencl-embed-kernel-code
 
+for i in $srcdir/examples/*.c; do
+	echo $i
+	name=`basename $i`
+	name="${name%.c}"
+	exe_ref="${OUTDIR}/$name.ref$EXEEXT"
+	gen_ocl="${OUTDIR}/$name.ppcg.c"
+	exe_ocl="${OUTDIR}/$name.ppcg$EXEEXT"
+	output_ref="${OUTDIR}/$name.ref.out"
+	output_ocl="${OUTDIR}/$name.ppcg.out"
+	$CC $CFLAGS $i -o $exe_ref || exit
+	./ppcg$EXEEXT --target=opencl --opencl-no-use-gpu $i -o "$gen_ocl" || \
+		exit
+	$CC $CFLAGS -I "$srcdir" "$srcdir/ocl_utilities.c" -lOpenCL \
+		"$gen_ocl" -o "$exe_ocl" || exit
+	$exe_ref > $output_ref || exit
+	$exe_ocl > $output_ocl || exit
+	cmp $output_ref $output_ocl || exit
+done
+
 if [ $keep = "no" ]; then
 	rm -r "${OUTDIR}"
 fi

Modified: polly/trunk/lib/External/ppcg/polybench_test.sh.in
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/polybench_test.sh.in?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/polybench_test.sh.in (original)
+++ polly/trunk/lib/External/ppcg/polybench_test.sh.in Thu Jul 20 08:48:13 2017
@@ -88,8 +88,8 @@ run_tests () {
 	done
 }
 
-run_tests ppcg --target=c
-run_tests ppcg_live "--target=c --no-live-range-reordering"
+run_tests ppcg "--target=c --tile"
+run_tests ppcg_live "--target=c --no-live-range-reordering --tile"
 
 # Test OpenMP code, if compiler supports openmp
 if [ $HAVE_OPENMP = "yes" ]; then

Modified: polly/trunk/lib/External/ppcg/ppcg.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/ppcg.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/ppcg.c (original)
+++ polly/trunk/lib/External/ppcg/ppcg.c Thu Jul 20 08:48:13 2017
@@ -16,9 +16,17 @@
 #include <stdlib.h>
 #include <string.h>
 #include <isl/ctx.h>
+#include <isl/id.h>
+#include <isl/val.h>
+#include <isl/set.h>
+#include <isl/union_set.h>
+#include <isl/union_map.h>
+#include <isl/aff.h>
 #include <isl/flow.h>
 #include <isl/options.h>
 #include <isl/schedule.h>
+#include <isl/ast.h>
+#include <isl/id_to_ast_expr.h>
 #include <isl/ast_build.h>
 #include <isl/schedule.h>
 #include <pet.h>
@@ -29,7 +37,6 @@
 #include "cpu.h"
 
 struct options {
-	struct isl_options *isl;
 	struct pet_options *pet;
 	struct ppcg_options *ppcg;
 	char *input;
@@ -43,7 +50,6 @@ static void print_version(void)
 }
 
 ISL_ARGS_START(struct options, options_args)
-ISL_ARG_CHILD(struct options, isl, "isl", &isl_options_args, "isl options")
 ISL_ARG_CHILD(struct options, pet, "pet", &pet_options_args, "pet options")
 ISL_ARG_CHILD(struct options, ppcg, NULL, &ppcg_options_args, "ppcg options")
 ISL_ARG_STR(struct options, output, 'o', NULL,
@@ -99,8 +105,6 @@ int ppcg_scop_any_hidden_declarations(st
 	if (!scop)
 		return 0;
 
-        // This is a pet feature not available in Polly.
-        return 0;
 	for (i = 0; i < scop->pet->n_array; ++i)
 		if (scop->pet->arrays[i]->declared &&
 		    !scop->pet->arrays[i]->exposed)
@@ -186,7 +190,6 @@ __isl_give isl_id_list *ppcg_scop_genera
 	int n, const char *prefix)
 {
 	int i;
-	char name[10];
 	isl_ctx *ctx;
 	isl_id_list *names;
 
@@ -338,7 +341,7 @@ static __isl_give isl_union_map *project
  *
  *	{ [S[i,j] -> R_1[]] -> S[i,j]; [S[i,j] -> R_2[]] -> S[i,j] }
  */
-void compute_tagger(struct ppcg_scop *ps)
+static void compute_tagger(struct ppcg_scop *ps)
 {
 	isl_union_map *tagged;
 	isl_union_pw_multi_aff *tagger;
@@ -362,12 +365,25 @@ void compute_tagger(struct ppcg_scop *ps
  *
  * We compute the "dependence" of any "kill" (an explicit kill
  * or a must write) on any may write.
- * The may writes with a "depending" kill are definitely killed.
+ * The elements accessed by a may write are definitely killed if
+ * a "depending" kill also accesses those elements.
  * The remaining may writes can potentially be live out.
+ *
+ * The result of the dependence analysis is
+ *
+ *	{ IW -> [IK -> A] }
+ *
+ * with IW the instance of the write statement, IK the instance of the
+ * kill statement and A the element that was killed.
+ * The range factor of the range of this relation is
+ *
+ *	{ IW -> A }
+ *
+ * containing all such pairs for which there is a kill statement instance,
+ * i.e., all pairs that have been killed.
  */
 static void compute_live_out(struct ppcg_scop *ps)
 {
-	isl_union_pw_multi_aff *tagger;
 	isl_schedule *schedule;
 	isl_union_map *kills;
 	isl_union_map *exposed;
@@ -375,22 +391,21 @@ static void compute_live_out(struct ppcg
 	isl_union_access_info *access;
 	isl_union_flow *flow;
 
-	tagger = isl_union_pw_multi_aff_copy(ps->tagger);
 	schedule = isl_schedule_copy(ps->schedule);
-	schedule = isl_schedule_pullback_union_pw_multi_aff(schedule, tagger);
-	kills = isl_union_map_union(isl_union_map_copy(ps->tagged_must_writes),
-				    isl_union_map_copy(ps->tagged_must_kills));
+	kills = isl_union_map_union(isl_union_map_copy(ps->must_writes),
+				    isl_union_map_copy(ps->must_kills));
 	access = isl_union_access_info_from_sink(kills);
 	access = isl_union_access_info_set_may_source(access,
-				isl_union_map_copy(ps->tagged_may_writes));
+				    isl_union_map_copy(ps->may_writes));
 	access = isl_union_access_info_set_schedule(access, schedule);
 	flow = isl_union_access_info_compute_flow(access);
-	covering = isl_union_flow_get_may_dependence(flow);
+	covering = isl_union_flow_get_full_may_dependence(flow);
 	isl_union_flow_free(flow);
-	exposed = isl_union_map_copy(ps->tagged_may_writes);
-	exposed = isl_union_map_subtract_domain(exposed,
-				isl_union_map_domain(covering));
-	ps->live_out = project_out_tags(exposed);
+
+	covering = isl_union_map_range_factor_range(covering);
+	exposed = isl_union_map_copy(ps->may_writes);
+	exposed = isl_union_map_subtract(exposed, covering);
+	ps->live_out = exposed;
 }
 
 /* Compute the tagged flow dependences and the live_in accesses and store
@@ -696,7 +711,7 @@ static void compute_flow_dep(struct ppcg
  * set of order dependences and a set of external false dependences
  * in compute_live_range_reordering_dependences.
  */
-void compute_dependences(struct ppcg_scop *scop)
+static void compute_dependences(struct ppcg_scop *scop)
 {
 	isl_union_map *may_source;
 	isl_union_access_info *access;
@@ -815,7 +830,7 @@ static __isl_give isl_set *set_intersect
 	return set;
 }
 
-void *ppcg_scop_free(struct ppcg_scop *ps)
+static void *ppcg_scop_free(struct ppcg_scop *ps)
 {
 	if (!ps)
 		return NULL;
@@ -832,6 +847,7 @@ void *ppcg_scop_free(struct ppcg_scop *p
 	isl_union_map_free(ps->must_writes);
 	isl_union_map_free(ps->live_out);
 	isl_union_map_free(ps->tagged_must_kills);
+	isl_union_map_free(ps->must_kills);
 	isl_union_map_free(ps->tagged_dep_flow);
 	isl_union_map_free(ps->dep_flow);
 	isl_union_map_free(ps->dep_false);
@@ -882,13 +898,14 @@ static struct ppcg_scop *ppcg_scop_from_
 	}
 	ps->domain = collect_non_kill_domains(scop);
 	ps->call = collect_call_domains(scop);
-	ps->tagged_reads = pet_scop_collect_tagged_may_reads(scop);
-	ps->reads = pet_scop_collect_may_reads(scop);
-	ps->tagged_may_writes = pet_scop_collect_tagged_may_writes(scop);
-	ps->may_writes = pet_scop_collect_may_writes(scop);
-	ps->tagged_must_writes = pet_scop_collect_tagged_must_writes(scop);
-	ps->must_writes = pet_scop_collect_must_writes(scop);
-	ps->tagged_must_kills = pet_scop_collect_tagged_must_kills(scop);
+	ps->tagged_reads = pet_scop_get_tagged_may_reads(scop);
+	ps->reads = pet_scop_get_may_reads(scop);
+	ps->tagged_may_writes = pet_scop_get_tagged_may_writes(scop);
+	ps->may_writes = pet_scop_get_may_writes(scop);
+	ps->tagged_must_writes = pet_scop_get_tagged_must_writes(scop);
+	ps->must_writes = pet_scop_get_must_writes(scop);
+	ps->tagged_must_kills = pet_scop_get_tagged_must_kills(scop);
+	ps->must_kills = pet_scop_get_must_kills(scop);
 	ps->schedule = isl_schedule_copy(scop->schedule);
 	ps->pet = scop;
 	ps->independence = isl_union_map_empty(isl_set_get_space(ps->context));
@@ -902,7 +919,7 @@ static struct ppcg_scop *ppcg_scop_from_
 
 	if (!ps->context || !ps->domain || !ps->call || !ps->reads ||
 	    !ps->may_writes || !ps->must_writes || !ps->tagged_must_kills ||
-	    !ps->schedule || !ps->independence || !ps->names)
+	    !ps->must_kills || !ps->schedule || !ps->independence || !ps->names)
 		return ppcg_scop_free(ps);
 
 	return ps;
@@ -1009,7 +1026,6 @@ static int check_options(isl_ctx *ctx)
 	return 0;
 }
 
-#if 0
 int main(int argc, char **argv)
 {
 	int r;
@@ -1020,8 +1036,12 @@ int main(int argc, char **argv)
 	assert(options);
 
 	ctx = isl_ctx_alloc_with_options(&options_args, options);
-	isl_options_set_schedule_outer_coincidence(ctx, 1);
+	ppcg_options_set_target_defaults(options->ppcg);
+	isl_options_set_ast_build_detect_min_max(ctx, 1);
+	isl_options_set_ast_print_macro_once(ctx, 1);
+	isl_options_set_schedule_whole_component(ctx, 0);
 	isl_options_set_schedule_maximize_band_depth(ctx, 1);
+	isl_options_set_schedule_maximize_coincidence(ctx, 1);
 	pet_options_set_encapsulate_dynamic_control(ctx, 1);
 	argc = options_parse(options, argc, argv, ISL_ARG_ALL);
 
@@ -1040,4 +1060,3 @@ int main(int argc, char **argv)
 
 	return r;
 }
-#endif

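The rewritten compute_live_out is easiest to follow on a small
hypothetical scop: W[i] may-writes A[i] for 0 <= i <= 9 and a later K[]
must-kills A[5].  In the notation of the comment above (illustrative
values, not the output of a real run):

    full may dependence:    { W[5] -> [K[] -> A[5]] }
    range factor of range:  { W[5] -> A[5] }
    may writes:             { W[i] -> A[i] : 0 <= i <= 9 }
    live out:               { W[i] -> A[i] : 0 <= i <= 9 and i != 5 }

Subtracting per-element pairs instead of entire write domains is the point
of the change: a write instance with a depending kill only loses the
elements that were actually killed.
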
Modified: polly/trunk/lib/External/ppcg/ppcg.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/ppcg.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/ppcg.h (original)
+++ polly/trunk/lib/External/ppcg/ppcg.h Thu Jul 20 08:48:13 2017
@@ -37,8 +37,10 @@ int ppcg_extract_base_name(char *name, c
  *	to a reference identifier
  * "live_out" contains the potential write accesses that are potentially
  *	not killed by any kills or any other writes.
- * "tagged_must_kills" contains all definite kill accesses with
- *	a reference identifier in the domain.
+ * "must_kills" contains all definite kill accesses.
+ * "tagged_must_kills" is the same as "must_kills", except that the domain
+ *	is a wrapped relation mapping an iteration domain
+ *	to a reference identifier.
  *
  * "tagger" maps tagged iteration domains to the corresponding untagged
  *	iteration domain.
@@ -87,6 +89,7 @@ struct ppcg_scop {
 	isl_union_map *must_writes;
 	isl_union_map *live_out;
 	isl_union_map *tagged_must_kills;
+	isl_union_map *must_kills;
 
 	isl_union_pw_multi_aff *tagger;
 
@@ -114,8 +117,8 @@ int ppcg_transform(isl_ctx *ctx, const c
 	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p,
 		struct ppcg_scop *scop, void *user), void *user);
 
-void compute_tagger(struct ppcg_scop *ps);
-void compute_dependences(struct ppcg_scop *scop);
-void *ppcg_scop_free(struct ppcg_scop *ps);
+__isl_give isl_schedule *ppcg_compute_schedule(
+	__isl_take isl_schedule_constraints *sc,
+	__isl_keep isl_schedule *schedule, struct ppcg_options *options);
 
 #endif

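ppcg_compute_schedule is only declared here; a sketch of the expected
calling convention, with a hypothetical helper and only flow-validity
constraints shown (real callers typically also set coincidence and
proximity constraints):

static __isl_give isl_schedule *reschedule(struct ppcg_scop *ps,
	struct ppcg_options *options)
{
	isl_schedule_constraints *sc;

	/* Schedule the statement instances of "ps", requiring that
	 * flow dependences are respected.
	 */
	sc = isl_schedule_constraints_on_domain(
		isl_union_set_copy(ps->domain));
	sc = isl_schedule_constraints_set_validity(sc,
		isl_union_map_copy(ps->dep_flow));
	return ppcg_compute_schedule(sc, ps->schedule, options);
}
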
Modified: polly/trunk/lib/External/ppcg/ppcg_options.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/ppcg_options.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/ppcg_options.c (original)
+++ polly/trunk/lib/External/ppcg/ppcg_options.c Thu Jul 20 08:48:13 2017
@@ -17,6 +17,36 @@ static struct isl_arg_choice target[] =
 	{0}
 };
 
+/* Set defaults that depend on the target.
+ * In particular, set --schedule-outer-coincidence iff target is a GPU.
+ */
+void ppcg_options_set_target_defaults(struct ppcg_options *options)
+{
+	char *argv[2] = { NULL };
+
+	argv[0] = "ppcg_options_set_target_defaults";
+	if (options->target == PPCG_TARGET_C)
+		argv[1] = "--no-schedule-outer-coincidence";
+	else
+		argv[1] = "--schedule-outer-coincidence";
+
+	isl_options_parse(options->isl, 2, argv, ISL_ARG_ALL);
+}
+
+/* Callback that is called whenever the "target" option is set (to "val").
+ * The callback is called after target has been updated.
+ *
+ * Call ppcg_options_set_target_defaults to reset the target-dependent options.
+ */
+static int set_target(void *opt, unsigned val)
+{
+	struct ppcg_options *options = opt;
+
+	ppcg_options_set_target_defaults(options);
+
+	return 0;
+}
+
 ISL_ARGS_START(struct ppcg_debug_options, ppcg_debug_options_args)
 ISL_ARG_BOOL(struct ppcg_debug_options, dump_schedule_constraints, 0,
 	"dump-schedule-constraints", 0, "dump schedule constraints")
@@ -46,10 +76,14 @@ ISL_ARG_BOOL(struct ppcg_options, opencl
 ISL_ARGS_END
 
 ISL_ARGS_START(struct ppcg_options, ppcg_options_args)
+ISL_ARG_CHILD(struct ppcg_options, isl, "isl", &isl_options_args, "isl options")
 ISL_ARG_CHILD(struct ppcg_options, debug, NULL, &ppcg_debug_options_args,
 	"debugging options")
+ISL_ARG_BOOL(struct ppcg_options, group_chains, 0, "group-chains", 1,
+	"group chains of interdependent statements that are executed "
+	"consecutively in the original schedule before scheduling")
 ISL_ARG_BOOL(struct ppcg_options, reschedule, 0, "reschedule", 1,
-	"replace original schedule by isl computed schedule (except C target)")
+	"replace original schedule by isl computed schedule")
 ISL_ARG_BOOL(struct ppcg_options, scale_tile_loops, 0,
 	"scale-tile-loops", 1, NULL)
 ISL_ARG_BOOL(struct ppcg_options, wrap, 0, "wrap", 1, NULL)
@@ -62,22 +96,37 @@ ISL_ARG_STR(struct ppcg_options, ctx, 0,
 ISL_ARG_BOOL(struct ppcg_options, non_negative_parameters, 0,
 	"assume-non-negative-parameters", 0,
 	"assume all parameters are non-negative)")
+ISL_ARG_BOOL(struct ppcg_options, tile, 0, "tile", 0,
+	"perform tiling (C target)")
 ISL_ARG_INT(struct ppcg_options, tile_size, 'S', "tile-size", "size", 32, NULL)
+ISL_ARG_BOOL(struct ppcg_options, isolate_full_tiles, 0, "isolate-full-tiles",
+	0, "isolate full tiles from partial tiles (hybrid tiling)")
 ISL_ARG_STR(struct ppcg_options, sizes, 0, "sizes", "sizes", NULL,
 	"Per kernel tile, grid and block sizes")
 ISL_ARG_INT(struct ppcg_options, max_shared_memory, 0,
 	"max-shared-memory", "size", 8192, "maximal amount of shared memory")
 ISL_ARG_BOOL(struct ppcg_options, openmp, 0, "openmp", 0,
 	"Generate OpenMP macros (only for C target)")
-ISL_ARG_CHOICE(struct ppcg_options, target, 0, "target", target,
-	PPCG_TARGET_CUDA, "the target to generate code for")
+ISL_ARG_USER_OPT_CHOICE(struct ppcg_options, target, 0, "target", target,
+	&set_target, PPCG_TARGET_CUDA, PPCG_TARGET_CUDA,
+	"the target to generate code for")
 ISL_ARG_BOOL(struct ppcg_options, linearize_device_arrays, 0,
 	"linearize-device-arrays", 1,
 	"linearize all device arrays, even those of fixed size")
+ISL_ARG_BOOL(struct ppcg_options, allow_gnu_extensions, 0,
+	"allow-gnu-extensions", 1,
+	"allow the use of GNU extensions in generated code")
 ISL_ARG_BOOL(struct ppcg_options, live_range_reordering, 0,
 	"live-range-reordering", 1,
 	"allow successive live ranges on the same memory element "
 	"to be reordered")
+ISL_ARG_BOOL(struct ppcg_options, hybrid, 0, "hybrid", 0,
+	"apply hybrid tiling whenever a suitable input pattern is found "
+	"(GPU targets)")
+ISL_ARG_BOOL(struct ppcg_options, unroll_copy_shared, 0, "unroll-copy-shared",
+	0, "unroll code for copying to/from shared memory")
+ISL_ARG_BOOL(struct ppcg_options, unroll_gpu_tile, 0, "unroll-gpu-tile", 0,
+	"unroll code inside tile on GPU targets")
 ISL_ARG_GROUP("opencl", &ppcg_opencl_options_args, "OpenCL options")
 ISL_ARG_STR(struct ppcg_options, save_schedule_file, 0, "save-schedule",
 	"file", NULL, "save isl computed schedule to <file>")

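The set_target callback makes the isl scheduler option track the target
automatically.  Spelled out with the ctx-based isl API, the effect is
equivalent to the following (apply_target_default is a hypothetical
helper; "ctx" must be the context that owns "options"):

static void apply_target_default(isl_ctx *ctx, struct ppcg_options *options)
{
	/* GPU targets want coincident outer bands for parallelism;
	 * the C target does not force this.
	 */
	int coincidence = options->target != PPCG_TARGET_C;

	isl_options_set_schedule_outer_coincidence(ctx, coincidence);
}
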
Modified: polly/trunk/lib/External/ppcg/ppcg_options.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/ppcg_options.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/ppcg_options.h (original)
+++ polly/trunk/lib/External/ppcg/ppcg_options.h Thu Jul 20 08:48:13 2017
@@ -2,6 +2,7 @@
 #define PPCG_OPTIONS_H
 
 #include <isl/arg.h>
+#include <isl/options.h>
 
 struct ppcg_debug_options {
 	int dump_schedule_constraints;
@@ -12,8 +13,12 @@ struct ppcg_debug_options {
 };
 
 struct ppcg_options {
+	struct isl_options *isl;
 	struct ppcg_debug_options *debug;
 
+	/* Group chains of consecutive statements before scheduling. */
+	int group_chains;
+
 	/* Use isl to compute a schedule replacing the original schedule. */
 	int reschedule;
 	int scale_tile_loops;
@@ -24,8 +29,13 @@ struct ppcg_options {
 	char *ctx;
 	char *sizes;
 
+	/* Perform tiling (C target). */
+	int tile;
 	int tile_size;
 
+	/* Isolate full tiles from partial tiles. */
+	int isolate_full_tiles;
+
 	/* Take advantage of private memory. */
 	int use_private_memory;
 
@@ -44,9 +54,20 @@ struct ppcg_options {
 	/* Linearize all device arrays. */
 	int linearize_device_arrays;
 
+	/* Allow the use of GNU extensions in generated code. */
+	int allow_gnu_extensions;
+
 	/* Allow live range to be reordered. */
 	int live_range_reordering;
 
+	/* Allow hybrid tiling whenever a suitable input pattern is found. */
+	int hybrid;
+
+	/* Unroll the code for copying to/from shared memory. */
+	int unroll_copy_shared;
+	/* Unroll code inside tile on GPU targets. */
+	int unroll_gpu_tile;
+
 	/* Options to pass to the OpenCL compiler.  */
 	char *opencl_compiler_options;
 	/* Prefer GPU device over CPU. */
@@ -74,4 +95,6 @@ ISL_ARG_DECL(ppcg_options, struct ppcg_o
 #define		PPCG_TARGET_CUDA	1
 #define		PPCG_TARGET_OPENCL      2
 
+void ppcg_options_set_target_defaults(struct ppcg_options *options);
+
 #endif

Modified: polly/trunk/lib/External/ppcg/print.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/print.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/print.c (original)
+++ polly/trunk/lib/External/ppcg/print.c Thu Jul 20 08:48:13 2017
@@ -11,6 +11,7 @@
 #include <isl/ast_build.h>
 
 #include "print.h"
+#include "util.h"
 
 __isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p)
 {
@@ -30,21 +31,276 @@ __isl_give isl_printer *ppcg_end_block(_
 	return p;
 }
 
-static int print_macro(enum isl_ast_op_type type, void *user)
+/* Names of notes that keep track of whether min/max
+ * macro definitions have already been printed.
+ */
+static const char *ppcg_max_printed = "ppcg_max_printed";
+static const char *ppcg_min_printed = "ppcg_min_printed";
+
+/* Has the macro definition corresponding to "note_name" been printed
+ * to "p" before?
+ * That is, does "p" have an associated "note_name" note?
+ */
+static isl_bool printed_before(__isl_keep isl_printer *p, const char *note_name)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+	isl_bool printed;
+
+	if (!p)
+		return isl_bool_error;
+
+	ctx = isl_printer_get_ctx(p);
+	id = isl_id_alloc(ctx, note_name, NULL);
+	printed = isl_printer_has_note(p, id);
+	isl_id_free(id);
+
+	return printed;
+}
+
+/* Keep track of the fact that the macro definition corresponding
+ * to "note_name" has been printed to "p" by attaching a note with
+ * that name.  The value of the note is of no importance, but it
+ * has to be a valid isl_id, so the note identifier is reused
+ * as the note.
+ */
+static __isl_give isl_printer *mark_printed(__isl_take isl_printer *p,
+	const char *note_name)
+{
+	isl_ctx *ctx;
+	isl_id *id;
+
+	if (!p)
+		return NULL;
+
+	ctx = isl_printer_get_ctx(p);
+	id = isl_id_alloc(ctx, note_name, NULL);
+	return isl_printer_set_note(p, id, isl_id_copy(id));
+}
+
+/* Print a macro definition "def" for the macro "name" to "p",
+ * unless such a macro definition has been printed to "p" before.
+ * "note_name" is used as the name of the note that keeps track
+ * of whether this printing has happened.
+ */
+static __isl_give isl_printer *print_ppcg_macro(__isl_take isl_printer *p,
+	const char *name, const char *def, const char *note_name)
+{
+	isl_bool printed;
+
+	printed = printed_before(p, note_name);
+	if (printed < 0)
+		return isl_printer_free(p);
+	if (printed)
+		return p;
+
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, "#define ");
+	p = isl_printer_print_str(p, name);
+	p = isl_printer_print_str(p, def);
+	p = isl_printer_end_line(p);
+
+	p = mark_printed(p, note_name);
+
+	return p;
+}
+
+/* Structure for keeping track of definitions of some macros.
+ */
+struct ppcg_macros {
+	const char *min;
+	const char *max;
+};
+
+/* Free the memory allocated by a struct ppcg_macros.
+ */
+static void ppcg_macros_free(void *user)
+{
+	free(user);
+}
+
+/* Default macro definitions (when GNU extensions are allowed).
+ */
+struct ppcg_macros ppcg_macros_default = {
+	.min = "(x,y)    "
+		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
+		"_x < _y ? _x : _y; })",
+	.max = "(x,y)    "
+		"({ __typeof__(x) _x = (x); __typeof__(y) _y = (y); "
+		"_x > _y ? _x : _y; })",
+};
+
+/* Name used for the note that keeps track of macro definitions.
+ */
+static const char *ppcg_macros = "ppcg_macros";
+
+/* Set the macro definitions for isl_ast_op_min and isl_ast_op_max
+ * to "min" and "max" and store them in "p".
+ *
+ * In particular, create a ppcg_macros object and attach it
+ * as a note to the printer.
+ */
+__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
+	const char *min, const char *max)
+{
+	isl_ctx *ctx;
+	isl_id *id, *macros_id;
+	struct ppcg_macros *macros;
+
+	if (!p)
+		return NULL;
+
+	ctx = isl_printer_get_ctx(p);
+	macros = isl_alloc_type(ctx, struct ppcg_macros);
+	if (!macros)
+		return isl_printer_free(p);
+	macros->min = min;
+	macros->max = max;
+	id = isl_id_alloc(ctx, ppcg_macros, NULL);
+	macros_id = isl_id_alloc(ctx, NULL, macros);
+	if (!macros_id)
+		ppcg_macros_free(macros);
+	else
+		macros_id = isl_id_set_free_user(macros_id, &ppcg_macros_free);
+
+	p = isl_printer_set_note(p, id, macros_id);
+
+	return p;
+}
+
+/* Return the ppcg_macros object that holds the currently active
+ * macro definitions in "p".
+ * If "p" has a note with macro definitions, then return those.
+ * Otherwise, return the default macro definitions.
+ */
+static struct ppcg_macros *get_macros(__isl_keep isl_printer *p)
+{
+	isl_id *id;
+	isl_bool has_macros;
+	struct ppcg_macros *macros;
+
+	id = isl_id_alloc(isl_printer_get_ctx(p), ppcg_macros, NULL);
+	has_macros = isl_printer_has_note(p, id);
+	if (has_macros < 0 || !has_macros) {
+		isl_id_free(id);
+		if (has_macros < 0)
+			return NULL;
+		return &ppcg_macros_default;
+	}
+	id = isl_printer_get_note(p, id);
+	macros = isl_id_get_user(id);
+	isl_id_free(id);
+
+	return macros;
+}
+
+/* Print the currently active macro definition for ppcg_max.
+ */
+static __isl_give isl_printer *print_max(__isl_take isl_printer *p)
+{
+	struct ppcg_macros *macros;
+
+	macros = get_macros(p);
+	if (!macros)
+		return isl_printer_free(p);
+	return print_ppcg_macro(p, ppcg_max, macros->max, ppcg_max_printed);
+}
+
+/* Print the currently active macro definition for ppcg_min.
+ */
+static __isl_give isl_printer *print_min(__isl_take isl_printer *p)
+{
+	struct ppcg_macros *macros;
+
+	macros = get_macros(p);
+	if (!macros)
+		return isl_printer_free(p);
+	return print_ppcg_macro(p, ppcg_min, macros->min, ppcg_min_printed);
+}
+
+/* Print a macro definition for "type" to "p".
+ * If GNU extensions are allowed, then print a specialized definition
+ * for isl_ast_op_min and isl_ast_op_max.
+ * Otherwise, use the default isl definition.
+ */
+__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
+	__isl_take isl_printer *p)
+{
+	isl_ctx *ctx;
+	struct ppcg_options *options;
+
+	if (!p)
+		return NULL;
+
+	ctx = isl_printer_get_ctx(p);
+	options = isl_ctx_peek_options(ctx, &ppcg_options_args);
+	if (!options || !options->allow_gnu_extensions)
+		return isl_ast_op_type_print_macro(type, p);
+
+	switch (type) {
+	case isl_ast_op_max:
+		return print_max(p);
+	case isl_ast_op_min:
+		return print_min(p);
+	default:
+		return isl_ast_op_type_print_macro(type, p);
+	}
+}
+
+/* isl_ast_expr_foreach_ast_op_type or isl_ast_node_foreach_ast_op_type
+ * callback that prints a macro definition for "type".
+ */
+static isl_stat print_macro(enum isl_ast_op_type type, void *user)
 {
 	isl_printer **p = user;
 
-	if (type == isl_ast_op_fdiv_q)
-		return 0;
+	*p = ppcg_print_macro(type, *p);
+	if (!*p)
+		return isl_stat_error;
+
+	return isl_stat_ok;
+}
+
+/* Print the required macros for "expr".
+ */
+__isl_give isl_printer *ppcg_ast_expr_print_macros(
+	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p)
+{
+	if (isl_ast_expr_foreach_ast_op_type(expr, &print_macro, &p) < 0)
+		return isl_printer_free(p);
+	return p;
+}
 
-	*p = isl_ast_op_type_print_macro(type, *p);
+/* isl_id_to_ast_expr_foreach callback that prints the required
+ * macro definitions for "val".
+ */
+static isl_stat print_expr_macros(__isl_take isl_id *key,
+	__isl_take isl_ast_expr *val, void *user)
+{
+	isl_printer **p = user;
 
-	return 0;
+	*p = ppcg_ast_expr_print_macros(val, *p);
+	isl_id_free(key);
+	isl_ast_expr_free(val);
+
+	if (!*p)
+		return isl_stat_error;
+	return isl_stat_ok;
 }
 
-/* Print the required macros for "node", except one for floord.
- * The caller is assumed to have printed a macro for floord already
- * as it may also appear in the declarations and the statements.
+/* Print the required macro definitions for the body of a statement in which
+ * the access expressions are replaced by the isl_ast_expr objects
+ * in "ref2expr".
+ */
+__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
+	__isl_keep isl_id_to_ast_expr *ref2expr)
+{
+	if (isl_id_to_ast_expr_foreach(ref2expr, &print_expr_macros, &p) < 0)
+		return isl_printer_free(p);
+	return p;
+}
+
+/* Print the required macros for "node".
  */
 __isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
 	__isl_keep isl_ast_node *node)
@@ -54,67 +310,105 @@ __isl_give isl_printer *ppcg_print_macro
 	return p;
 }
 
-/* Print "extent" as a sequence of
- *
- *	[1 + maximal_value]
+/* Names used for the macros that may appear in a printed isl AST.
+ */
+const char *ppcg_min = "ppcg_min";
+const char *ppcg_max = "ppcg_max";
+const char *ppcg_fdiv_q = "ppcg_fdiv_q";
+
+/* Set the names of the macros that may appear in a printed isl AST.
+ */
+__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p)
+{
+	p = isl_ast_op_type_set_print_name(p, isl_ast_op_min, ppcg_min);
+	p = isl_ast_op_type_set_print_name(p, isl_ast_op_max, ppcg_max);
+	p = isl_ast_op_type_set_print_name(p, isl_ast_op_fdiv_q, ppcg_fdiv_q);
+
+	return p;
+}
+
+/* Given a multi affine expression "mpa" without domain, modify it to have
+ * the schedule space of "build" as domain.
  *
- * one for each dimension.
- * "build" is used to simplify the size expressions, if any.
+ * If the schedule space of "build" is a parameter space, then nothing
+ * needs to be done.
+ * Otherwise, "mpa" is first given a 0D domain and then it is combined
+ * with a mapping from the schedule space of "build" to the same 0D domain.
  */
-static __isl_give isl_printer *print_extent(__isl_take isl_printer *p,
-	__isl_keep isl_set *extent, __isl_keep isl_ast_build *build)
+__isl_give isl_multi_pw_aff *ppcg_attach_multi_pw_aff(
+	__isl_take isl_multi_pw_aff *mpa, __isl_keep isl_ast_build *build)
 {
-	int i, n;
+	isl_bool params;
+	isl_space *space;
+	isl_multi_aff *ma;
+
+	space = isl_ast_build_get_schedule_space(build);
+	params = isl_space_is_params(space);
+	if (params < 0 || params) {
+		isl_space_free(space);
+		if (params < 0)
+			return isl_multi_pw_aff_free(mpa);
+		return mpa;
+	}
+	space = isl_space_from_domain(space);
+	ma = isl_multi_aff_zero(space);
+	mpa = isl_multi_pw_aff_from_range(mpa);
+	mpa = isl_multi_pw_aff_pullback_multi_aff(mpa, ma);
 
-	n = isl_set_dim(extent, isl_dim_set);
-	if (n == 0)
-		return p;
+	return mpa;
+}
 
-	for (i = 0; i < n; ++i) {
-		isl_set *dom;
-		isl_local_space *ls;
-		isl_aff *one;
-		isl_pw_aff *bound;
-		isl_ast_expr *expr;
-
-		bound = isl_set_dim_max(isl_set_copy(extent), i);
-		dom = isl_pw_aff_domain(isl_pw_aff_copy(bound));
-		ls = isl_local_space_from_space(isl_set_get_space(dom));
-		one = isl_aff_zero_on_domain(ls);
-		one = isl_aff_add_constant_si(one, 1);
-		bound = isl_pw_aff_add(bound, isl_pw_aff_alloc(dom, one));
-
-		p = isl_printer_print_str(p, "[");
-		expr = isl_ast_build_expr_from_pw_aff(build, bound);
-		p = isl_printer_print_ast_expr(p, expr);
-		p = isl_printer_print_str(p, "]");
+/* Build an access AST expression from "size" using "build".
+ * "size" does not have a domain, but "build" may have a proper schedule space.
+ * First modify "size" to have that schedule space as domain.
+ */
+__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
+	__isl_keep isl_ast_build *build)
+{
+	size = ppcg_attach_multi_pw_aff(size, build);
+	return isl_ast_build_access_from_multi_pw_aff(build, size);
+}
 
-		isl_ast_expr_free(expr);
-	}
+/* Print a declaration for an array with element type "base_type" and
+ * size "size" to "p".
+ */
+__isl_give isl_printer *ppcg_print_declaration_with_size(
+	__isl_take isl_printer *p, const char *base_type,
+	__isl_keep isl_ast_expr *size)
+{
+	if (!base_type || !size)
+		return isl_printer_free(p);
+
+	p = ppcg_ast_expr_print_macros(size, p);
+	p = isl_printer_start_line(p);
+	p = isl_printer_print_str(p, base_type);
+	p = isl_printer_print_str(p, " ");
+	p = isl_printer_print_ast_expr(p, size);
+	p = isl_printer_print_str(p, ";");
+	p = isl_printer_end_line(p);
 
 	return p;
 }
 
 /* Print a declaration for array "array" to "p", using "build"
  * to simplify any size expressions.
+ *
+ * The size is computed from the extent of the array and is
+ * subsequently converted to an "access expression" by "build".
  */
 __isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
 	struct pet_array *array, __isl_keep isl_ast_build *build)
 {
-	const char *name;
+	isl_multi_pw_aff *size;
+	isl_ast_expr *expr;
 
 	if (!array)
 		return isl_printer_free(p);
 
-	name = isl_set_get_tuple_name(array->extent);
-
-	p = isl_printer_start_line(p);
-	p = isl_printer_print_str(p, array->element_type);
-	p = isl_printer_print_str(p, " ");
-	p = isl_printer_print_str(p, name);
-	p = print_extent(p, array->extent, build);
-	p = isl_printer_print_str(p, ";");
-	p = isl_printer_end_line(p);
+	size = ppcg_size_from_extent(isl_set_copy(array->extent));
+	expr = isl_ast_build_access_from_multi_pw_aff(build, size);
+	p = ppcg_print_declaration_with_size(p, array->element_type, expr);
+	isl_ast_expr_free(expr);
 
 	return p;
 }
@@ -164,67 +458,3 @@ __isl_give isl_printer *ppcg_print_hidde
 {
 	return print_declarations(p, scop, 0);
 }
-
-/* Internal data structure for print_guarded_user.
- *
- * fn is the function that should be called to print the body.
- * user is the argument that should be passed to this function.
- */
-struct ppcg_print_guarded_data {
-	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user);
-	void *user;
-};
-
-/* Print the body of the if statement expressing the guard passed
- * to "ppcg_print_guarded" by calling data->fn.
- */
-static __isl_give isl_printer *print_guarded_user(__isl_take isl_printer *p,
-	__isl_take isl_ast_print_options *options,
-	__isl_keep isl_ast_node *node, void *user)
-{
-	struct ppcg_print_guarded_data *data = user;
-
-	p = data->fn(p, data->user);
-
-	isl_ast_print_options_free(options);
-	return p;
-}
-
-/* Print a condition for the given "guard" within the given "context"
- * on "p", calling "fn" with "user" to print the body of the if statement.
- * If the guard is implied by the context, then no if statement is printed
- * and the body is printed directly to "p".
- *
- * Both "guard" and "context" are assumed to be parameter sets.
- *
- * We slightly abuse the AST generator to print this guard.
- * In particular, we create a trivial schedule for an iteration
- * domain with a single instance, restricted by the guard.
- */
-__isl_give isl_printer *ppcg_print_guarded(__isl_take isl_printer *p,
-	__isl_take isl_set *guard, __isl_take isl_set *context,
-	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user),
-	void *user)
-{
-	struct ppcg_print_guarded_data data = { fn, user };
-	isl_ctx *ctx;
-	isl_union_map *schedule;
-	isl_ast_build *build;
-	isl_ast_node *tree;
-	isl_ast_print_options *options;
-
-	ctx = isl_printer_get_ctx(p);
-	guard = isl_set_from_params(guard);
-	schedule = isl_union_map_from_map(isl_map_from_domain(guard));
-	build = isl_ast_build_from_context(context);
-	tree = isl_ast_build_node_from_schedule_map(build, schedule);
-	isl_ast_build_free(build);
-
-	options = isl_ast_print_options_alloc(ctx);
-	options = isl_ast_print_options_set_print_user(options,
-						&print_guarded_user, &data);
-	p = isl_ast_node_print(tree, p, options);
-	isl_ast_node_free(tree);
-
-	return p;
-}

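Because print_ppcg_macro records a note on the printer after the first
emission, macro printing is idempotent per printer.  A small usage sketch
(print_two, "e1" and "e2" are hypothetical):

static __isl_give isl_printer *print_two(__isl_take isl_printer *p,
	__isl_keep isl_ast_expr *e1, __isl_keep isl_ast_expr *e2)
{
	p = ppcg_ast_expr_print_macros(e1, p);	/* may print the definitions */
	p = ppcg_ast_expr_print_macros(e2, p);	/* nothing is printed twice */
	return p;
}

Keeping this state in a note rather than in a global means that two
independent printers, such as the host and kernel printers, each emit
their own copy of the definitions.
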
Modified: polly/trunk/lib/External/ppcg/print.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/print.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/print.h (original)
+++ polly/trunk/lib/External/ppcg/print.h Thu Jul 20 08:48:13 2017
@@ -5,12 +5,31 @@
 
 #include "ppcg.h"
 
+extern const char *ppcg_min;
+extern const char *ppcg_max;
+extern const char *ppcg_fdiv_q;
+
 __isl_give isl_printer *ppcg_start_block(__isl_take isl_printer *p);
 __isl_give isl_printer *ppcg_end_block(__isl_take isl_printer *p);
 
+__isl_give isl_printer *ppcg_set_macro_names(__isl_take isl_printer *p);
+__isl_give isl_printer *ppcg_set_macros(__isl_take isl_printer *p,
+	const char *min, const char *max);
+__isl_give isl_printer *ppcg_print_macro(enum isl_ast_op_type type,
+	__isl_take isl_printer *p);
+__isl_give isl_printer *ppcg_ast_expr_print_macros(
+	__isl_keep isl_ast_expr *expr, __isl_take isl_printer *p);
+__isl_give isl_printer *ppcg_print_body_macros(__isl_take isl_printer *p,
+	__isl_keep isl_id_to_ast_expr *ref2expr);
 __isl_give isl_printer *ppcg_print_macros(__isl_take isl_printer *p,
 	__isl_keep isl_ast_node *node);
 
+__isl_give isl_ast_expr *ppcg_build_size_expr(__isl_take isl_multi_pw_aff *size,
+	__isl_keep isl_ast_build *build);
+
+__isl_give isl_printer *ppcg_print_declaration_with_size(
+	__isl_take isl_printer *p, const char *base_type,
+	__isl_keep isl_ast_expr *size);
 __isl_give isl_printer *ppcg_print_declaration(__isl_take isl_printer *p,
 	struct pet_array *array, __isl_keep isl_ast_build *build);
 __isl_give isl_printer *ppcg_print_exposed_declarations(
@@ -18,9 +37,4 @@ __isl_give isl_printer *ppcg_print_expos
 __isl_give isl_printer *ppcg_print_hidden_declarations(
 	__isl_take isl_printer *p, struct ppcg_scop *scop);
 
-__isl_give isl_printer *ppcg_print_guarded(__isl_take isl_printer *p,
-	__isl_take isl_set *guard, __isl_take isl_set *context,
-	__isl_give isl_printer *(*fn)(__isl_take isl_printer *p, void *user),
-	void *user);
-
 #endif

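A sketch of the declaration path exposed by this header, for a
hypothetical caller that owns an extent set (ppcg_size_from_extent is the
helper used by ppcg_print_declaration in print.c):

static __isl_give isl_printer *declare_float_array(
	__isl_take isl_printer *p, __isl_take isl_set *extent,
	__isl_keep isl_ast_build *build)
{
	isl_multi_pw_aff *size;
	isl_ast_expr *expr;

	/* Convert the extent to sizes and build an access expression
	 * of the form A[n][m] in terms of the schedule space of "build".
	 */
	size = ppcg_size_from_extent(extent);
	expr = ppcg_build_size_expr(size, build);
	p = ppcg_print_declaration_with_size(p, "float", expr);
	isl_ast_expr_free(expr);

	return p;
}
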
Modified: polly/trunk/lib/External/ppcg/schedule.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/schedule.c?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/schedule.c (original)
+++ polly/trunk/lib/External/ppcg/schedule.c Thu Jul 20 08:48:13 2017
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 #include <ctype.h>
+#include <stdio.h>
 #include <string.h>
 
 #include <isl/set.h>
@@ -18,41 +19,6 @@
 
 #include "schedule.h"
 
-/* Construct a map from a len-dimensional domain to
- * a (len-n)-dimensional domain that projects out the n coordinates
- * starting at first.
- * "dim" prescribes the parameters.
- */
-__isl_give isl_map *project_out(__isl_take isl_space *dim,
-    int len, int first, int n)
-{
-    int i, j;
-    isl_basic_map *bmap;
-
-    dim = isl_space_add_dims(dim, isl_dim_in, len);
-    dim = isl_space_add_dims(dim, isl_dim_out, len - n);
-    bmap = isl_basic_map_universe(dim);
-
-    for (i = 0, j = 0; i < len; ++i) {
-        if (i >= first && i < first + n)
-            continue;
-	bmap = isl_basic_map_equate(bmap, isl_dim_in, i, isl_dim_out, j);
-        ++j;
-    }
-
-    return isl_map_from_basic_map(bmap);
-}
-
-/* Construct a projection that maps a src_len dimensional domain
- * to its first dst_len coordinates.
- * "dim" prescribes the parameters.
- */
-__isl_give isl_map *projection(__isl_take isl_space *dim,
-    int src_len, int dst_len)
-{
-    return project_out(dim, src_len, dst_len, src_len - dst_len);
-}
-
 /* Add parameters with identifiers "ids" to "set".
  */
 static __isl_give isl_set *add_params(__isl_take isl_set *set,
@@ -114,79 +80,86 @@ __isl_give isl_set *parametrization(__is
 	return parametrize(set, first, ids);
 }
 
-/* Extend "set" with unconstrained coordinates to a total length of "dst_len".
+/* Load and return a schedule from a file called "filename".
  */
-__isl_give isl_set *extend(__isl_take isl_set *set, int dst_len)
+static __isl_give isl_schedule *load_schedule(isl_ctx *ctx,
+	const char *filename)
 {
-    int n_set;
-    isl_space *dim;
-    isl_map *map;
-
-    dim = isl_set_get_space(set);
-    n_set = isl_space_dim(dim, isl_dim_set);
-    dim = isl_space_drop_dims(dim, isl_dim_set, 0, n_set);
-    map = projection(dim, dst_len, n_set);
-    map = isl_map_reverse(map);
+	FILE *file;
+	isl_schedule *schedule;
 
-    return isl_set_apply(set, map);
-}
-
-/* Set max_out to the maximal number of output dimensions over
- * all maps.
- */
-static isl_stat update_max_out(__isl_take isl_map *map, void *user)
-{
-	int *max_out = user;
-	int n_out = isl_map_dim(map, isl_dim_out);
-
-	if (n_out > *max_out)
-		*max_out = n_out;
+	file = fopen(filename, "r");
+	if (!file) {
+		fprintf(stderr, "Unable to open '%s' for reading\n", filename);
+		return NULL;
+	}
+	schedule = isl_schedule_read_from_file(ctx, file);
+	fclose(file);
 
-	isl_map_free(map);
-	return isl_stat_ok;
+	return schedule;
 }
 
-struct align_range_data {
-	int max_out;
-	isl_union_map *res;
-};
-
-/* Extend the dimension of the range of the given map to data->max_out and
- * then add the result to data->res.
+/* Save the schedule "schedule" to a file called "filename".
+ * The schedule is printed in block style.
  */
-static isl_stat map_align_range(__isl_take isl_map *map, void *user)
+static void save_schedule(__isl_keep isl_schedule *schedule,
+	const char *filename)
 {
-	struct align_range_data *data = user;
-	int i;
-	isl_space *dim;
-	isl_map *proj;
-	int n_out = isl_map_dim(map, isl_dim_out);
+	FILE *file;
+	isl_ctx *ctx;
+	isl_printer *p;
 
-	dim = isl_union_map_get_space(data->res);
-	proj = isl_map_reverse(projection(dim, data->max_out, n_out));
-	for (i = n_out; i < data->max_out; ++i)
-		proj = isl_map_fix_si(proj, isl_dim_out, i, 0);
+	if (!schedule)
+		return;
 
-	map = isl_map_apply_range(map, proj);
-
-	data->res = isl_union_map_add_map(data->res, map);
+	file = fopen(filename, "w");
+	if (!file) {
+		fprintf(stderr, "Unable to open '%s' for writing\n", filename);
+		return;
+	}
+	ctx = isl_schedule_get_ctx(schedule);
+	p = isl_printer_to_file(ctx, file);
+	p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
+	p = isl_printer_print_schedule(p, schedule);
+	isl_printer_free(p);
+	fclose(file);
+}
+
+/* Obtain a schedule, either by reading it from a file
+ * or by computing it using "compute".
+ * Also take care of saving the computed schedule and/or
+ * dumping the obtained schedule if requested by the user.
+ */
+__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
+	struct ppcg_options *options,
+	__isl_give isl_schedule *(*compute)(void *user), void *user)
+{
+	isl_schedule *schedule;
+
+	if (options->load_schedule_file) {
+		schedule = load_schedule(ctx, options->load_schedule_file);
+	} else {
+		schedule = compute(user);
+		if (options->save_schedule_file)
+			save_schedule(schedule, options->save_schedule_file);
+	}
+	if (options->debug->dump_schedule)
+		isl_schedule_dump(schedule);
 
-	return isl_stat_ok;
+	return schedule;
 }
 
-/* Extend the ranges of the maps in the union map such they all have
- * the same dimension.
+/* Mark all dimensions in the band node "node" to be of "type".
  */
-__isl_give isl_union_map *align_range(__isl_take isl_union_map *umap)
+__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
+	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type)
 {
-	struct align_range_data data;
-
-	data.max_out = 0;
-	isl_union_map_foreach_map(umap, &update_max_out, &data.max_out);
+	int i, n;
 
-	data.res = isl_union_map_empty(isl_union_map_get_space(umap));
-	isl_union_map_foreach_map(umap, &map_align_range, &data);
+	n = isl_schedule_node_band_n_member(node);
+	for (i = 0; i < n; ++i)
+		node = isl_schedule_node_band_member_set_ast_loop_type(node, i,
+							type);
 
-	isl_union_map_free(umap);
-	return data.res;
+	return node;
 }

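As a usage sketch (hypothetical, not part of this commit): the compute
callback handed to ppcg_get_schedule is only invoked when no schedule is
loaded from a file, so an arbitrary scheduler can sit behind it.  The
my_scop payload and compute_schedule function below are illustrative only.

	#include <isl/union_set.h>
	#include <isl/schedule.h>
	#include "schedule.h"

	/* Hypothetical payload carrying the iteration domain to schedule. */
	struct my_scop {
		isl_union_set *domain;
	};

	/* Compute a schedule over the domain alone (no dependences). */
	static __isl_give isl_schedule *compute_schedule(void *user)
	{
		struct my_scop *scop = user;
		isl_schedule_constraints *sc;

		sc = isl_schedule_constraints_on_domain(
			isl_union_set_copy(scop->domain));
		return isl_schedule_constraints_compute_schedule(sc);
	}

With "ctx", "options" and a my_scop in scope, obtaining the schedule then
reduces to ppcg_get_schedule(ctx, options, &compute_schedule, &scop), and
ppcg_set_schedule_node_type(node, isl_ast_loop_unroll) would mark every
member of a band node for unrolling.
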
Modified: polly/trunk/lib/External/ppcg/schedule.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/schedule.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/schedule.h (original)
+++ polly/trunk/lib/External/ppcg/schedule.h Thu Jul 20 08:48:13 2017
@@ -2,55 +2,20 @@
 #define _SCHEDULE_H
 
 #include <isl/id.h>
-#include <isl/set_type.h>
-#include <isl/map_type.h>
-#include <isl/union_map_type.h>
-
-#include <pet.h>
-
-/* An access to an outer array element or an iterator.
- * Accesses to iterators have an access relation that maps to an unnamed space.
- * An access may be both read and write.
- * If the access relation is empty, then the output dimension may
- * not be equal to the dimension of the corresponding array.
- */
-struct gpu_stmt_access {
-	/* Access reads elements */
-	int read;
-	/* Access writes elements */
-	int write;
-	/* All writes are definite writes. */
-	int exact_write;
-	/* The number of index expressions specified in the access. */
-	int n_index;
-
-	/* May access relation */
-	isl_map *access;
-	/* May access relation with as domain a mapping from iteration domain
-	 * to a reference identifier.
-	 */
-	isl_map *tagged_access;
-	/* The reference id of the corresponding pet_expr. */
-	isl_id *ref_id;
-
-	struct gpu_stmt_access *next;
-};
-
-struct gpu_stmt {
-	isl_id *id;
-	struct pet_stmt *stmt;
-
-	/* Linked list of accesses. */
-	struct gpu_stmt_access *accesses;
-};
-
-__isl_give isl_map *project_out(__isl_take isl_space *dim,
-	int len, int first, int n);
-__isl_give isl_map *projection(__isl_take isl_space *dim,
-	int src_len, int dst_len);
+#include <isl/space.h>
+#include <isl/schedule.h>
+#include <isl/schedule_node.h>
+
+#include "ppcg_options.h"
+
 __isl_give isl_set *parametrization(__isl_take isl_space *space,
 	int len, int first, __isl_keep isl_id_list *names);
-__isl_give isl_set *extend(__isl_take isl_set *set, int dst_len);
-__isl_give isl_union_map *align_range(__isl_take isl_union_map *umap);
+
+__isl_give isl_schedule *ppcg_get_schedule(isl_ctx *ctx,
+	struct ppcg_options *options,
+	__isl_give isl_schedule *(*compute)(void *user), void *user);
+
+__isl_give isl_schedule_node *ppcg_set_schedule_node_type(
+	__isl_take isl_schedule_node *node, enum isl_ast_loop_type type);
 
 #endif

Added: polly/trunk/lib/External/ppcg/tests/iterator.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/tests/iterator.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/tests/iterator.c (added)
+++ polly/trunk/lib/External/ppcg/tests/iterator.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,18 @@
+#include <stdlib.h>
+
+int main()
+{
+	int i;
+	int a[101];
+
+	i = 0;
+#pragma scop
+	for (i = 0; i < 100; ++i)
+		a[i] = i;
+	a[i] = i;
+#pragma endscop
+	if (a[100] != 100)
+		return EXIT_FAILURE;
+
+	return EXIT_SUCCESS;
+}

Added: polly/trunk/lib/External/ppcg/tests/live_out.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/tests/live_out.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/tests/live_out.c (added)
+++ polly/trunk/lib/External/ppcg/tests/live_out.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,22 @@
+#include <stdlib.h>
+
+/* Check that a write access is not removed from the live-out
+ * accesses only because a strict subset of the (potentially)
+ * accessed elements is killed by a later write.
+ */
+int main()
+{
+	int A[10];
+
+	A[1] = 0;
+#pragma scop
+	int i = 1;
+	i = i * i;
+	A[i] = 1;
+	A[0] = 0;
+#pragma endscop
+	if (A[1] != 1)
+		return EXIT_FAILURE;
+
+	return EXIT_SUCCESS;
+}

Added: polly/trunk/lib/External/ppcg/tests/local.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/tests/local.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/tests/local.c (added)
+++ polly/trunk/lib/External/ppcg/tests/local.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,22 @@
+#include <stdlib.h>
+
+int main()
+{
+	int A[100];
+
+#pragma scop
+	{
+		int B[100];
+		B[0] = 0;
+		for (int i = 1; i < 100; ++i)
+			B[i] = B[i - 1] + 1;
+		for (int i = 0; i < 100; ++i)
+			A[i] = B[i];
+	}
+#pragma endscop
+	for (int i = 0; i < 100; ++i)
+		if (A[i] != i)
+			return EXIT_FAILURE;
+
+	return EXIT_SUCCESS;
+}

Added: polly/trunk/lib/External/ppcg/tests/struct4.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/tests/struct4.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/tests/struct4.c (added)
+++ polly/trunk/lib/External/ppcg/tests/struct4.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,27 @@
+#include <stdlib.h>
+
+struct s {
+	int a;
+	int b;
+};
+
+int main()
+{
+	int a[10];
+
+	for (int i = 0; i < 10; ++i)
+		a[i] = 0;
+#pragma scop
+	for (int i = 0; i < 10; ++i) {
+		struct s b;
+		b.a = 1;
+		b.b = i;
+		a[i] = b.a + b.b;
+	}
+#pragma endscop
+	for (int i = 0; i < 10; ++i)
+		if (a[i] != 1 + i)
+			return EXIT_FAILURE;
+
+	return EXIT_SUCCESS;
+}

Added: polly/trunk/lib/External/ppcg/util.c
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/util.c?rev=308623&view=auto
==============================================================================
--- polly/trunk/lib/External/ppcg/util.c (added)
+++ polly/trunk/lib/External/ppcg/util.c Thu Jul 20 08:48:13 2017
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2012-2013 Ecole Normale Superieure
+ *
+ * Use of this software is governed by the MIT license
+ *
+ * Written by Sven Verdoolaege,
+ * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
+ */
+
+#include <stdio.h>	/* for the fprintf() diagnostic below */
+
+#include <isl/space.h>
+#include <isl/val.h>
+#include <isl/aff.h>
+#include <isl/set.h>
+
+#include "util.h"
+
+/* Construct an isl_multi_val living in "space" with all values equal to "val".
+ */
+__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
+	int val)
+{
+	int i, n;
+	isl_ctx *ctx;
+	isl_val *v;
+	isl_multi_val *mv;
+
+	if (!space)
+		return NULL;
+
+	ctx = isl_space_get_ctx(space);
+	n = isl_space_dim(space, isl_dim_set);
+	mv = isl_multi_val_zero(space);
+	v = isl_val_int_from_si(ctx, val);
+	for (i = 0; i < n; ++i)
+		mv = isl_multi_val_set_val(mv, i, isl_val_copy(v));
+	isl_val_free(v);
+
+	return mv;
+}
+
+/* Construct an isl_multi_val living in "space" with values specified
+ * by "list".  "list" is assumed to have at least as many entries
+ * as the set dimension of "space".
+ */
+__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
+	__isl_take isl_space *space, int *list)
+{
+	int i, n;
+	isl_ctx *ctx;
+	isl_multi_val *mv;
+
+	if (!space)
+		return NULL;
+
+	ctx = isl_space_get_ctx(space);
+	n = isl_space_dim(space, isl_dim_set);
+	mv = isl_multi_val_zero(space);
+	for (i = 0; i < n; ++i) {
+		isl_val *v;
+
+		v = isl_val_int_from_si(ctx, list[i]);
+		mv = isl_multi_val_set_val(mv, i, v);
+	}
+
+	return mv;
+}
+
+/* Compute the size of a bounding box around the origin and "set",
+ * where "set" is assumed to contain only non-negative elements.
+ * In particular, compute the maximal value of "set" in each direction
+ * and add one.
+ */
+__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set)
+{
+	int i, n;
+	isl_multi_pw_aff *mpa;
+
+	n = isl_set_dim(set, isl_dim_set);
+	mpa = isl_multi_pw_aff_zero(isl_set_get_space(set));
+	for (i = 0; i < n; ++i) {
+		isl_space *space;
+		isl_aff *one;
+		isl_pw_aff *bound;
+
+		if (!isl_set_dim_has_upper_bound(set, isl_dim_set, i)) {
+			const char *name;
+			name = isl_set_get_tuple_name(set);
+			if (!name)
+				name = "";
+			fprintf(stderr, "unable to determine extent of '%s' "
+				"in dimension %d\n", name, i);
+			set = isl_set_free(set);
+		}
+		bound = isl_set_dim_max(isl_set_copy(set), i);
+
+		space = isl_pw_aff_get_domain_space(bound);
+		one = isl_aff_zero_on_domain(isl_local_space_from_space(space));
+		one = isl_aff_add_constant_si(one, 1);
+		bound = isl_pw_aff_add(bound, isl_pw_aff_from_aff(one));
+		mpa = isl_multi_pw_aff_set_pw_aff(mpa, i, bound);
+	}
+	isl_set_free(set);
+
+	return mpa;
+}

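A small sketch of the new helpers in action (hypothetical driver, not part
of this commit): ppcg_multi_val_from_int fills every set dimension with the
same value, and ppcg_size_from_extent turns a bounded, non-negative extent
into a size by adding one to the per-dimension maxima.

	#include <isl/ctx.h>
	#include <isl/space.h>
	#include <isl/set.h>
	#include <isl/val.h>
	#include <isl/aff.h>
	#include "util.h"

	int main(void)
	{
		isl_ctx *ctx = isl_ctx_alloc();

		/* { [32, 32] }: the value 32 in each of two set dimensions. */
		isl_multi_val *tile = ppcg_multi_val_from_int(
			isl_space_set_alloc(ctx, 0, 2), 32);

		/* Maximal index is 99, so the size comes out as { A[100] }. */
		isl_multi_pw_aff *size = ppcg_size_from_extent(
			isl_set_read_from_str(ctx, "{ A[i] : 0 <= i <= 99 }"));

		isl_multi_val_free(tile);
		isl_multi_pw_aff_free(size);
		isl_ctx_free(ctx);
		return 0;
	}
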
Modified: polly/trunk/lib/External/ppcg/util.h
URL: http://llvm.org/viewvc/llvm-project/polly/trunk/lib/External/ppcg/util.h?rev=308623&r1=308622&r2=308623&view=diff
==============================================================================
--- polly/trunk/lib/External/ppcg/util.h (original)
+++ polly/trunk/lib/External/ppcg/util.h Thu Jul 20 08:48:13 2017
@@ -3,6 +3,9 @@
 
 #include <string.h>
 
+#include <isl/space.h>
+#include <isl/val.h>
+
 /* Compare the prefix of "s" to "prefix" up to the length of "prefix".
  */
 static inline int prefixcmp(const char *s, const char *prefix)
@@ -10,4 +13,10 @@ static inline int prefixcmp(const char *
 	return strncmp(s, prefix, strlen(prefix));
 }
 
+__isl_give isl_multi_val *ppcg_multi_val_from_int(__isl_take isl_space *space,
+	int val);
+__isl_give isl_multi_val *ppcg_multi_val_from_int_list(
+	__isl_take isl_space *space, int *list);
+__isl_give isl_multi_pw_aff *ppcg_size_from_extent(__isl_take isl_set *set);
+
 #endif
