[test-suite] r312463 - [test-suite] Adding the CLAMR mini-app

Sun Sep 3 20:10:18 PDT 2017

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/mesh.cpp?rev=312463&view=auto
==============================================================================

--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp Sun Sep  3 20:10:18 2017
@@ -0,0 +1,10456 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifdef HAVE_MPI
+#include "mpi.h"
+#endif
+
+#include <algorithm>
+#include <unistd.h>
+#include <limits.h>
+#include <time.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//#include "hsfc.h"
+#include "KDTree.h"
+#include "mesh.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+#include "timer.h"
+#ifdef HAVE_MPI
+#include "l7/l7.h"
+#endif
+#include "reduce.h"
+#include "genmalloc.h"
+#include "hash.h"
+
+#define DEBUG 0  // file-wide debug level, fixed at 0 here
+//#define BOUNDS_CHECK 1   (disabled; uncomment to define BOUNDS_CHECK for this file)
+
+#ifndef DEBUG  // NOTE(review): never taken -- DEBUG is unconditionally defined at the top of this block
+#define DEBUG 0
+#endif
+#define DEBUG_RESTORE_VALS 1
+
+typedef int scanInt;  // element and length type taken by scan()
+void scan ( scanInt *input , scanInt *output , scanInt length);  // only declared here; presumably a prefix sum -- confirm at its definition
+
+#ifdef _OPENMP  // REZONE_NO_OPTIMIZATION ends up defined (as 1) only in builds without OpenMP
+#undef REZONE_NO_OPTIMIZATION
+#else
+#define REZONE_NO_OPTIMIZATION 1
+#endif
+
+#define TIMING_LEVEL 2
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))  // function-like macro: one argument is evaluated twice, so avoid side effects
+
+#define IPOW2(a) (2 << (a))  // expands to 2 * 2^a, i.e. 2^(a+1); NOTE(review): the name suggests 2^a -- confirm callers expect this
+
+#if defined(MINIMUM_PRECISION)  // comparison tolerances per precision mode; there is no #else, so none are defined if no *_PRECISION macro is set
+#define CONSERVATION_EPS    .1
+#define STATE_EPS      15.0
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+#define CONSERVATION_EPS    .02
+#define STATE_EPS        .025
+
+#elif defined(FULL_PRECISION)
+#define CONSERVATION_EPS    .02
+#define STATE_EPS        .025
+
+#endif
+
+typedef unsigned int uint;  // shorthand used for the cell loop indices in this file
+#ifdef __APPLE_CC__  // ulong is typedef'd here only when __APPLE_CC__ is defined
+typedef unsigned long ulong;
+#endif
+
+#define TWO 2
+#define HALF 0.5
+
+#define __NEW_STENCIL__  // NOTE(review): identifiers containing a double underscore are reserved for the implementation
+//#define __OLD_STENCIL__
+//#define STENCIL_WARNING 1
+
+#ifdef STENCIL_WARNING  // expose the compile-time STENCIL_WARNING choice as a global int (1 when defined, 0 otherwise)
+int do_stencil_warning=1;
+#else
+int do_stencil_warning=0;
+#endif
+
+#ifdef HAVE_OPENCL  // the kernel include below is pulled in only for OpenCL builds
+#include "mesh_kernel.inc"
+#endif
+
+extern bool localStencil;  // defined in another translation unit
+int calc_neighbor_type;  // the next three globals have external linkage and no initializer, so they start out zero/false
+bool dynamic_load_balance_on;
+bool neighbor_remap;
+
+#ifdef _OPENMP  // iversion_flag exists only in OpenMP builds and is private to this file
+static bool iversion_flag = false;
+#endif
+
+static const char *mesh_timer_descriptor[MESH_TIMER_SIZE] = {  /* Printable name for each mesh timer.
+    * The table lists 24 names and is sized by MESH_TIMER_SIZE; presumably it is
+    * indexed by the mesh timer enumeration declared alongside MESH_TIMER_SIZE, so
+    * the order here has to match that enumeration -- confirm against mesh.h.
+    */
+   "mesh_timer_count_BCs",
+   "mesh_timer_calc_neighbors",
+   "mesh_timer_hash_setup",
+   "mesh_timer_hash_query",
+   "mesh_timer_find_boundary",
+   "mesh_timer_push_setup",
+   "mesh_timer_push_boundary",
+   "mesh_timer_local_list",
+   "mesh_timer_layer1",
+   "mesh_timer_layer2",
+   "mesh_timer_layer_list",
+   "mesh_timer_copy_mesh_data",
+   "mesh_timer_fill_mesh_ghost",
+   "mesh_timer_fill_neigh_ghost",
+   "mesh_timer_set_corner_neigh",
+   "mesh_timer_neigh_adjust",
+   "mesh_timer_setup_comm",
+   "mesh_timer_kdtree_setup",
+   "mesh_timer_kdtree_query",
+   "mesh_timer_refine_smooth",
+   "mesh_timer_rezone_all",
+   "mesh_timer_partition",
+   "mesh_timer_calc_spatial_coordinates",
+   "mesh_timer_load_balance"
+};
+
+#ifdef HAVE_OPENCL  // OpenCL kernel handles: file-scope globals with external linkage, present only in OpenCL builds
+cl_kernel      kernel_hash_adjust_sizes;  // none of the handles is initialized here; they are presumably created elsewhere in this file -- confirm
+cl_kernel      kernel_hash_setup;
+cl_kernel      kernel_hash_setup_local;
+cl_kernel      kernel_neighbor_init;
+cl_kernel      kernel_calc_neighbors;
+cl_kernel      kernel_calc_neighbors_local;
+cl_kernel      kernel_calc_border_cells;
+cl_kernel      kernel_calc_border_cells2;
+cl_kernel      kernel_finish_scan;
+cl_kernel      kernel_get_border_data;
+cl_kernel      kernel_calc_layer1;
+cl_kernel      kernel_calc_layer1_sethash;
+cl_kernel      kernel_calc_layer2;
+cl_kernel      kernel_get_border_data2;
+cl_kernel      kernel_calc_layer2_sethash;
+cl_kernel      kernel_copy_mesh_data;
+cl_kernel      kernel_fill_mesh_ghost;
+cl_kernel      kernel_fill_neighbor_ghost;
+cl_kernel      kernel_set_corner_neighbor;
+cl_kernel      kernel_adjust_neighbors_local;
+cl_kernel      kernel_reduction_scan2;
+cl_kernel      kernel_reduction_count;
+cl_kernel      kernel_reduction_count2;
+cl_kernel      kernel_hash_size;
+cl_kernel      kernel_finish_hash_size;
+cl_kernel      kernel_calc_spatial_coordinates;
+cl_kernel      kernel_count_BCs;
+cl_kernel      kernel_do_load_balance_lower;
+cl_kernel      kernel_do_load_balance_middle;
+cl_kernel      kernel_do_load_balance_upper;
+#ifndef MINIMUM_PRECISION  // the double-precision load-balance handle is omitted from MINIMUM_PRECISION builds
+cl_kernel      kernel_do_load_balance_double;
+#endif
+cl_kernel      kernel_do_load_balance_float;
+cl_kernel      kernel_refine_smooth;
+cl_kernel      kernel_coarsen_smooth;
+cl_kernel      kernel_coarsen_check_block;
+cl_kernel      kernel_rezone_all;
+cl_kernel      kernel_rezone_neighbors;
+#ifndef MINIMUM_PRECISION  // likewise, the double-precision rezone handle is omitted from MINIMUM_PRECISION builds
+cl_kernel      kernel_rezone_one_double;
+#endif
+cl_kernel      kernel_rezone_one_float;
+cl_kernel      kernel_copy_mpot_ghost_data;
+cl_kernel      kernel_set_boundary_refinement;
+#endif
+
+extern size_t hash_header_size;  // defined in another translation unit; presumably the hash library behind hash.h -- confirm
+extern int   choose_hash_method;  // defined in another translation unit; meaning of the values is not visible from this file
+
+/*
+ * Dump the mesh to a plain-text plot file named "grid<NN>.gph" in the
+ * current working directory.
+ *
+ * ncycle  cycle number used to build the file name; negative values are
+ *         treated as 0.
+ *
+ * The file holds one "viewport" line with the mesh extents, one "rect" line
+ * per cell (lower-left and upper-right corner), a "line_init"/"line"
+ * polyline through the cell centers in storage order, and one "text" line
+ * per cell that places the cell index at the cell center.
+ *
+ * If the file cannot be opened, an error is reported on stderr and nothing
+ * is written.
+ */
+void Mesh::write_grid(int ncycle)
+{
+   char filename[20];
+
+   if (ncycle<0) ncycle=0;
+   // Bounded formatting: the name can never run past the end of filename.
+   snprintf(filename,sizeof(filename),"grid%02d.gph",ncycle);
+
+   FILE *gph=fopen(filename,"w");
+   if (gph == NULL) {
+      // Previously a failed open went straight into fprintf with a NULL stream.
+      fprintf(stderr,"Error -- could not open %s for writing\n",filename);
+      return;
+   }
+
+   fprintf(gph,"viewport %lf %lf %lf %lf\n",xmin,ymin,xmax,ymax);
+   for (uint ic = 0; ic < ncells; ic++) {
+      fprintf(gph,"rect  %lf   %lf   %lf   %lf\n",x[ic],y[ic],x[ic]+dx[ic],y[ic]+dy[ic]);
+   }
+
+   // An empty mesh has no first cell to start the polyline from.
+   if (ncells > 0) {
+      fprintf(gph,"line_init %lf %lf\n",x[0]+0.5*dx[0],y[0]+0.5*dy[0]);
+      for (uint ic = 1; ic < ncells; ic++){
+         fprintf(gph,"line %lf %lf\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic]);
+      }
+   }
+
+   for (uint ic = 0; ic < ncells; ic++){
+      // ic is unsigned, so it is printed with %u.
+      fprintf(gph,"text %lf %lf %u\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic],ic);
+   }
+
+   fclose(gph);
+}
+
+/*
+ * Build a mesh from a text description read from fin.
+ *
+ * fin    open input stream positioned at the start of the description.
+ * numpe  receives the value of the "numpe" header line.
+ *
+ * Expected layout, one item per line (lines are read in 80-byte pieces):
+ *    levmax <int>
+ *    cells  <count>
+ *    numpe  <int>
+ *    ndim   <int>
+ *    xaxis  <min> <delta>
+ *    yaxis  <min> <delta>
+ *    zaxis  <min> <delta>      (only when ndim == THREE_DIMENSIONAL)
+ *    <one line that is read and ignored>
+ *    <index> <i> <j> <level>   (one line per cell, until end of file)
+ *
+ * A missing header line terminates the program with exit(-1). The contents
+ * of a header line are not validated: a line that does not match its pattern
+ * leaves the corresponding member as it was.
+ *
+ * Side effects visible here: seeds drand48 from the wall clock, sizes the
+ * cell arrays through allocate(), computes the spatial coordinates with
+ * ibase 0, initializes the KD-tree and prints the mesh to stdout.
+ */
+Mesh::Mesh(FILE *fin, int *numpe)
+{
+   char string[80];
+   ibase = 1;
+
+   time_t trand;
+   time(&trand);
+   srand48((long)trand);
+
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"levmax %d",&levmx);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   // ncells is a size_t, so it is scanned with the matching %zu conversion.
+   sscanf(string,"cells %zu",&ncells);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"numpe %d",numpe);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"ndim %d",&ndim);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+#ifdef MINIMUM_PRECISION
+   sscanf(string,"xaxis %f %f",&xmin, &deltax);
+#else
+   sscanf(string,"xaxis %lf %lf",&xmin, &deltax);
+#endif
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   {
+      // Scan into real doubles and assign afterwards. Writing through a
+      // (double*) cast of the member would overrun it whenever the member
+      // is narrower than double (as the xaxis code shows is possible).
+      double axis_min = 0.0, axis_delta = 0.0;
+      int nread = sscanf(string,"yaxis %lf %lf",&axis_min,&axis_delta);
+      if (nread >= 1) ymin   = axis_min;
+      if (nread >= 2) deltay = axis_delta;
+   }
+   if (ndim == THREE_DIMENSIONAL){
+     if(fgets(string, 80, fin) == NULL) exit(-1);
+     double axis_min = 0.0, axis_delta = 0.0;
+     int nread = sscanf(string,"zaxis %lf %lf",&axis_min,&axis_delta);
+     if (nread >= 1) zmin   = axis_min;
+     if (nread >= 2) deltaz = axis_delta;
+   }
+   // One more line is consumed and ignored before the cell records start.
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+
+   index.resize(ncells);
+
+   allocate(ncells);
+
+   uint ic=0;
+   while(fgets(string, 80, fin)!=NULL){
+      // Keep counting surplus lines so the mismatch is reported below, but
+      // never store past the ncells entries that were allocated.
+      if (ic < ncells) {
+         sscanf(string, "%d %d %d %d", &(index[ic]), &(i[ic]), &(j[ic]), &(level[ic]));
+      }
+      ic++;
+   }
+
+   ibase = 0;
+   calc_spatial_coordinates(ibase);
+   KDTree_Initialize(&tree);
+
+
+  print();
+
+   if (ic != ncells) {
+      printf("Error -- cells read does not match number specified\n");
+   }
+   return;
+}
+
+/*
+ * Print one table row per cell to stdout: running cell number, original
+ * index, i, j, level, the four neighbor indices, and the low/high x and y
+ * bounds of the cell. Asserts that the neighbor, coordinate and index
+ * storage is present before touching it.
+ */
+void Mesh::print(void)
+{
+   assert(&nlft[0] != NULL);
+   assert(&x[0] != NULL);
+   assert(&index[0] != NULL);
+
+   printf("index orig index   i     j     lev   nlft  nrht  nbot  ntop   xlow    xhigh     ylow    yhigh\n");
+
+   uint cell = 0;
+   while (cell < ncells) {
+      // Integer columns first ...
+      printf("%6d %6d   %4d  %4d   %4d  %4d  %4d  %4d  %4d ",
+             cell, index[cell], i[cell], j[cell], level[cell],
+             nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+      // ... then the cell extents, which also end the row.
+      printf("%8.2lf %8.2lf %8.2lf %8.2lf\n",
+             x[cell], x[cell]+dx[cell], y[cell], y[cell]+dy[cell]);
+      cell++;
+   }
+}
+
+/*
+ * Write this rank's cells to the stream fp, every row prefixed with mype.
+ *
+ * When the neighbor storage is too small to cover ncells_ghost entries only
+ * index, i, j and level are listed. Otherwise each row also carries the
+ * global index (local index + noffset) and the four neighbor indices, for
+ * the cells below ncells first and then for ncells up to ncells_ghost.
+ */
+void Mesh::print_local()
+{
+   if (mesh_memory.get_memory_size(nlft) < ncells_ghost){
+      // Short form: no neighbor columns available.
+      fprintf(fp,"%d:    index   i     j     lev\n",mype);
+      for (uint cell=0; cell<ncells_ghost; cell++) {
+         fprintf(fp,"%d: %6d  %4d  %4d   %4d  \n", mype,cell, i[cell], j[cell], level[cell]);
+      }
+      return;
+   }
+
+   fprintf(fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mype);
+
+   // Cells below ncells.
+   uint cell = 0;
+   for ( ; cell<ncells; cell++) {
+      fprintf(fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n",
+              mype,cell, cell+noffset,i[cell], j[cell], level[cell],
+              nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+   }
+
+   // Remaining entries from ncells up to ncells_ghost.
+   for (cell=ncells; cell<ncells_ghost; cell++) {
+      fprintf(fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n",
+              mype,cell, cell+noffset,i[cell], j[cell], level[cell],
+              nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+   }
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Read i, j, level and the four neighbor arrays back from the device and
+ * write them to the stream fp, one row per cell, prefixed with mype.
+ *
+ * ncells_ghost entries are transferred from each device buffer. The first
+ * six reads are non-blocking; the last one is blocking, which is what the
+ * code relies on before the host copies are printed.
+ */
+void Mesh::print_dev_local(void)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   vector<int>i_tmp(ncells_ghost);
+   vector<int>j_tmp(ncells_ghost);
+   vector<int>level_tmp(ncells_ghost);
+   vector<int>nlft_tmp(ncells_ghost);
+   vector<int>nrht_tmp(ncells_ghost);
+   vector<int>nbot_tmp(ncells_ghost);
+   vector<int>ntop_tmp(ncells_ghost);
+   ezcl_enqueue_read_buffer(command_queue, dev_i,     CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_j,     CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nlft,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nrht,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nbot,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_ntop,  CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+   fprintf(fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mype);
+   // Only ncells_ghost entries were read back, so that is all that may be
+   // printed. The former bound MAX(ncells_ghost,ncells) ran off the end of
+   // the host copies whenever ncells was the larger of the two.
+   for (uint ic=0; ic<ncells_ghost; ic++) {
+      fprintf(fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mype,ic, ic+noffset,i_tmp[ic], j_tmp[ic], level_tmp[ic], nlft_tmp[ic], nrht_tmp[ic], nbot_tmp[ic], ntop_tmp[ic]);
+   }
+}
+
+/*
+ * Compare the device copies of i, j and level with the host arrays over the
+ * first ncells_ghost entries and report every difference on the stream fp.
+ *
+ * The four neighbor arrays are read back from the device as well, but they
+ * are not compared here.
+ */
+void Mesh::compare_dev_local_to_local(void)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells_ghost*sizeof(cl_int);
+
+   vector<int> host_i(ncells_ghost);
+   vector<int> host_j(ncells_ghost);
+   vector<int> host_level(ncells_ghost);
+   vector<int> host_nlft(ncells_ghost);
+   vector<int> host_nrht(ncells_ghost);
+   vector<int> host_nbot(ncells_ghost);
+   vector<int> host_ntop(ncells_ghost);
+
+   // Same transfers, order and blocking flags as before: the level read and
+   // the final ntop read are the blocking ones.
+   ezcl_enqueue_read_buffer(queue, dev_i,     CL_FALSE, 0, nbytes, &host_i[0],     NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,     CL_FALSE, 0, nbytes, &host_j[0],     NULL);
+   ezcl_enqueue_read_buffer(queue, dev_level, CL_TRUE,  0, nbytes, &host_level[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nlft,  CL_FALSE, 0, nbytes, &host_nlft[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nrht,  CL_FALSE, 0, nbytes, &host_nrht[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nbot,  CL_FALSE, 0, nbytes, &host_nbot[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_ntop,  CL_TRUE,  0, nbytes, &host_ntop[0],  NULL);
+
+   fprintf(fp,"\n%d:                      Comparing mesh for dev_local to local\n\n",mype);
+
+   uint cell = 0;
+   while (cell < ncells_ghost) {
+      if (host_i[cell] != i[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_i     %d i     %d\n",mype,cell,host_i[cell],i[cell]);
+      }
+      if (host_j[cell] != j[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_j     %d j     %d\n",mype,cell,host_j[cell],j[cell]);
+      }
+      if (host_level[cell] != level[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_level %d level %d\n",mype,cell,host_level[cell],level[cell]);
+      }
+      cell++;
+   }
+
+   fprintf(fp,"\n%d:                 Finished comparing mesh for dev_local to local\n\n",mype);
+}
+
+/*
+ * Read the four neighbor arrays back from the device (ncells entries each)
+ * and print a "DEBUG --" line to stdout for every entry that differs from
+ * the host neighbor arrays.
+ */
+void Mesh::compare_neighbors_gpu_global_to_cpu_global()
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells*sizeof(cl_int);
+
+   vector<int> dev_left(ncells);
+   vector<int> dev_right(ncells);
+   vector<int> dev_bottom(ncells);
+   vector<int> dev_top(ncells);
+
+   // Three non-blocking reads followed by a blocking one.
+   ezcl_enqueue_read_buffer(queue, dev_nlft, CL_FALSE, 0, nbytes, &dev_left[0],   NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nrht, CL_FALSE, 0, nbytes, &dev_right[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nbot, CL_FALSE, 0, nbytes, &dev_bottom[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_ntop, CL_TRUE,  0, nbytes, &dev_top[0],    NULL);
+
+   uint cell = 0;
+   while (cell < ncells) {
+      if (nlft[cell] != dev_left[cell]) {
+         printf("DEBUG -- nlft: ic %d nlft %d nlft_check %d\n",cell, nlft[cell], dev_left[cell]);
+      }
+      if (nrht[cell] != dev_right[cell]) {
+         printf("DEBUG -- nrht: ic %d nrht %d nrht_check %d\n",cell, nrht[cell], dev_right[cell]);
+      }
+      if (nbot[cell] != dev_bottom[cell]) {
+         printf("DEBUG -- nbot: ic %d nbot %d nbot_check %d\n",cell, nbot[cell], dev_bottom[cell]);
+      }
+      if (ntop[cell] != dev_top[cell]) {
+         printf("DEBUG -- ntop: ic %d ntop %d ntop_check %d\n",cell, ntop[cell], dev_top[cell]);
+      }
+      cell++;
+   }
+}
+#endif
+
+/*
+ * Debug check that this rank's neighbor arrays agree with those of a global
+ * (whole-domain) mesh.
+ *
+ * ncells_ghost   length of the local tag array; note that this parameter
+ *                hides the ncells_ghost used elsewhere in the class.
+ * ncells_global  number of cells in mesh_global.
+ * mesh_global    mesh holding the global neighbor arrays.
+ * nsizes         per-rank receive counts for MPI_Allgatherv.
+ * ndispl         per-rank displacements for MPI_Allgatherv.
+ *
+ * Every rank tags its cells with mype*1000+ic, calls L7_Update on the tag
+ * array when numpe > 1 (presumably to fill the ghost entries -- confirm),
+ * and gathers the tags of all ranks. For each direction, and for the
+ * neighbor-of-neighbor in each direction, the tag reached through the local
+ * arrays is gathered and compared with the tag reached through the global
+ * arrays; mismatches are printed to stdout. One-step mismatches are printed
+ * by rank 0 only, two-step mismatches by every rank.
+ *
+ * NOTE(review): tags collide once a rank holds more than 1000 cells.
+ *
+ * Without HAVE_MPI the function does nothing.
+ */
+void Mesh::compare_neighbors_cpu_local_to_cpu_global(uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl)
+{
+
+#ifdef HAVE_MPI
+   int *nlft_global = mesh_global->nlft;
+   int *nrht_global = mesh_global->nrht;
+   int *nbot_global = mesh_global->nbot;
+   int *ntop_global = mesh_global->ntop;
+
+   // Global index range [own_begin, own_end) that MPI_Allgatherv fills from
+   // this rank. The local neighbor arrays are indexed by local cell number,
+   // so a global index has to be shifted by own_begin before it is used on
+   // them, and is only meaningful for cells inside this range.
+   uint own_begin = (uint)ndispl[mype];
+   uint own_end   = own_begin + (uint)nsizes[mype];
+
+   vector<int> Test(ncells_ghost);
+   for(uint ic=0; ic<ncells; ic++){
+      Test[ic] = mype*1000 +ic;
+   }
+   if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle);
+
+   vector<int> Test_global(ncells_global);
+   MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   vector<int> Test_check(ncells);
+   vector<int> Test_check_global(ncells_global);
+
+   // ==================== check left value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nlft[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nlft_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nlft for cell %d -- nlft %d global %d check %d\n",mype,ic,nlft_global[ic],Test_global[nlft_global[ic]],Test_check_global[ic]);
+      }
+   }
+   
+   // ==================== check left left value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nlft[nlft[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n",
+            mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
+         // The local detail line is printed by the owning rank only, using
+         // the local index rather than the global one.
+         if (ic >= own_begin && ic < own_end) {
+            uint lic = ic - own_begin;
+            printf("%d:                         check %5d -- nlftl %5d nlftl nlftl %5d check  %5d\n",
+               mype,ic,nlft[lic],nlft[nlft[lic]],Test_check_global[ic]);
+         }
+      }
+   }
+   
+   // ==================== check right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
+      }
+   }
+   
+   // ==================== check right right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[nrht[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
+            mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
+         if (ic >= own_begin && ic < own_end) {
+            uint lic = ic - own_begin;
+            printf("%d:                         check %5d -- nrhtl %5d nrhtl nrhtl %5d check  %5d\n",
+               mype,ic,nrht[lic],nrht[nrht[lic]],Test_check_global[ic]);
+         }
+      }
+   }
+
+   // ==================== check bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
+      }
+   }
+   
+   // ==================== check bottom bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[nbot[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
+            mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
+         if (ic >= own_begin && ic < own_end) {
+            uint lic = ic - own_begin;
+            printf("%d:                         check %5d -- nbotl %5d nbotl nbotl %5d check  %5d\n",
+               mype,ic,nbot[lic],nbot[nbot[lic]],Test_check_global[ic]);
+         }
+      }
+   }
+   
+   // ==================== check top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check top top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ntop[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
+            mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
+         if (ic >= own_begin && ic < own_end) {
+            uint lic = ic - own_begin;
+            printf("%d:                         check %5d -- ntopl %5d ntopl ntopl %5d check  %5d\n",
+               mype,ic,ntop[lic],ntop[ntop[lic]],Test_check_global[ic]);
+         }
+      }
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- ncells_global %d ncells_ghost %d mesh_global %p nsizes[0] %d ndispl[0] %d\n",
+               ncells_global,ncells_ghost,mesh_global,nsizes[0],ndispl[0]);
+#endif
+}
+
+#ifdef HAVE_OPENCL
+void Mesh::compare_neighbors_all_to_gpu_local(Mesh *mesh_global, int *nsizes, int *ndispl)
+//uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl)
+{
+#ifdef HAVE_MPI
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells_global = mesh_global->ncells;
+   int *nlft_global = mesh_global->nlft;
+   int *nrht_global = mesh_global->nrht;
+   int *nbot_global = mesh_global->nbot;
+   int *ntop_global = mesh_global->ntop;
+
+   // Checking CPU parallel to CPU global
+   vector<int> Test(ncells_ghost);
+   for(uint ic=0; ic<ncells; ic++){
+      Test[ic] = mype*1000 +ic; 
+   }    
+   if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle);
+
+   vector<int> Test_global(ncells_global);
+   MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   vector<int> Test_check(ncells);
+   vector<int> Test_check_global(ncells_global);
+
+   // ==================== check left value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nlft[ic]];
+      //if (mype == 1 && ic==0) printf("%d: nlft check for ic 0 is %d\n",mype,nlft[0]);
+   }    
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      //if (Test_global[nlft_global[ic]] != Test_check_global[ic]) {
+         //if (mype == 0) printf("%d: Error with nlft for cell %d -- nlft %d global %d check %d\n",mype,ic,nlft_global[ic],Test_global[nlft_global[ic]],Test_check_global[ic]);
+      //}  
+   }    
+     
+   // ==================== check left left value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nlft[nlft[ic]]];
+   }    
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n",
+            mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
+         printf("%d:                           check %5d -- nlftl %5d nlftl nlftl %5d check  %5d\n",
+            mype,ic,nlft[ic],nlft[nlft[ic]],Test_check_global[ic]);
+      }          
+   }       
+              
+   // ==================== check right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[ic]];
+   }       
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check right right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[nrht[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
+            mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
+         printf("%d:                         check %5d -- nrhtl %5d nrhtl nrhtl %5d check  %5d\n",
+            mype,ic,nrht[ic],nrht[nrht[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check bottom bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[nbot[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
+            mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
+         printf("%d:                         check %5d -- nbotl %5d nbotl nbotl %5d check  %5d\n",
+            mype,ic,nbot[ic],nbot[nbot[ic]],Test_check_global[ic]);
+      }
+   }
+   // ==================== check top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ic]];
+   }
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check top top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ntop[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
+            mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
+         printf("%d:                         check %5d -- ntopl %5d ntopl ntopl %5d check  %5d\n",
+            mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
+      }
+   }
+   // checking gpu results
+   vector<int> nlft_check(ncells_ghost);         vector<int> nrht_check(ncells_ghost);
+   vector<int> nbot_check(ncells_ghost);         vector<int> ntop_check(ncells_ghost);
+   ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nlft_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nrht_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nbot_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int),  &ntop_check[0], NULL);
+
+   for (uint ic=0; ic<ncells_ghost; ic++){
+      if (nlft[ic] != nlft_check[ic]) printf("%d: Error with gpu calculated nlft for cell %d nlft %d check %d\n",mype,ic,nlft[ic],nlft_check[ic]);
+      if (nrht[ic] != nrht_check[ic]) printf("%d: Error with gpu calculated nrht for cell %d nrht %d check %d\n",mype,ic,nrht[ic],nrht_check[ic]);
+      if (nbot[ic] != nbot_check[ic]) printf("%d: Error with gpu calculated nbot for cell %d nbot %d check %d\n",mype,ic,nbot[ic],nbot_check[ic]);
+      if (ntop[ic] != ntop_check[ic]) printf("%d: Error with gpu calculated ntop for cell %d ntop %d check %d\n",mype,ic,ntop[ic],ntop_check[ic]);
+   }
+
+   // ==================== check top top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ntop[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
+            mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
+         printf("%d:                         check %5d -- ntopl %5d ntopl ntopl %5d check  %5d\n",
+            mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
+      }
+   }
+   // checking gpu results
+   //vector<int> nlft_check(ncells_ghost);         vector<int> nrht_check(ncells_ghost);
+   //vector<int> nbot_check(ncells_ghost);         vector<int> ntop_check(ncells_ghost);
+   ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nlft_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nrht_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int),  &nbot_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int),  &ntop_check[0], NULL);
+
+   for (uint ic=0; ic<ncells_ghost; ic++){
+      if (nlft[ic] != nlft_check[ic]) printf("%d: Error with gpu calculated nlft for cell %d nlft %d check %d\n",mype,ic,nlft[ic],nlft_check[ic]);
+      if (nrht[ic] != nrht_check[ic]) printf("%d: Error with gpu calculated nrht for cell %d nrht %d check %d\n",mype,ic,nrht[ic],nrht_check[ic]);
+      if (nbot[ic] != nbot_check[ic]) printf("%d: Error with gpu calculated nbot for cell %d nbot %d check %d\n",mype,ic,nbot[ic],nbot_check[ic]);
+      if (ntop[ic] != ntop_check[ic]) printf("%d: Error with gpu calculated ntop for cell %d ntop %d check %d\n",mype,ic,ntop[ic],ntop_check[ic]);
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- mesh_global %p nsizes[0] %d ndispl[0] %d\n",
+               mesh_global,nsizes[0],ndispl[0]);
+#endif
+}
+
+// Reads the device-side i, j, level and celltype buffers back to the host
+// and prints a DEBUG line for each of the ncells cells where the host array
+// and the value read from the device disagree.
+void Mesh::compare_indices_gpu_global_to_cpu_global(void)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells*sizeof(cl_int);
+
+   // Host-side landing buffers for the device data.
+   vector<int> i_dev(ncells), j_dev(ncells), level_dev(ncells), celltype_dev(ncells);
+
+   // Only the last read is issued with CL_TRUE (presumably the blocking flag
+   // of the underlying OpenCL read -- confirm against ezcl); the first three
+   // pass CL_FALSE.
+   ezcl_enqueue_read_buffer(queue, dev_i,        CL_FALSE, 0, nbytes, &i_dev[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,        CL_FALSE, 0, nbytes, &j_dev[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_level,    CL_FALSE, 0, nbytes, &level_dev[0],    NULL);
+   ezcl_enqueue_read_buffer(queue, dev_celltype, CL_TRUE,  0, nbytes, &celltype_dev[0], NULL);
+
+   for (uint cell = 0; cell < ncells; cell++){
+      if (i[cell] != i_dev[cell]) {
+         printf("DEBUG -- i: ic %d i %d i_check %d\n", cell, i[cell], i_dev[cell]);
+      }
+      if (j[cell] != j_dev[cell]) {
+         printf("DEBUG -- j: ic %d j %d j_check %d\n", cell, j[cell], j_dev[cell]);
+      }
+      if (level[cell] != level_dev[cell]) {
+         printf("DEBUG -- level: ic %d level %d level_check %d\n", cell, level[cell], level_dev[cell]);
+      }
+      if (celltype[cell] != celltype_dev[cell]) {
+         printf("DEBUG -- celltype: ic %d celltype %d celltype_check %d\n", cell, celltype[cell], celltype_dev[cell]);
+      }
+   }
+}
+#endif
+
+// Gathers the local celltype, i, j and level arrays of every rank with
+// MPI_Allgatherv (counts nsizes, displacements ndispl) and prints a DEBUG
+// line, tagged with the given cycle, for each of the ncells_global entries
+// that differs from the matching array of mesh_global.
+// When HAVE_MPI is not defined no gather takes place and the comparison is
+// made against the freshly constructed check vectors.
+// An alternative check (MPI_Scatterv of the global arrays compared against
+// the local arrays) existed here only as commented-out code and is omitted.
+void Mesh::compare_indices_cpu_local_to_cpu_global(uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl, int cycle)
+{
+   // Reference arrays taken from the global mesh.
+   int *celltype_ref = mesh_global->celltype;
+   int *i_ref        = mesh_global->i;
+   int *j_ref        = mesh_global->j;
+   int *level_ref    = mesh_global->level;
+
+   // Receive buffers for the gathered local data.
+   vector<int> i_all(ncells_global);
+   vector<int> j_all(ncells_global);
+   vector<int> level_all(ncells_global);
+   vector<int> celltype_all(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(&celltype[0], my_count, MPI_INT, &celltype_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&i[0],        my_count, MPI_INT, &i_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&j[0],        my_count, MPI_INT, &j_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&level[0],    my_count, MPI_INT, &level_all[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d\n",
+               nsizes[0],ndispl[0]);
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (celltype_ref[cell] != celltype_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d  %d  \n",
+                cycle, cell, celltype_ref[cell], celltype_all[cell]);
+      }
+      if (i_ref[cell] != i_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d  %d  \n",
+                cycle, cell, i_ref[cell], i_all[cell]);
+      }
+      if (j_ref[cell] != j_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d  %d  \n",
+                cycle, cell, j_ref[cell], j_all[cell]);
+      }
+      if (level_ref[cell] != level_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d  %d  \n",
+                cycle, cell, level_ref[cell], level_all[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+// Cross-checks the mesh index arrays (level, celltype, i, j) between host and
+// device for both this mesh and mesh_global, printing a DEBUG line for each
+// mismatching cell. With HAVE_MPI defined four comparisons are made:
+//   1. local host arrays        vs. local device buffers
+//   2. gathered device values   vs. host arrays of mesh_global
+//   3. gathered local host data vs. host arrays of mesh_global
+//   4. device buffers of mesh_global vs. host arrays of mesh_global
+// Without HAVE_MPI the body only references its arguments to silence
+// compiler warnings.
+void Mesh::compare_indices_all_to_gpu_local(Mesh *mesh_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
+{
+#ifdef HAVE_MPI
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   // Host-side reference arrays of the global mesh.
+   int *level_ref    = mesh_global->level;
+   int *celltype_ref = mesh_global->celltype;
+   int *i_ref        = mesh_global->i;
+   int *j_ref        = mesh_global->j;
+
+   // Device buffers of the global mesh.
+   cl_mem &dev_celltype_ref = mesh_global->dev_celltype;
+   cl_mem &dev_i_ref        = mesh_global->dev_i;
+   cl_mem &dev_j_ref        = mesh_global->dev_j;
+   cl_mem &dev_level_ref    = mesh_global->dev_level;
+
+   const size_t local_bytes  = ncells*sizeof(cl_int);
+   const size_t global_bytes = ncells_global*sizeof(cl_int);
+
+   // ---- 1: local device buffers against local host arrays ----
+   vector<int> level_dev(ncells);
+   vector<int> celltype_dev(ncells);
+   vector<int> i_dev(ncells);
+   vector<int> j_dev(ncells);
+   ezcl_enqueue_read_buffer(queue, dev_level,    CL_FALSE, 0, local_bytes, &level_dev[0],    NULL);
+   ezcl_enqueue_read_buffer(queue, dev_celltype, CL_FALSE, 0, local_bytes, &celltype_dev[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_i,        CL_FALSE, 0, local_bytes, &i_dev[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,        CL_TRUE,  0, local_bytes, &j_dev[0],        NULL);
+   for (uint cell = 0; cell < ncells; cell++){
+      if (level[cell] != level_dev[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d level %d level_check %d\n", mype, cell, level[cell], level_dev[cell]);
+      }
+      if (celltype[cell] != celltype_dev[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d celltype %d celltype_check %d\n", mype, cell, celltype[cell], celltype_dev[cell]);
+      }
+      if (i[cell] != i_dev[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d i %d i_check %d\n", mype, cell, i[cell], i_dev[cell]);
+      }
+      if (j[cell] != j_dev[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d j %d j_check %d\n", mype, cell, j[cell], j_dev[cell]);
+      }
+   }
+
+   // ---- 2: values read from the local device, gathered, against global host arrays ----
+   // These four buffers are reused as the comparison target in steps 3 and 4.
+   vector<int> celltype_all(ncells_global);
+   vector<int> i_all(ncells_global);
+   vector<int> j_all(ncells_global);
+   vector<int> level_all(ncells_global);
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(&celltype_dev[0], my_count, MPI_INT, &celltype_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&i_dev[0],        my_count, MPI_INT, &i_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&j_dev[0],        my_count, MPI_INT, &j_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&level_dev[0],    my_count, MPI_INT, &level_all[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (level_ref[cell] != level_all[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d level_global %d level_check_global %d\n", mype, cell, level_ref[cell], level_all[cell]);
+      }
+      if (celltype_ref[cell] != celltype_all[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d celltype_global %d celltype_check_global %d\n", mype, cell, celltype_ref[cell], celltype_all[cell]);
+      }
+      if (i_ref[cell] != i_all[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d i_global %d i_check_global %d\n", mype, cell, i_ref[cell], i_all[cell]);
+      }
+      if (j_ref[cell] != j_all[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d j_global %d j_check_global %d\n", mype, cell, j_ref[cell], j_all[cell]);
+      }
+   }
+
+   // ---- 3: local host arrays, gathered, against global host arrays ----
+   MPI_Allgatherv(&celltype[0], my_count, MPI_INT, &celltype_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&i[0],        my_count, MPI_INT, &i_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&j[0],        my_count, MPI_INT, &j_all[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&level[0],    my_count, MPI_INT, &level_all[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (celltype_ref[cell] != celltype_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d  %d  \n", ncycle, cell, celltype_ref[cell], celltype_all[cell]);
+      }
+      if (i_ref[cell] != i_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d  %d  \n", ncycle, cell, i_ref[cell], i_all[cell]);
+      }
+      if (j_ref[cell] != j_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d  %d  \n", ncycle, cell, j_ref[cell], j_all[cell]);
+      }
+      if (level_ref[cell] != level_all[cell]) {
+         printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d  %d  \n", ncycle, cell, level_ref[cell], level_all[cell]);
+      }
+   }
+
+   // ---- 4: device buffers of the global mesh against its host arrays ----
+   ezcl_enqueue_read_buffer(queue, dev_celltype_ref, CL_FALSE, 0, global_bytes, &celltype_all[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_i_ref,        CL_FALSE, 0, global_bytes, &i_all[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j_ref,        CL_FALSE, 0, global_bytes, &j_all[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_level_ref,    CL_TRUE,  0, global_bytes, &level_all[0],    NULL);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (celltype_ref[cell] != celltype_all[cell]) {
+         printf("DEBUG rezone 4 at cycle %d celltype_global & celltype_check_global %d %d  %d  \n", ncycle, cell, celltype_ref[cell], celltype_all[cell]);
+      }
+      if (i_ref[cell] != i_all[cell]) {
+         printf("DEBUG rezone 4 at cycle %d i_global & i_check_global %d %d  %d  \n", ncycle, cell, i_ref[cell], i_all[cell]);
+      }
+      if (j_ref[cell] != j_all[cell]) {
+         printf("DEBUG rezone 4 at cycle %d j_global & j_check_global %d %d  %d  \n", ncycle, cell, j_ref[cell], j_all[cell]);
+      }
+      if (level_ref[cell] != level_all[cell]) {
+         printf("DEBUG rezone 4 at cycle %d level_global & level_check_global %d %d  %d  \n", ncycle, cell, level_ref[cell], level_all[cell]);
+      }
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- mesh_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
+               mesh_global,ncells_global,nsizes[0],ndispl[0],ncycle);
+#endif
+}
+
+// Reads dev_x, dev_dx, dev_y, dev_dy and dev_H back from the device and
+// compares them with the host-side x, dx, y, dy arrays and the H argument
+// over the first ncells entries.
+//   - Coordinates (x, dx, y, dy) must be exactly equal.
+//   - H must agree to within CONSERVATION_EPS.
+// On the first mismatch an error line is printed and the process terminates
+// with EXIT_FAILURE so that the parent process sees the failure (the exit
+// status used to be 0, i.e. success).
+void Mesh::compare_coordinates_gpu_global_to_cpu_global_double(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, double *H)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Host-side landing buffers for the device data.
+   vector<spatial_t>x_check(ncells);
+   vector<spatial_t>dx_check(ncells);
+   vector<spatial_t>y_check(ncells);
+   vector<spatial_t>dy_check(ncells);
+   vector<double>H_check(ncells);
+   // Only the last read passes CL_TRUE; the earlier ones pass CL_FALSE.
+   ezcl_enqueue_read_buffer(command_queue, dev_x,   CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dx,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_y,   CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dy,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_H,   CL_TRUE,  0, ncells*sizeof(cl_double), &H_check[0],  NULL);
+
+   // Spatial coordinates: exact comparison.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
+         printf("Error -- mismatch in spatial coordinates for cell %d is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+
+   // State variable H: comparison with tolerance.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
+         printf("Error -- mismatch in H for cell %d is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+}
+
+// Single-precision-H variant: reads dev_x, dev_dx, dev_y, dev_dy and dev_H
+// back from the device and compares them with the host-side x, dx, y, dy
+// arrays and the float H argument over the first ncells entries.
+//   - Coordinates (x, dx, y, dy) must be exactly equal.
+//   - H must agree to within CONSERVATION_EPS.
+// On the first mismatch an error line is printed and the process terminates
+// with EXIT_FAILURE so that the parent process sees the failure (the exit
+// status used to be 0, i.e. success).
+void Mesh::compare_coordinates_gpu_global_to_cpu_global_float(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, float *H)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Host-side landing buffers for the device data.
+   vector<spatial_t>x_check(ncells);
+   vector<spatial_t>dx_check(ncells);
+   vector<spatial_t>y_check(ncells);
+   vector<spatial_t>dy_check(ncells);
+   vector<float>H_check(ncells);
+   // Only the last read passes CL_TRUE; the earlier ones pass CL_FALSE.
+   ezcl_enqueue_read_buffer(command_queue, dev_x,   CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dx,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_y,   CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dy,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_H,   CL_TRUE,  0, ncells*sizeof(cl_float), &H_check[0],  NULL);
+
+   // Spatial coordinates: exact comparison.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
+         printf("Error -- mismatch in spatial coordinates for cell %d is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+
+   // State variable H: comparison with tolerance.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
+         printf("Error -- mismatch in H for cell %d is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+}
+#endif
+
+// Gathers the per-rank x, dx, y, dy and H arrays with MPI_Allgatherv (counts
+// nsizes, displacements ndispl) and prints a DEBUG line, tagged with cycle,
+// for each of the ncells_global entries whose gathered value differs from the
+// corresponding *_global array by more than STATE_EPS.
+// When HAVE_MPI is not defined nothing is gathered and the *_global arrays
+// are compared against the freshly constructed check vectors.
+void Mesh::compare_coordinates_cpu_local_to_cpu_global_double(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, double *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, double *H_global, int cycle)
+{
+   // Receive buffers for the gathered local data.
+   vector<spatial_t> x_all(ncells_global);
+   vector<spatial_t> dx_all(ncells_global);
+   vector<spatial_t> y_all(ncells_global);
+   vector<spatial_t> dy_all(ncells_global);
+   vector<double>    H_all(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(&x[0],  my_count, MPI_SPATIAL_T, &x_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&dx[0], my_count, MPI_SPATIAL_T, &dx_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&y[0],  my_count, MPI_SPATIAL_T, &y_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&dy[0], my_count, MPI_SPATIAL_T, &dy_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&H[0],  my_count, MPI_DOUBLE,    &H_all[0],  nsizes, ndispl, MPI_DOUBLE,    MPI_COMM_WORLD);
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
+               nsizes[0],ndispl[0],x,dx,y,dy,H);
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (fabs(x_global[cell] - x_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d x_global & x_check_global  %d %lf %lf \n", cycle, cell, x_global[cell], x_all[cell]);
+      }
+      if (fabs(dx_global[cell] - dx_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n", cycle, cell, dx_global[cell], dx_all[cell]);
+      }
+      if (fabs(y_global[cell] - y_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d y_global & y_check_global  %d %lf %lf \n", cycle, cell, y_global[cell], y_all[cell]);
+      }
+      if (fabs(dy_global[cell] - dy_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n", cycle, cell, dy_global[cell], dy_all[cell]);
+      }
+      if (fabs(H_global[cell] - H_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d H_global & H_check_global  %d %lf %lf \n", cycle, cell, H_global[cell], H_all[cell]);
+      }
+   }
+}
+
+// Single-precision-H variant: gathers the per-rank x, dx, y, dy and H arrays
+// with MPI_Allgatherv (counts nsizes, displacements ndispl; H is sent as
+// MPI_FLOAT) and prints a DEBUG line, tagged with cycle, for each of the
+// ncells_global entries whose gathered value differs from the corresponding
+// *_global array by more than STATE_EPS.
+// When HAVE_MPI is not defined nothing is gathered and the *_global arrays
+// are compared against the freshly constructed check vectors.
+void Mesh::compare_coordinates_cpu_local_to_cpu_global_float(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, float *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, float *H_global, int cycle)
+{
+   // Receive buffers for the gathered local data.
+   vector<spatial_t> x_all(ncells_global);
+   vector<spatial_t> dx_all(ncells_global);
+   vector<spatial_t> y_all(ncells_global);
+   vector<spatial_t> dy_all(ncells_global);
+   vector<float>     H_all(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(&x[0],  my_count, MPI_SPATIAL_T, &x_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&dx[0], my_count, MPI_SPATIAL_T, &dx_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&y[0],  my_count, MPI_SPATIAL_T, &y_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&dy[0], my_count, MPI_SPATIAL_T, &dy_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&H[0],  my_count, MPI_FLOAT,     &H_all[0],  nsizes, ndispl, MPI_FLOAT,     MPI_COMM_WORLD);
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
+               nsizes[0],ndispl[0],x,dx,y,dy,H);
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (fabs(x_global[cell] - x_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d x_global & x_check_global  %d %lf %lf \n", cycle, cell, x_global[cell], x_all[cell]);
+      }
+      if (fabs(dx_global[cell] - dx_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n", cycle, cell, dx_global[cell], dx_all[cell]);
+      }
+      if (fabs(y_global[cell] - y_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d y_global & y_check_global  %d %lf %lf \n", cycle, cell, y_global[cell], y_all[cell]);
+      }
+      if (fabs(dy_global[cell] - dy_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n", cycle, cell, dy_global[cell], dy_all[cell]);
+      }
+      if (fabs(H_global[cell] - H_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d H_global & H_check_global  %d %lf %lf \n", cycle, cell, H_global[cell], H_all[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+// Reads ncells integers from dev_mpot back to the host and prints a DEBUG
+// line for every cell where the host mpot value and the value read from the
+// device differ.
+void Mesh::compare_mpot_gpu_global_to_cpu_global(int *mpot, cl_mem dev_mpot)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   vector<int> mpot_dev(ncells);
+   ezcl_enqueue_read_buffer(queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot_dev[0], NULL);
+
+   for (uint cell = 0; cell < ncells; cell++) {
+      if (mpot[cell] == mpot_dev[cell]) continue;
+      printf("DEBUG -- mpot: ic %d mpot %d mpot_check %d\n", cell, mpot[cell], mpot_dev[cell]);
+   }
+}
+#endif
+
+// Gathers the per-rank mpot arrays with MPI_Allgatherv (receive counts
+// nsizes, displacements ndispl) and, on rank 0 only, prints a DEBUG line
+// tagged with cycle for each of the ncells_global entries that differs from
+// mpot_global.
+// When HAVE_MPI is not defined nothing is gathered and mpot_global is
+// compared against the freshly constructed buffer.
+void Mesh::compare_mpot_cpu_local_to_cpu_global(uint ncells_global, int *nsizes, int *ndispl, int *mpot, int *mpot_global, int cycle)
+{
+   vector<int> mpot_all(ncells_global);
+#ifdef HAVE_MPI
+   // NOTE(review): the send count here is ncells, whereas the other compare_*
+   // routines pass nsizes[mype]; presumably the two are equal -- confirm
+   // against the callers.
+   MPI_Allgatherv(&mpot[0], ncells, MPI_INT, &mpot_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d mpot %p\n",
+               nsizes[0],ndispl[0],mpot);
+#endif
+   for (uint cell = 0; cell < ncells_global; cell++){
+      // Mismatches are reported by rank 0 only.
+      if (mpot_global[cell] != mpot_all[cell] && mype == 0) {
+         printf("%d: DEBUG refine_potential 3 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",
+                mype, cycle, cell, mpot_global[cell], mpot_all[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+// Cross-checks mpot between host and device, locally and globally, printing
+// a DEBUG line per mismatching cell. With HAVE_MPI defined four comparisons
+// are made:
+//   1. dev_mpot (read back)              vs. mpot          (every rank prints)
+//   2. dev_mpot values, gathered         vs. mpot_global   (rank 0 prints)
+//   3. mpot, gathered                    vs. mpot_global   (rank 0 prints)
+//   4. dev_mpot_global (read back)       vs. mpot_global   (rank 0 prints)
+// Without HAVE_MPI the body only references its arguments to silence
+// compiler warnings.
+void Mesh::compare_mpot_all_to_gpu_local(int *mpot, int *mpot_global, cl_mem dev_mpot, cl_mem dev_mpot_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
+{
+#ifdef HAVE_MPI
+   cl_command_queue queue = ezcl_get_command_queue();
+   const bool is_root = (mype == 0);
+
+   // ---- 1: local device buffer against local host array ----
+   vector<int> mpot_dev(ncells);
+   ezcl_enqueue_read_buffer(queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &mpot_dev[0], NULL);
+   for (uint cell = 0; cell < ncells; cell++){
+      if (mpot[cell] == mpot_dev[cell]) continue;
+      printf("%d: DEBUG refine_potential 1 at cycle %d cell %d mpot & mpot_save %d %d \n",mype,ncycle,cell,mpot[cell],mpot_dev[cell]);
+   }
+
+   // ---- 2: gathered device values against the global host array ----
+   // mpot_all is reused as the comparison target in steps 3 and 4.
+   vector<int> mpot_all(ncells_global);
+   MPI_Allgatherv(&mpot_dev[0], nsizes[mype], MPI_INT, &mpot_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (mpot_global[cell] != mpot_all[cell] && is_root) {
+         printf("%d: DEBUG refine_potential 2 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],mpot_all[cell]);
+      }
+   }
+
+   // ---- 3: gathered local host array against the global host array ----
+   MPI_Allgatherv(&mpot[0], nsizes[mype], MPI_INT, &mpot_all[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (mpot_global[cell] != mpot_all[cell] && is_root) {
+         printf("%d: DEBUG refine_potential 3 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],mpot_all[cell]);
+      }
+   }
+
+   // ---- 4: global device buffer against the global host array ----
+   ezcl_enqueue_read_buffer(queue, dev_mpot_global, CL_TRUE, 0, ncells_global*sizeof(cl_int), &mpot_all[0], NULL);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (mpot_global[cell] != mpot_all[cell] && is_root) {
+         printf("%d: DEBUG refine_potential 4 at cycle %d cell %u mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],mpot_all[cell]);
+      }
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- mpot %p mpot_global %p dev_mpot %p dev_mpot_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
+               mpot,mpot_global,dev_mpot,dev_mpot_global,ncells_global,nsizes[0],ndispl[0],ncycle);
+#endif
+}
+
+// Recomputes on the host the running cell-count offset at the start of each
+// TILE_SIZE-sized tile of the first old_ncells cells and compares it with
+// the per-tile value read back from dev_ioffset, printing a DEBUG line for
+// every tile that differs.
+//
+// Per-cell contribution to the count:
+//   mpot < 0 : a real cell counts 1 only if is_lower_left(i,j); any other
+//              cell counts 1 if is_upper_right(i,j) or is_lower_left(i,j).
+//   mpot >= 0: a real cell counts 4 if mpot is non-zero, otherwise 1; any
+//              other cell counts 2 if mpot is non-zero, otherwise 1.
+//
+// Changes from the earlier version: the message now labels the printed
+// running total as "mtotal" (it was mislabelled "mcount"), and the routine
+// returns early when ncells is zero instead of dividing by a zero work size.
+void Mesh::compare_ioffset_gpu_global_to_cpu_global(uint old_ncells, int *mpot)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // With no cells MIN(ncells, TILE_SIZE) is zero and the work-size
+   // computation below would divide by zero.
+   if (ncells == 0) return;
+
+   size_t local_work_size  = MIN(ncells, TILE_SIZE);
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   //size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; //  For on-device global reduction kernel.
+   size_t block_size     = global_work_size/local_work_size;
+
+   vector<int> ioffset_check(block_size);
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset_check[0], NULL);
+
+   // NOTE(review): block_size is derived from ncells while the loop below
+   // walks the tiles of old_ncells; this presumably relies on the tile count
+   // of old_ncells not exceeding block_size -- confirm against the callers.
+   int mcount, mtotal;
+   mtotal = 0;
+   for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
+      mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells) break;
+
+         if (mpot[ic] < 0) {
+            if (celltype[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i[ic],j[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
+            }
+         }
+         if (mpot[ic] >= 0) {
+            if (celltype[ic] == REAL_CELL){
+               mcount += mpot[ic] ? 4 : 1;
+            } else {
+               mcount += mpot[ic] ? 2 : 1;
+            }
+         }
+      }
+      // The device value is compared with the total accumulated over all
+      // preceding tiles, i.e. before this tile's own count is added.
+      if (mtotal != ioffset_check[ig]) printf("DEBUG ig %d ioffset %d mtotal %d\n",ig,ioffset_check[ig],mtotal);
+      mtotal += mcount;
+   }
+}
+
+// Checks the per-tile offsets computed on the device against a host-side
+// recomputation, first for the local data and then for the global data.
+//
+// Local pass : reads block_size ints from dev_ioffset into ioffset and walks
+//              the tiles of the first old_ncells cells using mpot and this
+//              mesh's celltype, i and j arrays.
+// Global pass: reads block_size_global ints from dev_ioffset_global into
+//              ioffset_global and walks the tiles of the first
+//              old_ncells_global cells using mpot_global, celltype_global,
+//              i_global and j_global.
+//
+// Per-cell contribution to the count (both passes):
+//   mpot < 0 : a real cell counts 1 only if is_lower_left(i,j); any other
+//              cell counts 1 if is_upper_right(i,j) or is_lower_left(i,j).
+//   mpot >= 0: a real cell counts 4 if mpot is non-zero, otherwise 1; any
+//              other cell counts 2 if mpot is non-zero, otherwise 1.
+//
+// Each tile whose device offset differs from the running host total gets a
+// DEBUG line. In the global pass the process terminates once more than 10
+// mismatching tiles have been seen; it now exits with EXIT_FAILURE (the
+// status used to be 0, which reported success to the parent process).
+void Mesh::compare_ioffset_all_to_gpu_local(uint old_ncells, uint old_ncells_global, int block_size, int block_size_global, int *mpot, int *mpot_global, cl_mem dev_ioffset, cl_mem dev_ioffset_global, int *ioffset, int *ioffset_global, int *celltype_global, int *i_global, int *j_global)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // This compares ioffset for each block in the calculation
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset[0], NULL);
+   int mtotal = 0;
+   for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
+      int mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells) break;
+
+         if (mpot[ic] < 0) {
+            if (celltype[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i[ic],j[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
+            }
+         }
+         if (mpot[ic] >= 0) {
+            if (celltype[ic] == REAL_CELL){
+               mcount += mpot[ic] ? 4 : 1;
+            } else {
+               mcount += mpot[ic] ? 2 : 1;
+            }
+         }
+      }
+      // Compare against the total accumulated over all preceding tiles.
+      if (mtotal != ioffset[ig]) printf("%d: DEBUG ig %d ioffset %d mtotal %d\n",mype,ig,ioffset[ig],mtotal);
+      mtotal += mcount;
+   }
+
+   // For global This compares ioffset for each block in the calculation
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset_global, CL_TRUE, 0, block_size_global*sizeof(cl_int), &ioffset_global[0], NULL);
+   mtotal = 0;
+   int count = 0;   // number of mismatching global tiles seen so far
+   for (uint ig=0; ig<(old_ncells_global+TILE_SIZE-1)/TILE_SIZE; ig++){
+      int mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells_global) break;
+
+         if (mpot_global[ic] < 0) {
+            if (celltype_global[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i_global[ic],j_global[ic]) || is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
+            }
+         }
+
+         if (mpot_global[ic] >= 0) {
+            if (celltype_global[ic] == REAL_CELL) {
+               mcount += mpot_global[ic] ? 4 : 1;
+            } else {
+               mcount += mpot_global[ic] ? 2 : 1;
+            }
+         }
+      }
+      if (mtotal != ioffset_global[ig]) {
+         printf("DEBUG global ig %d ioffset %d mtotal %d\n",ig,ioffset_global[ig],mtotal);
+         count++;
+      }
+      // Give up after more than 10 bad tiles and signal the failure through
+      // the exit status.
+      if (count > 10) exit(EXIT_FAILURE);
+      mtotal += mcount;
+   }
+}
+#endif
+
+// Construct the mesh bookkeeping for an nx-by-ny coarse grid.
+//
+//   nx, ny        coarse-grid cell counts in the i and j directions
+//   levmx_in      maximum refinement level (stored in levmx)
+//   ndim_in       dimensionality flag (stored in ndim)
+//   deltax_in     coarse cell width  (stored in deltax)
+//   deltay_in     coarse cell height (stored in deltay)
+//   boundary      nonzero widens the physical extent by two cells per direction
+//   parallel_in   nonzero enables the MPI rank/size query and mesh_memory.pinit()
+//   do_gpu_calc   nonzero uploads the per-level tables to the device (OpenCL builds)
+//
+// Only scalar state, the per-level tables and the corner lists are set up here;
+// the cell arrays themselves are left as NULL pointers.
+Mesh::Mesh(int nx, int ny, int levmx_in, int ndim_in, double deltax_in, double deltay_in, int boundary, int parallel_in, int do_gpu_calc)
+{
+   lowerBound_Global = NULL;
+   upperBound_Global = NULL;
+
+   // Zero every timer and counter slot
+   for (int itimer = 0; itimer < MESH_TIMER_SIZE; ++itimer) {
+      cpu_timers[itimer] = 0.0;
+      gpu_timers[itimer] = 0L;
+   }
+   for (int icounter = 0; icounter < MESH_COUNTER_SIZE; ++icounter) {
+      cpu_counters[icounter] = 0;
+      gpu_counters[icounter] = 0;
+   }
+
+   ndim  = ndim_in;
+   levmx = levmx_in;
+#ifdef HAVE_OPENCL
+   if (ndim == TWO_DIMENSIONAL) {
+      defines = "-DTWO_DIMENSIONAL -DCARTESIAN";
+   }
+#endif
+
+   offtile_ratio_local = 0;
+   offtile_local_count = 1;
+
+   // Serial defaults; rank and size are replaced below when MPI is up
+   mype         = 0;
+   numpe        = 1;
+   ncells       = 0;
+   ncells_ghost = 0;
+   parallel     = parallel_in;
+   noffset      = 0;
+   mem_factor   = 1.0;
+
+#ifdef HAVE_MPI
+   int mpi_is_up;
+   MPI_Initialized(&mpi_is_up);
+   if (parallel && mpi_is_up) {
+      MPI_Comm_rank(MPI_COMM_WORLD,&mype);
+      MPI_Comm_size(MPI_COMM_WORLD,&numpe);
+   }
+   // TODO add fini
+   // NOTE(review): pinit is reached whenever parallel is set, even if
+   // MPI_Initialized reported false -- confirm that is intended.
+   if (parallel) {
+      mesh_memory.pinit(MPI_COMM_WORLD, 2L * 1024 * 1024 * 1024);
+   }
+#endif
+   cell_handle = 0;
+
+   if (numpe == 1) {
+      mem_factor = 1.0;
+   }
+
+   deltax = deltax_in;
+   deltay = deltay_in;
+
+   have_boundary = boundary;
+
+   // The index range is the same with or without boundary cells; only the
+   // number of cells spanned by the physical domain differs.
+   imin = 0;
+   jmin = 0;
+   imax = nx + 1;
+   jmax = ny + 1;
+   const int ncols = have_boundary ? nx + 2 : nx;
+   const int nrows = have_boundary ? ny + 2 : ny;
+
+   // Physical domain is centred on the origin
+   xmin = -deltax * 0.5 * (real_t)ncols;
+   xmax =  deltax * 0.5 * (real_t)ncols;
+   ymin = -deltay * 0.5 * (real_t)nrows;
+   ymax =  deltay * 0.5 * (real_t)nrows;
+
+   // One table entry per refinement level, 0..levmx
+   size_t nlevels = levmx + 1;
+
+   levtable.resize(nlevels);
+   lev_ibegin.resize(nlevels);
+   lev_iend.resize(nlevels);
+   lev_jbegin.resize(nlevels);
+   lev_jend.resize(nlevels);
+   lev_deltax.resize(nlevels);
+   lev_deltay.resize(nlevels);
+
+   // Level 0: index bounds one cell inside imin/imax and jmin/jmax
+   lev_ibegin[0] = imin + 1;
+   lev_iend[0]   = imax - 1;
+   lev_jbegin[0] = jmin + 1;
+   lev_jend[0]   = jmax - 1;
+   lev_deltax[0] = deltax;
+   lev_deltay[0] = deltay;
+
+   // Each finer level doubles the index range and halves the cell size
+   for (int fine = 1; fine <= levmx; fine++) {
+      const int coarse = fine - 1;
+      lev_ibegin[fine] = 2 * lev_ibegin[coarse];
+      lev_iend[fine]   = 2 * lev_iend[coarse] + 1;
+      lev_jbegin[fine] = 2 * lev_jbegin[coarse];
+      lev_jend[fine]   = 2 * lev_jend[coarse] + 1;
+      lev_deltax[fine] = 0.5 * lev_deltax[coarse];
+      lev_deltay[fine] = 0.5 * lev_deltay[coarse];
+   }
+   for (uint lev = 0; lev < nlevels; lev++) {
+      levtable[lev] = IPOW2(lev);
+   }
+
+   if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+      // CL_MEM_COPY_HOST_PTR copies the host table to the device as part of the allocation
+      dev_levtable = ezcl_malloc(&levtable[0],   const_cast<char *>("dev_levtable"), &nlevels, sizeof(cl_int),    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_levdx    = ezcl_malloc(&lev_deltax[0], const_cast<char *>("dev_levdx"),    &nlevels, sizeof(cl_real_t), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_levdy    = ezcl_malloc(&lev_deltay[0], const_cast<char *>("dev_levdy"),    &nlevels, sizeof(cl_real_t), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_levibeg  = ezcl_malloc(&lev_ibegin[0], const_cast<char *>("dev_levibeg"),  &nlevels, sizeof(cl_int),    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_leviend  = ezcl_malloc(&lev_iend[0],   const_cast<char *>("dev_leviend"),  &nlevels, sizeof(cl_int),    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_levjbeg  = ezcl_malloc(&lev_jbegin[0], const_cast<char *>("dev_levjbeg"),  &nlevels, sizeof(cl_int),    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+      dev_levjend  = ezcl_malloc(&lev_jend[0],   const_cast<char *>("dev_levjend"),  &nlevels, sizeof(cl_int),    CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0);
+#endif
+   }
+
+   ibase = 0;
+
+   // Record the finest-level (i,j) indices covered by each of the four corner
+   // positions, in the order (0,0), (0,jmax), (imax,0), (imax,jmax).
+   const int num_corners = 4;
+   const int corner_i0[] = {   0,   0,imax,imax};
+   const int corner_j0[] = {   0,jmax,   0,jmax};
+   const int fine_per_coarse = IPOW2(levmx);
+
+   for (int ncorner = 0; ncorner < num_corners; ncorner++) {
+      const int jfirst = corner_j0[ncorner] * fine_per_coarse;
+      const int jstop  = (corner_j0[ncorner] + 1) * fine_per_coarse;
+      const int ifirst = corner_i0[ncorner] * fine_per_coarse;
+      const int istop  = (corner_i0[ncorner] + 1) * fine_per_coarse;
+      for (int jfine = jfirst; jfine < jstop; jfine++) {
+         for (int ifine = ifirst; ifine < istop; ifine++) {
+            corners_i.push_back(ifine);
+            corners_j.push_back(jfine);
+         }
+      }
+   }
+
+   do_rezone     = true;
+   gpu_do_rezone = true;
+
+   // Cell arrays are not allocated yet
+   celltype = NULL;
+   nlft     = NULL;
+   nrht     = NULL;
+   nbot     = NULL;
+   ntop     = NULL;
+}
+
+// Build the initial mesh, or finish setting up a restarted one.
+//
+//   nx, ny         coarse-grid cell counts in the i and j directions
+//   circ_radius    radius passed to the KD-tree circle-intersection query that
+//                  selects the cells to refine at each level
+//   initial_order  ordering method handed to partition_cells()
+//   do_gpu_calc    nonzero builds the OpenCL program and creates the mesh kernels
+//
+// If ncells is already nonzero on entry the call is treated as a restart: only
+// the per-rank sizes/offsets and the cell types are recomputed. Otherwise the
+// coarse cells are generated, partitioned, and refined once per level up to
+// levmx. In both cases the corner index lists are rebuilt at the end.
+void Mesh::init(int nx, int ny, real_t circ_radius, partition_method initial_order, int do_gpu_calc)
+{
+   if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+      cl_context context = ezcl_get_context();
+
+      hash_lib_init();
+      if (mype == 0) printf("Starting compile of kernels in mesh\n");
+      // Compile the hash kernels and the mesh kernels as one source string.
+      // strcpy/strcat keep the buffer NUL-terminated, so no extra terminator is appended.
+      char *bothsources = (char *)malloc(strlen(mesh_kern_source)+strlen(get_hash_kernel_source_string())+1);
+      strcpy(bothsources, get_hash_kernel_source_string());
+      strcat(bothsources, mesh_kern_source);
+      // NOTE(review): this local shadows the member `defines` that the constructor
+      // sets for TWO_DIMENSIONAL, so the program is built with no defines -- confirm
+      // that is intended.
+      const char *defines = NULL;
+      cl_program program = ezcl_create_program_wsource(context, defines, bothsources);
+      free(bothsources);
+
+      kernel_reduction_scan2          = ezcl_create_kernel_wprogram(program, "finish_reduction_scan2_cl");
+      kernel_reduction_count          = ezcl_create_kernel_wprogram(program, "finish_reduction_count_cl");
+      kernel_reduction_count2         = ezcl_create_kernel_wprogram(program, "finish_reduction_count2_cl");
+      kernel_hash_adjust_sizes        = ezcl_create_kernel_wprogram(program, "hash_adjust_sizes_cl");
+      kernel_hash_setup               = ezcl_create_kernel_wprogram(program, "hash_setup_cl");
+      kernel_hash_setup_local         = ezcl_create_kernel_wprogram(program, "hash_setup_local_cl");
+      kernel_neighbor_init            = ezcl_create_kernel_wprogram(program, "neighbor_init_cl");
+      kernel_calc_neighbors           = ezcl_create_kernel_wprogram(program, "calc_neighbors_cl");
+      kernel_calc_neighbors_local     = ezcl_create_kernel_wprogram(program, "calc_neighbors_local_cl");
+      kernel_calc_border_cells        = ezcl_create_kernel_wprogram(program, "calc_border_cells_cl");
+      kernel_calc_border_cells2       = ezcl_create_kernel_wprogram(program, "calc_border_cells2_cl");
+      kernel_finish_scan              = ezcl_create_kernel_wprogram(program, "finish_scan_cl");
+      kernel_get_border_data          = ezcl_create_kernel_wprogram(program, "get_border_data_cl");
+      kernel_calc_layer1              = ezcl_create_kernel_wprogram(program, "calc_layer1_cl");
+      kernel_calc_layer1_sethash      = ezcl_create_kernel_wprogram(program, "calc_layer1_sethash_cl");
+      kernel_calc_layer2              = ezcl_create_kernel_wprogram(program, "calc_layer2_cl");
+      kernel_get_border_data2         = ezcl_create_kernel_wprogram(program, "get_border_data2_cl");
+      kernel_calc_layer2_sethash      = ezcl_create_kernel_wprogram(program, "calc_layer2_sethash_cl");
+      kernel_copy_mesh_data           = ezcl_create_kernel_wprogram(program, "copy_mesh_data_cl");
+      kernel_fill_mesh_ghost          = ezcl_create_kernel_wprogram(program, "fill_mesh_ghost_cl");
+      kernel_fill_neighbor_ghost      = ezcl_create_kernel_wprogram(program, "fill_neighbor_ghost_cl");
+      kernel_set_corner_neighbor      = ezcl_create_kernel_wprogram(program, "set_corner_neighbor_cl");
+      kernel_adjust_neighbors_local   = ezcl_create_kernel_wprogram(program, "adjust_neighbors_local_cl");
+      kernel_hash_size                = ezcl_create_kernel_wprogram(program, "calc_hash_size_cl");
+      kernel_finish_hash_size         = ezcl_create_kernel_wprogram(program, "finish_reduction_minmax4_cl");
+      kernel_calc_spatial_coordinates = ezcl_create_kernel_wprogram(program, "calc_spatial_coordinates_cl");
+      kernel_do_load_balance_lower    = ezcl_create_kernel_wprogram(program, "do_load_balance_lower_cl");
+      kernel_do_load_balance_middle   = ezcl_create_kernel_wprogram(program, "do_load_balance_middle_cl");
+      kernel_do_load_balance_upper    = ezcl_create_kernel_wprogram(program, "do_load_balance_upper_cl");
+#ifndef MINIMUM_PRECISION
+      kernel_do_load_balance_double   = ezcl_create_kernel_wprogram(program, "do_load_balance_double_cl");
+#endif
+      kernel_do_load_balance_float    = ezcl_create_kernel_wprogram(program, "do_load_balance_float_cl");
+      kernel_refine_smooth            = ezcl_create_kernel_wprogram(program, "refine_smooth_cl");
+      kernel_coarsen_smooth           = ezcl_create_kernel_wprogram(program, "coarsen_smooth_cl");
+      kernel_coarsen_check_block      = ezcl_create_kernel_wprogram(program, "coarsen_check_block_cl");
+      kernel_rezone_all               = ezcl_create_kernel_wprogram(program, "rezone_all_cl");
+      kernel_rezone_neighbors         = ezcl_create_kernel_wprogram(program, "rezone_neighbors_cl");
+#ifndef MINIMUM_PRECISION
+      kernel_rezone_one_double        = ezcl_create_kernel_wprogram(program, "rezone_one_double_cl");
+#endif
+      kernel_rezone_one_float         = ezcl_create_kernel_wprogram(program, "rezone_one_float_cl");
+      kernel_copy_mpot_ghost_data     = ezcl_create_kernel_wprogram(program, "copy_mpot_ghost_data_cl");
+      kernel_set_boundary_refinement  = ezcl_create_kernel_wprogram(program, "set_boundary_refinement");
+      init_kernel_2stage_sum();
+      init_kernel_2stage_sum_int();
+      if (! have_boundary){
+        kernel_count_BCs              = ezcl_create_kernel_wprogram(program, "count_BCs_cl");
+      }
+
+      ezcl_program_release(program);
+      if (mype == 0) printf("Finishing compile of kernels in mesh\n");
+#endif
+   }
+
+   //KDTree_Initialize(&tree);
+   if (ncells > 0) { // this is a restart.
+        nsizes.resize (numpe);
+        ndispl.resize (numpe);
+       if (parallel && numpe > 1) {
+#ifdef HAVE_MPI
+          // Gather every rank's cell count and turn the counts into offsets
+          int ncells_int = ncells;
+          MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+          ndispl[0]=0;
+          for (int ip=1; ip<numpe; ip++){
+             ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+          }
+          noffset=ndispl[mype];
+          ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
+#endif
+       } else {
+          noffset = 0;
+          ncells_global = ncells;
+          proc.resize (ncells);
+          calc_distribution(numpe);
+       }
+       calc_celltype(ncells);
+
+   } else {
+       // Fresh start: index range and cell counts of the coarse grid
+       int istart = 1,
+           jstart = 1,
+           iend   = nx,
+           jend   = ny,
+           nxx    = nx,
+           nyy    = ny;
+       if (have_boundary) {
+          istart = 0;
+          jstart = 0;
+          iend   = nx + 1;
+          jend   = ny + 1;
+          nxx    = nx + 2;
+          nyy    = ny + 2;
+       }
+
+       // With boundary cells in 2D the four corner cells are not part of the mesh
+       if (ndim == TWO_DIMENSIONAL) ncells = nxx * nyy - have_boundary * 4;
+       else                         ncells = nxx * nyy;
+
+       noffset = 0;
+       if (parallel) {
+          ncells_global = ncells;
+          nsizes.resize(numpe);
+          ndispl.resize(numpe);
+
+          // Even split; the first (ncells_global % numpe) ranks take one extra cell
+          for (int ip=0; ip<numpe; ip++){
+             nsizes[ip] = ncells_global/numpe;
+             if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+          }
+
+          ndispl[0]=0;
+          for (int ip=1; ip<numpe; ip++){
+             ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+          }
+          ncells= nsizes[mype];
+          noffset=ndispl[mype];
+       }
+
+       allocate(ncells);
+       index.resize(ncells);
+
+       // ic counts cells globally; only those in [noffset, noffset+ncells) are stored locally
+       int ic = 0;
+
+       for (int jj = jstart; jj <= jend; jj++) {
+          for (int ii = istart; ii <= iend; ii++) {
+             if (have_boundary && ii == 0    && jj == 0   ) continue;
+             if (have_boundary && ii == 0    && jj == jend) continue;
+             if (have_boundary && ii == iend && jj == 0   ) continue;
+             if (have_boundary && ii == iend && jj == jend) continue;
+
+             if (ic >= (int)noffset && ic < (int)(ncells+noffset)){
+                int iclocal = ic-noffset;
+                index[iclocal] = ic;
+                i[iclocal]     = ii;
+                j[iclocal]     = jj;
+                level[iclocal] = 0;
+             }
+             ic++;
+          }
+       }
+
+       //if (numpe > 1 && (initial_order != HILBERT_SORT && initial_order != HILBERT_PARTITION) ) mem_factor = 2.0;
+       partition_cells(numpe, index, initial_order);
+
+       calc_celltype(ncells);
+       calc_spatial_coordinates(0);
+
+       //  Start lev loop here
+       for (int ilevel=1; ilevel<=levmx; ilevel++) {
+
+          //int old_ncells = ncells;
+
+          ncells_ghost = ncells;
+          calc_neighbors_local();
+
+          kdtree_setup();
+
+          // nez receives the number of hits, ind their cell indices
+          int nez;
+          vector<int> ind(ncells);
+
+    #ifdef FULL_PRECISION
+          KDTree_QueryCircleIntersect_Double(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
+    #else
+          KDTree_QueryCircleIntersect_Float(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
+    #endif
+
+          vector<int> mpot(ncells_ghost,0);
+
+          // Flag every hit that is not yet at the maximum level
+          for (int ic=0; ic<nez; ++ic){
+             if (level[ind[ic]] < levmx) mpot[ind[ic]] = 1;
+          }
+
+          KDTree_Destroy(&tree);
+          //  Refine the cells.
+          int icount = 0;
+          int jcount = 0;
+          int new_ncells = refine_smooth(mpot, icount, jcount);
+
+          MallocPlus dummy;
+          rezone_all(icount, jcount, mpot, 0, dummy);
+
+          ncells = new_ncells;
+
+          calc_spatial_coordinates(0);
+
+    #ifdef HAVE_MPI
+          if (parallel && numpe > 1) {
+             int ncells_int = ncells;
+             MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+             ndispl[0]=0;
+             for (int ip=1; ip<numpe; ip++){
+                ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+             }
+             noffset=ndispl[mype];
+             ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
+          }
+    #endif
+       }  // End lev loop here
+       index.clear();
+       ncells_ghost = ncells;
+   }
+
+   // The constructor has already filled corners_i/corners_j. Start from empty
+   // lists so each corner cell is recorded once and no entries from an earlier
+   // fill are left behind.
+   corners_i.clear();
+   corners_j.clear();
+
+   int ncells_corners = 4;
+   int i_corner[] = {   0,   0,imax,imax};
+   int j_corner[] = {   0,jmax,   0,jmax};
+
+   for(int ic=0; ic<ncells_corners; ic++){
+      for (int    jj = j_corner[ic]*IPOW2(levmx); jj < (j_corner[ic]+1)*IPOW2(levmx); jj++) {
+         for (int ii = i_corner[ic]*IPOW2(levmx); ii < (i_corner[ic]+1)*IPOW2(levmx); ii++) {
+            corners_i.push_back(ii);
+            corners_j.push_back(jj);
+         }
+      }
+   }
+}
+
+// Smooth the per-cell refinement flags in mpot and return the resulting cell count.
+//
+//   mpot    in/out: one flag per cell. A value > 0 is treated below as "this cell
+//           will be one level finer"; negative values go through the two
+//           coarsening checks (presumably coarsening requests -- confirm against
+//           the code that fills mpot).
+//   icount  in/out: set by rezone_count(); newly flagged cells are added to it
+//   jcount  in/out: set by rezone_count()
+//
+// Phases, in order:
+//   1. Up to levmx-1 passes that flag a cell (mpot = 1) when a neighbor's level,
+//      counting its own pending refinement, exceeds the cell's level by more
+//      than one. Passes stop early once no rank flags a new cell.
+//   2. Negative flags are reset to 0 when either of two neighbors, picked by the
+//      is_upper_right/left, is_lower_right/left tests, would be at a finer level.
+//   3. Negative flags are reset to 0 unless three other cells (n1, n2, n3) all
+//      carry the flag -1 and sit at the same level as the cell.
+//   4. Cells with celltype < 0 copy the flag of one neighbor, chosen by the
+//      boundary type.
+//
+// Returns ncells plus the value of a final rezone_count() call. Also sets
+// do_rezone and updates the refine-smooth counter and (at TIMING_LEVEL >= 2) timer.
+size_t Mesh::refine_smooth(vector<int> &mpot, int &icount, int &jcount)
+{
+   // Declared outside the parallel region, so these are shared by all threads
+   vector<int> mpot_old;
+
+   int newcount;
+   int newcount_global;
+
+   struct timeval tstart_lev2;
+
+   rezone_count(mpot, icount, jcount);
+
+#ifdef _OPENMP
+#pragma omp parallel
+{ //START Parallel Region
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+   newcount = icount;
+   newcount_global = newcount;
+
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+   }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+   // Phase 1: propagate refinement flags
+   if(newcount_global > 0 && levmx > 1) {
+
+      size_t my_ncells=ncells;
+      if (parallel) my_ncells=ncells_ghost;
+
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+      cpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
+
+      mpot_old.resize(my_ncells);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+      int levcount = 1;
+       
+      while (newcount_global > 0 && levcount < levmx){
+
+         levcount++; 
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+
+         // Previous pass's flags become the read-only input of this pass
+         mpot.swap(mpot_old);
+         newcount=0;
+#ifdef HAVE_MPI
+         if (numpe > 1) {
+            L7_Update(&mpot_old[0], L7_INT, cell_handle);
+         }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+         // NOTE(review): upperBound/lowerBound are only referenced by the
+         // commented-out loop header below.
+         int upperBound, lowerBound;
+         get_bounds(upperBound, lowerBound);
+         int mynewcount = newcount; //All threads get a mynewcount
+
+         // NOTE(review): the loop body only increments mynewcount, never newcount,
+         // so the reduction clause looks redundant; the totals are combined by the
+         // atomic add after the loop.
+#ifdef _OPENMP
+#pragma omp for reduction(+:newcount)
+#endif
+         for(uint ic = 0; ic < ncells; ic++) {
+        // for(uint ic = lowerBound; ic < upperBound; ic++){
+            int lev = level[ic];
+            mpot[ic] = mpot_old[ic];
+            // Already flagged for refinement: nothing more to decide
+            if(mpot_old[ic] > 0) continue;
+   
+            // Left neighbor, then (if the left neighbor is finer) the cell above it
+            int nl = nlft[ic];
+            if (nl >= 0 && nl < (int)ncells_ghost) {
+               int ll = level[nl];
+               if(mpot_old[nl] > 0) ll++;
+   
+               if(ll - lev > 1) {
+                  mpot[ic]=1;
+                  mynewcount++;
+                  continue;
+               }
+
+               ll = level[nl];
+               if (ll > lev) {
+                  int nlt = ntop[nl];
+                  if (nlt >= 0 && nlt < (int)ncells_ghost) {
+                     int llt = level[nlt];
+                     if(mpot_old[nlt] > 0) llt++;
+
+                     if(llt - lev > 1) {
+                        mpot[ic]=1;
+                        mynewcount++;
+                        continue;
+                     }
+                  }
+               }
+            }
+
+            // Right neighbor, then (if finer) the cell above it
+            int nr = nrht[ic];
+            if (nr >= 0 && nr < (int)ncells_ghost) {
+               int lr = level[nr];
+               if(mpot_old[nr] > 0) lr++;
+   
+               if(lr - lev > 1) {
+                  mpot[ic]=1;
+                  mynewcount++;
+                  continue;
+               }
+
+               lr = level[nr];
+               if (lr > lev) {
+                  int nrt = ntop[nr];
+                  if (nrt >= 0 && nrt < (int)ncells_ghost) {
+                     int lrt = level[nrt];
+                     if(mpot_old[nrt] > 0) lrt++;
+
+                     if(lrt - lev > 1) {
+                        mpot[ic]=1;
+                        mynewcount++;
+                        continue;
+                     }
+                  }
+               }
+            }
+
+            // Top neighbor, then (if finer) the cell to its right
+            int nt = ntop[ic];
+            if (nt >= 0 && nt < (int)ncells_ghost) {
+               int lt = level[nt];
+               if(mpot_old[nt] > 0) lt++;
+   
+               if(lt - lev > 1) {
+                  mpot[ic]=1;
+                  mynewcount++;
+                  continue;
+               }
+
+               lt = level[nt];
+               if (lt > lev) {
+                  int ntr = nrht[nt];
+                  if (ntr >= 0 && ntr < (int)ncells_ghost) {
+                     int ltr = level[ntr];
+                     if(mpot_old[ntr] > 0) ltr++;
+
+                     if(ltr - lev > 1) {
+                        mpot[ic]=1;
+                        mynewcount++;
+                        continue;
+                     }
+                  }
+               }
+            }
+
+            // Bottom neighbor, then (if finer) the cell to its right
+            int nb = nbot[ic];
+            if (nb >= 0 && nb < (int)ncells_ghost) {
+               int lb = level[nb];
+               if(mpot_old[nb] > 0) lb++;
+   
+               if(lb - lev > 1) {
+                  mpot[ic]=1;
+                  mynewcount++;
+                  continue;
+               }
+
+               lb = level[nb];
+               if (lb > lev) {
+                  int nbr = nrht[nb];
+                  if (nbr >= 0 && nbr < (int)ncells_ghost) {
+                     int lbr = level[nbr];
+                     if(mpot_old[nbr] > 0) lbr++;
+
+                     if(lbr - lev > 1) {
+                        mpot[ic]=1;
+                        mynewcount++;
+                        continue;
+                     }
+                  }
+               }
+            }
+         }
+         // Fold each thread's private tally into the shared count
+#ifdef _OPENMP
+#pragma omp atomic 
+#endif
+         newcount += mynewcount;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+{
+#endif
+         icount += newcount;
+         newcount_global = newcount;
+
+#ifdef HAVE_MPI
+         if (parallel) {
+            MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+         }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+      } // while (newcount_global > 0 && levcount < levmx);
+
+   }
+
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      L7_Update(&mpot[0], L7_INT, cell_handle);
+  }
+#endif
+
+   // After the swap mpot_old holds the current flags and mpot is a zeroed
+   // vector of ncells_ghost entries that phase 2 fills in.
+   mpot_old.clear();
+   mpot_old.resize(ncells_ghost);
+
+   mpot_old.swap(mpot);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+   // Phase 2: drop a negative flag when either neighbor on the two sides picked
+   // by the cell's position test would be at a finer level than the cell.
+   // NOTE(review): unlike phase 1, neighbor indices are not range-checked here.
+   // NOTE(review): flags <= -1000000 are skipped; the meaning of that sentinel
+   // is not visible in this function.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+   for(uint ic=0; ic<ncells; ic++) {
+      mpot[ic] = mpot_old[ic];
+      if (mpot_old[ic] >= 0) continue;
+      if (mpot_old[ic] <= -1000000) continue;
+      if (        is_upper_right(i[ic],j[ic]) ) {
+         int nr = nrht[ic];
+         int lr = level[nr];
+         if (mpot_old[nr] > 0) lr++;
+         int nt = ntop[ic];
+         int lt = level[nt];
+         if (mpot_old[nt] > 0) lt++;
+         if (lr > level[ic] || lt > level[ic]) mpot[ic] = 0;
+      } else if ( is_upper_left(i[ic],j[ic] ) ) {
+         int nl = nlft[ic];
+         int ll = level[nl];
+         if (mpot_old[nl] > 0) ll++;
+         int nt = ntop[ic];
+         int lt = level[nt];
+         if (mpot_old[nt] > 0) lt++;
+         if (ll > level[ic] || lt > level[ic]) mpot[ic] = 0;
+      } else if ( is_lower_right(i[ic],j[ic] ) ) {
+         int nr = nrht[ic];
+         int lr = level[nr];
+         if (mpot_old[nr] > 0) lr++;
+         int nb = nbot[ic];
+         int lb = level[nb];
+         if (mpot_old[nb] > 0) lb++;
+         if (lr > level[ic] || lb > level[ic]) mpot[ic] = 0;
+      } else if ( is_lower_left(i[ic],j[ic] ) ) {
+         int nl = nlft[ic];
+         int ll = level[nl];
+         if (mpot_old[nl] > 0) ll++;
+         int nb = nbot[ic];
+         int lb = level[nb];
+         if (mpot_old[nb] > 0) lb++;
+         if (ll > level[ic] || lb > level[ic]) mpot[ic] = 0;
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      L7_Update(&mpot[0], L7_INT, cell_handle);
+  }
+#endif
+
+   mpot_old.swap(mpot);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+   // Phase 3: keep a negative flag only if the three cells n1, n2, n3 selected
+   // below all carry the flag -1 and are at the same level as this cell.
+   // NOTE(review): n1 is used to index a neighbor array before any sign check;
+   // only n3 is tested for < 0.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+   for(uint ic=0; ic<ncells; ic++) {
+      int n1=0, n2=0, n3=0;
+      mpot[ic] = mpot_old[ic];
+      if (mpot_old[ic] >= 0) continue;
+      if (mpot_old[ic] <= -1000000) continue;
+      if ( is_upper_right(i[ic],j[ic]) ) {
+         n1 = nbot[ic];
+         n2 = nlft[ic];
+         n3 = nlft[n1];
+      } else if ( is_upper_left(i[ic],j[ic] ) ) {
+         n1 = nbot[ic];
+         n2 = nrht[ic];
+         n3 = nrht[n1];
+      } else if ( is_lower_right(i[ic],j[ic] ) ) {
+         n1 = ntop[ic];
+         n2 = nlft[ic];
+         n3 = nlft[n1];
+      } else if ( is_lower_left(i[ic],j[ic] ) ) {
+         n1 = ntop[ic];
+         n2 = nrht[ic];
+         n3 = nrht[n1];
+      }
+      if (n3 < 0) {
+         mpot[ic] = 0;
+      } else {
+         int lev1 = level[n1];
+         int lev2 = level[n2];
+         int lev3 = level[n3];
+         if (mpot_old[n1] > 0) lev1++;
+         if (mpot_old[n2] > 0) lev2++;
+         if (mpot_old[n3] > 0) lev3++;
+
+         if (mpot_old[n1] != -1 || lev1 != level[ic] ||
+             mpot_old[n2] != -1 || lev2 != level[ic] ||
+             mpot_old[n3] != -1 || lev3 != level[ic]) {
+            mpot[ic] = 0;
+         }
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      L7_Update(&mpot[0], L7_INT, cell_handle);
+  }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+   // Phase 4: cells with a negative celltype take the flag of the neighbor on
+   // the opposite side from the boundary they are named after.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+   for (uint ic=0; ic<ncells; ic++) {
+      if (celltype[ic] < 0) {
+         switch (celltype[ic]) {
+            case LEFT_BOUNDARY:
+               mpot[ic] = mpot[nrht[ic]];
+               break;
+            case RIGHT_BOUNDARY:
+               mpot[ic] = mpot[nlft[ic]];
+               break;
+            case BOTTOM_BOUNDARY:
+               mpot[ic] = mpot[ntop[ic]];
+               break;
+            case TOP_BOUNDARY:
+               mpot[ic] = mpot[nbot[ic]];
+               break;
+         }
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp barrier
+}//END Parallel Region
+#endif
+
+   // Recount with the smoothed flags; the result is added to the current cell count
+   newcount = ncells + rezone_count(mpot, icount, jcount);
+
+   // A rezone is needed if any rank has a nonzero icount or jcount
+#ifdef HAVE_MPI
+   int icount_global = icount;
+   int jcount_global = jcount;
+   if (parallel) {
+      int count[2], global_count[2];
+      count[0] = icount;
+      count[1] = jcount;
+      MPI_Allreduce(&count, &global_count, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+      icount_global = global_count[0];
+      jcount_global = global_count[1];
+   }
+   do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
+#else
+   do_rezone = (icount != 0 || jcount != 0) ? true : false;
+#endif
+
+
+   if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_REFINE_SMOOTH] += cpu_timer_stop(tstart_lev2);
+
+   return(newcount);
+}
+
+#ifdef HAVE_OPENCL
+int Mesh::gpu_refine_smooth(cl_mem &dev_mpot, int &icount, int &jcount)
+{
+   struct timeval tstart_lev2;
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t local_work_size = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+   size_t block_size = global_work_size/local_work_size;
+
+   int icount_global = icount;
+   int jcount_global = jcount;
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      int count[2], count_global[2];
+      count[0] = icount;
+      count[1] = jcount;
+      MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+      icount_global = count_global[0];
+      jcount_global = count_global[1];
+   }
+#endif
+
+   int levcount = 1;
+   //int which_smooth=0;
+
+   if(icount_global > 0 && levcount < levmx) {
+      size_t result_size = 1;
+      cl_mem dev_result  = ezcl_malloc(NULL, const_cast<char *>("dev_result"),  &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      int newcount = icount;
+      int newcount_global = icount_global;
+      while (newcount_global > 0 && levcount < levmx) {
+         levcount++;
+
+         gpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
+
+#ifdef HAVE_MPI
+         if (numpe > 1) {
+            L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+         }
+#endif
+
+         if (icount_global) {
+            ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+
+            ezcl_set_kernel_arg(kernel_refine_smooth, 0, sizeof(cl_int),  (void *)&ncells);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 1, sizeof(cl_int),  (void *)&ncells_ghost);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 2, sizeof(cl_int),  (void *)&levmx);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 3, sizeof(cl_mem),  (void *)&dev_nlft);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 4, sizeof(cl_mem),  (void *)&dev_nrht);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 5, sizeof(cl_mem),  (void *)&dev_nbot);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 6, sizeof(cl_mem),  (void *)&dev_ntop);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 7, sizeof(cl_mem),  (void *)&dev_level);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 8, sizeof(cl_mem),  (void *)&dev_celltype);
+            ezcl_set_kernel_arg(kernel_refine_smooth, 9, sizeof(cl_mem),  (void *)&dev_mpot_old);
+            ezcl_set_kernel_arg(kernel_refine_smooth,10, sizeof(cl_mem),  (void *)&dev_mpot);
+            ezcl_set_kernel_arg(kernel_refine_smooth,11, sizeof(cl_mem),  (void *)&dev_redscratch);
+            ezcl_set_kernel_arg(kernel_refine_smooth,12, sizeof(cl_mem),  (void *)&dev_result);
+            ezcl_set_kernel_arg(kernel_refine_smooth,13, local_work_size*sizeof(cl_int),    NULL);
+
+            ezcl_enqueue_ndrange_kernel(command_queue, kernel_refine_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+            gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
+
+            int result;
+            ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
+
+            //printf("result = %d after %d refine smooths\n",result,which_smooth);
+            //which_smooth++;
+
+            icount = result;
+         }
+
+         newcount = icount-newcount;
+         newcount_global = newcount;
+#ifdef HAVE_MPI
+         if (parallel) {
+            MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+         }
+#endif
+         icount_global += newcount_global;
+         //printf("DEBUG -- icount %d icount_global %d newcount %d newcount_global %d\n",icount,icount_global,newcount,newcount_global);
+      }
+
+      ezcl_device_memory_delete(dev_mpot_old);
+      ezcl_device_memory_delete(dev_redscratch);
+      ezcl_device_memory_delete(dev_result);
+   }
+
+   if (jcount_global) {
+#ifdef HAVE_MPI
+      if (numpe > 1) {
+         L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+      }
+#endif
+
+      cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      if (jcount) {
+         ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 0, sizeof(cl_int),  (void *)&ncells);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 1, sizeof(cl_mem),  (void *)&dev_nlft);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 2, sizeof(cl_mem),  (void *)&dev_nrht);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 3, sizeof(cl_mem),  (void *)&dev_nbot);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 4, sizeof(cl_mem),  (void *)&dev_ntop);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 5, sizeof(cl_mem),  (void *)&dev_i);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 6, sizeof(cl_mem),  (void *)&dev_j);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 7, sizeof(cl_mem),  (void *)&dev_level);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 8, sizeof(cl_mem),  (void *)&dev_mpot_old);
+         ezcl_set_kernel_arg(kernel_coarsen_smooth, 9, sizeof(cl_mem),  (void *)&dev_mpot);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
+      }
+
+#ifdef HAVE_MPI
+      if (numpe > 1) {
+         L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+      }
+#endif
+
+      if (jcount) {
+         size_t result_size = 1;
+         cl_mem dev_result  = ezcl_malloc(NULL, const_cast<char *>("dev_result"),  &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+         ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 0, sizeof(cl_int),  (void *)&ncells);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 1, sizeof(cl_mem),  (void *)&dev_nlft);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 2, sizeof(cl_mem),  (void *)&dev_nrht);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 3, sizeof(cl_mem),  (void *)&dev_nbot);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 4, sizeof(cl_mem),  (void *)&dev_ntop);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 5, sizeof(cl_mem),  (void *)&dev_i);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 6, sizeof(cl_mem),  (void *)&dev_j);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 7, sizeof(cl_mem),  (void *)&dev_level);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 8, sizeof(cl_mem),  (void *)&dev_celltype);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block, 9, sizeof(cl_mem),  (void *)&dev_mpot_old);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block,10, sizeof(cl_mem),  (void *)&dev_mpot);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block,11, sizeof(cl_mem),  (void *)&dev_redscratch);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block,12, sizeof(cl_mem),  (void *)&dev_result);
+         ezcl_set_kernel_arg(kernel_coarsen_check_block,13, local_work_size*sizeof(cl_int),    NULL);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_check_block, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+         gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
+
+         int result;
+         ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
+
+         //printf("result = %d after coarsen smooth\n",result);
+
+         jcount = result;
+
+         ezcl_device_memory_delete(dev_redscratch);
+         ezcl_device_memory_delete(dev_result);
+      }
+
+      jcount_global = jcount;
+
+#ifdef HAVE_MPI
+      if (parallel) {
+         MPI_Allreduce(&jcount, &jcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+      }
+#endif
+
+      ezcl_device_memory_delete(dev_mpot_old);
+   }
+
+   if (icount_global || jcount_global) {
+#ifdef HAVE_MPI
+      if (numpe > 1) {
+         L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+      }
+#endif
+
+      size_t result_size = 1;
+      cl_mem dev_result  = ezcl_malloc(NULL, const_cast<char *>("dev_result"),  &result_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+      dev_ioffset  = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size,   sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 0,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 1,  sizeof(cl_mem), (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 2,  sizeof(cl_mem), (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 3,  sizeof(cl_mem), (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 4,  sizeof(cl_mem), (void *)&dev_ntop);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 5,  sizeof(cl_mem), (void *)&dev_i);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 6,  sizeof(cl_mem), (void *)&dev_j);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 7,  sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 8,  sizeof(cl_mem), (void *)&dev_mpot);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 9,  sizeof(cl_mem), (void *)&dev_redscratch);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 10, sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 11, sizeof(cl_mem), (void *)&dev_result);
+      ezcl_set_kernel_arg(kernel_set_boundary_refinement, 12, local_work_size*sizeof(cl_int2),    NULL);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_set_boundary_refinement, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+      gpu_rezone_count2(block_size, local_work_size, dev_redscratch, dev_result);
+
+      int my_result[2];
+      ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, 1*sizeof(cl_int2), &my_result, NULL);
+      //printf("Result is %lu icount %d jcount %d\n", ncells+my_result[0]-my_result[1],my_result[0],my_result[1]);
+      icount = my_result[0];
+      jcount = my_result[1];
+
+      icount_global = icount;
+      jcount_global = jcount;
+#ifdef HAVE_MPI
+      if (parallel) {
+         int count[2], count_global[2];
+         count[0] = icount;
+         count[1] = jcount;
+         MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+         icount_global = count_global[0];
+         jcount_global = count_global[1];
+      }
+#endif
+
+      gpu_rezone_scan(block_size, local_work_size, dev_ioffset, dev_result);
+
+      //ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &my_result, NULL);
+      //printf("After scan, Result is %d\n", my_result[0]);
+
+      ezcl_device_memory_delete(dev_result);
+      ezcl_device_memory_delete(dev_redscratch);
+
+   } else {
+      ezcl_device_memory_delete(dev_mpot);
+      dev_mpot = NULL;
+   }
+
+   gpu_do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
+
+   if (TIMING_LEVEL >= 2) gpu_timers[MESH_TIMER_REFINE_SMOOTH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+
+   return ncells+icount-jcount;
+}
+#endif
+
+// Tear down the mesh: free the per-cell host arrays and, in OpenCL builds,
+// the device buffers and kernels this file releases below. Takes no
+// arguments and returns nothing.
+void Mesh::terminate(void)
+{
+      // Per-cell host arrays, freed through mesh_memory.
+      mesh_memory.memory_delete(i);
+      mesh_memory.memory_delete(j);
+      mesh_memory.memory_delete(level);
+      mesh_memory.memory_delete(celltype);
+      // The four neighbor arrays are only freed when neighbor_remap is set.
+      if (neighbor_remap) {
+         mesh_memory.memory_delete(nlft);
+         mesh_memory.memory_delete(nrht);
+         mesh_memory.memory_delete(nbot);
+         mesh_memory.memory_delete(ntop);
+      }
+
+#ifdef HAVE_OPENCL
+      hash_lib_terminate();
+
+      // Device copies of the per-level tables.
+      ezcl_device_memory_delete(dev_levtable);
+      ezcl_device_memory_delete(dev_levdx);
+      ezcl_device_memory_delete(dev_levdy);
+      ezcl_device_memory_delete(dev_levibeg);
+      ezcl_device_memory_delete(dev_leviend);
+      ezcl_device_memory_delete(dev_levjbeg);
+      ezcl_device_memory_delete(dev_levjend);
+
+      // Device copies of the per-cell arrays.
+      ezcl_device_memory_delete(dev_level);
+      ezcl_device_memory_delete(dev_i);
+      ezcl_device_memory_delete(dev_j);
+      ezcl_device_memory_delete(dev_celltype);
+      // Device neighbor arrays: dev_nlft is used as the indicator that all
+      // four exist; only it is tested for NULL.
+      if (neighbor_remap && dev_nlft != NULL){
+         ezcl_device_memory_delete(dev_nlft);
+         ezcl_device_memory_delete(dev_nrht);
+         ezcl_device_memory_delete(dev_nbot);
+         ezcl_device_memory_delete(dev_ntop);
+      }
+
+      // Release the kernels. The commented-out entries are left as found;
+      // NOTE(review): presumably those kernels are never created -- confirm
+      // against the initialization code.
+      ezcl_kernel_release(kernel_reduction_scan2);
+      ezcl_kernel_release(kernel_reduction_count);
+      ezcl_kernel_release(kernel_reduction_count2);
+      ezcl_kernel_release(kernel_hash_adjust_sizes);
+      ezcl_kernel_release(kernel_hash_setup);
+      ezcl_kernel_release(kernel_hash_setup_local);
+      ezcl_kernel_release(kernel_neighbor_init);
+      ezcl_kernel_release(kernel_calc_neighbors);
+      ezcl_kernel_release(kernel_calc_neighbors_local);
+      ezcl_kernel_release(kernel_calc_border_cells);
+      ezcl_kernel_release(kernel_calc_border_cells2);
+      ezcl_kernel_release(kernel_finish_scan);
+      ezcl_kernel_release(kernel_get_border_data);
+      ezcl_kernel_release(kernel_calc_layer1);
+      ezcl_kernel_release(kernel_calc_layer1_sethash);
+      ezcl_kernel_release(kernel_calc_layer2);
+      ezcl_kernel_release(kernel_get_border_data2);
+      ezcl_kernel_release(kernel_calc_layer2_sethash);
+      //ezcl_kernel_release(kernel_calc_neighbors_local2);
+      ezcl_kernel_release(kernel_copy_mesh_data);
+      ezcl_kernel_release(kernel_fill_mesh_ghost);
+      ezcl_kernel_release(kernel_fill_neighbor_ghost);
+      ezcl_kernel_release(kernel_set_corner_neighbor);
+      ezcl_kernel_release(kernel_adjust_neighbors_local);
+      //ezcl_kernel_release(kernel_copy_ghost_data);
+      //ezcl_kernel_release(kernel_adjust_neighbors);
+      ezcl_kernel_release(kernel_hash_size);
+      ezcl_kernel_release(kernel_finish_hash_size);
+      ezcl_kernel_release(kernel_calc_spatial_coordinates);
+      ezcl_kernel_release(kernel_do_load_balance_lower);
+      ezcl_kernel_release(kernel_do_load_balance_middle);
+      ezcl_kernel_release(kernel_do_load_balance_upper);
+      // The double-precision variants are skipped when MINIMUM_PRECISION is defined.
+#ifndef MINIMUM_PRECISION
+      ezcl_kernel_release(kernel_do_load_balance_double);
+#endif
+      ezcl_kernel_release(kernel_do_load_balance_float);
+      ezcl_kernel_release(kernel_refine_smooth);
+      ezcl_kernel_release(kernel_coarsen_smooth);
+      ezcl_kernel_release(kernel_coarsen_check_block);
+      ezcl_kernel_release(kernel_rezone_all);
+      ezcl_kernel_release(kernel_rezone_neighbors);
+#ifndef MINIMUM_PRECISION
+      ezcl_kernel_release(kernel_rezone_one_double);
+#endif
+      ezcl_kernel_release(kernel_rezone_one_float);
+      ezcl_kernel_release(kernel_copy_mpot_ghost_data);
+      ezcl_kernel_release(kernel_set_boundary_refinement);
+      terminate_kernel_2stage_sum();
+      terminate_kernel_2stage_sum_int();
+      // kernel_count_BCs is released only when have_boundary is false.
+      // NOTE(review): presumably it is only created in that case -- confirm.
+      if (! have_boundary){
+        ezcl_kernel_release(kernel_count_BCs);
+      }
+#endif
+#if defined(HAVE_J7) && defined(HAVE_MPI)
+   // In J7+MPI builds, parallel runs also call pfini() on mesh_memory.
+   if (parallel) mesh_memory.pfini();
+#endif
+}
+
+// Tally how many cells the rezone flags in mpot will add and remove.
+//   mpot   -- per-cell flag: positive requests refinement, negative requests
+//             coarsening, zero leaves the cell alone
+//   icount -- (out) cells added by refinement
+//   jcount -- (out) cells removed by coarsening; this tally only counts
+//             downward, so it is zero or negative
+// Returns icount+jcount.
+int Mesh::rezone_count(vector<int> mpot, int &icount, int &jcount)
+{
+   int cells_added   = 0;
+   int cells_removed = 0;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction (+:cells_removed,cells_added)
+#endif
+   for (uint cell=0; cell<ncells; ++cell){
+      const int flag = mpot[cell];
+
+      if (flag < 0) {
+         // Exactly the cells that stay behind are excluded from the removal
+         // tally, so the count stays right when a group is split across
+         // processors: the lower-left cell for real cells, and either the
+         // upper-right or lower-left cell for boundary cells.
+         bool stays;
+         if (celltype[cell] == REAL_CELL) {
+            stays = is_lower_left(i[cell],j[cell]);
+         } else {
+            stays = is_upper_right(i[cell],j[cell]) || is_lower_left(i[cell],j[cell]);
+         }
+         if (! stays) cells_removed--;
+      } else if (flag > 0) {
+         // A refined real cell contributes three extra cells, any other
+         // cell type contributes one.
+         cells_added += (celltype[cell] == REAL_CELL) ? 3 : 1;
+      }
+   }
+
+   icount = cells_added;
+   jcount = cells_removed;
+
+   return(icount+jcount);
+}
+
+#ifdef HAVE_OPENCL
+// Enqueue the finishing pass of the two-counter rezone reduction.
+//   block_size      -- value handed to the kernel as its int isize argument
+//   local_work_size -- work-group size; it is also used as the global size,
+//                      so exactly one work-group is launched
+//   dev_redscratch  -- device buffer bound to the kernel's redscratch argument
+//   dev_result      -- device buffer bound to the kernel's result argument
+// The kernel is only enqueued; this routine does not wait for it to finish.
+void Mesh::gpu_rezone_count2(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Argument 0 is declared as sizeof(cl_int). Pointing it at the size_t
+   // parameter would copy only the first four bytes of a wider object, which
+   // is the value on little-endian hosts only; pass a real cl_int instead.
+   cl_int isize = (cl_int)block_size;
+
+     /*
+     __kernel void finish_reduction_count2_cl(
+                       const    int   isize,      // 0
+              __global          int  *redscratch, // 1
+              __global          int  *result,     // 2
+              __local           int  *tile)       // 3
+     */
+   ezcl_set_kernel_arg(kernel_reduction_count2, 0, sizeof(cl_int),  (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_count2, 1, sizeof(cl_mem),  (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_reduction_count2, 2, sizeof(cl_mem),  (void *)&dev_result);
+   // Local scratch: one cl_int2 per work-item.
+   ezcl_set_kernel_arg(kernel_reduction_count2, 3, local_work_size*sizeof(cl_int2),    NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count2, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+
+// Enqueue the finishing pass of the single-counter rezone reduction.
+//   block_size      -- value handed to the kernel as its int isize argument
+//   local_work_size -- work-group size; it is also used as the global size,
+//                      so exactly one work-group is launched
+//   dev_redscratch  -- device buffer bound to the kernel's redscratch argument
+//   dev_result      -- device buffer bound to the kernel's result argument
+// The kernel is only enqueued; this routine does not wait for it to finish.
+void Mesh::gpu_rezone_count(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Argument 0 is declared as sizeof(cl_int). Pointing it at the size_t
+   // parameter would copy only the first four bytes of a wider object, which
+   // is the value on little-endian hosts only; pass a real cl_int instead.
+   cl_int isize = (cl_int)block_size;
+
+     /*
+     __kernel void finish_reduction_count_cl(
+                       const    int   isize,      // 0
+              __global          int  *redscratch, // 1
+              __global          int  *result,     // 2
+              __local           int  *tile)       // 3
+     */
+   ezcl_set_kernel_arg(kernel_reduction_count, 0, sizeof(cl_int),  (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_count, 1, sizeof(cl_mem),  (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_reduction_count, 2, sizeof(cl_mem),  (void *)&dev_result);
+   // Local scratch: one cl_int per work-item.
+   ezcl_set_kernel_arg(kernel_reduction_count, 3, local_work_size*sizeof(cl_int),    NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+
+// Enqueue the finishing scan over the per-block offsets used by rezoning.
+//   block_size      -- value handed to the kernel as its int isize argument
+//   local_work_size -- work-group size; it is also used as the global size,
+//                      so exactly one work-group is launched
+//   dev_ioffset     -- device buffer bound to the kernel's ioffset argument
+//   dev_result      -- device buffer bound to the kernel's result argument
+// The kernel is only enqueued; this routine does not wait for it to finish.
+void Mesh::gpu_rezone_scan(size_t block_size, size_t local_work_size, cl_mem dev_ioffset, cl_mem &dev_result)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Argument 0 is declared as sizeof(cl_int). Pointing it at the size_t
+   // parameter would copy only the first four bytes of a wider object, which
+   // is the value on little-endian hosts only; pass a real cl_int instead.
+   cl_int isize = (cl_int)block_size;
+
+     /*
+     __kernel void finish_reduction_scan_cl(
+                       const    int   isize,    // 0
+              __global          int  *ioffset,  // 1
+              __global          int  *result,   // 2
+              __local           int  *tile)     // 3
+     */
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 0, sizeof(cl_int),  (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 1, sizeof(cl_mem),  (void *)&dev_ioffset);
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 2, sizeof(cl_mem),  (void *)&dev_result);
+   // Local scratch: one cl_uint2 per work-item.
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 3, local_work_size*sizeof(cl_uint2),    NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_scan2, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+#endif
+
+// Initialize the member k-D tree and add one bounding box per cell, spanning
+// [x, x+dx] by [y, y+dy]. Only the x and y extents of the box are filled in.
+void Mesh::kdtree_setup()
+{
+   KDTree_Initialize(&tree);
+
+   TBounds cell_box;
+   uint cell = 0;
+   while (cell < ncells) {
+      // Lower corner of the cell.
+      cell_box.min.x = x[cell];
+      cell_box.min.y = y[cell];
+      // Upper corner: lower corner plus the cell's width and height.
+      cell_box.max.x = x[cell]+dx[cell];
+      cell_box.max.y = y[cell]+dy[cell];
+
+      KDTree_AddElement(&tree, &cell_box);
+      ++cell;
+   }
+}
+
+// Fill x, dx, y and dy for every cell from its (i, j, level) indices.
+//   ibase -- index origin subtracted from i and j when have_boundary is set;
+//            otherwise the per-level lev_ibegin/lev_jbegin offsets are used
+// The four coordinate vectors are resized to ncells first. The elapsed time
+// is added to cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
+void Mesh::calc_spatial_coordinates(int ibase)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   x.resize(ncells);
+   dx.resize(ncells);
+   y.resize(ncells);
+   dy.resize(ncells);
+
+#ifdef _OPENMP
+#pragma omp parallel
+   {
+#endif
+
+   // Each thread works on the [lowerBounds, upperBounds) range handed back
+   // by get_bounds.
+   int lowerBounds, upperBounds;
+   set_bounds(ncells);
+   get_bounds(lowerBounds, upperBounds);
+
+   if (have_boundary) {
+      for (uint ic = lowerBounds; ic < upperBounds; ic++) {
+         int lev = level[ic];
+         x[ic]  = xmin + (lev_deltax[lev] * (i[ic] - ibase));
+         dx[ic] =        lev_deltax[lev];
+         y[ic]  = ymin + (lev_deltay[lev] * (j[ic] - ibase));
+         dy[ic] =        lev_deltay[lev];
+      }
+   } else {
+      for (uint ic = lowerBounds; ic < upperBounds; ic++) {
+         int lev = level[ic];
+         x[ic]  = xmin + (lev_deltax[lev] * (i[ic] - lev_ibegin[lev]));
+         dx[ic] =        lev_deltax[lev];
+         y[ic]  = ymin + (lev_deltay[lev] * (j[ic] - lev_jbegin[lev]));
+         dy[ic] =        lev_deltay[lev];
+      }
+   }
+
+#ifdef _OPENMP
+   } // end parallel region -- all threads join here (implicit barrier)
+#endif
+
+   // Accumulate the timer once, after the parallel region. Doing this inside
+   // the region made every thread update the shared counter: a data race,
+   // and the elapsed time was added once per thread.
+   cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+// Run the spatial-coordinates kernel on the device and wait for it to finish.
+//   dev_x, dev_dx, dev_y, dev_dy -- device buffers bound to kernel arguments
+//                                   5 through 8
+// The kernel also receives ncells, xmin, ymin, the per-level spacing tables
+// (dev_levdx, dev_levdy) and the per-cell dev_level, dev_i and dev_j arrays.
+// The elapsed time, converted to nanoseconds, is added to
+// gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
+void Mesh::gpu_calc_spatial_coordinates(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_event calc_spatial_coordinates_event;
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Round the global size up to a whole number of work-groups.
+   size_t local_work_size = MIN(ncells, TILE_SIZE);
+   size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+
+   // Argument 0 is declared as sizeof(cl_int), so hand the kernel a genuine
+   // cl_int rather than the address of the ncells member, which is cast to
+   // int elsewhere in this file and so is evidently not a 4-byte int itself.
+   cl_int ncells_arg = (cl_int)ncells;
+
+// Only coded for base 0 and have boundary
+//  Need:
+//     xmin
+//     ymin
+//
+//     lev_deltax -- dev_levdx
+//     lev_deltay -- dev_levdy
+//     x
+//     dx
+//     y
+//     dy
+//     level
+//     i
+//     j
+
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  0, sizeof(cl_int),    (void *)&ncells_arg);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  1, sizeof(cl_real_t), (void *)&xmin);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  2, sizeof(cl_real_t), (void *)&ymin);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  3, sizeof(cl_mem),    (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  4, sizeof(cl_mem),    (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  5, sizeof(cl_mem),    (void *)&dev_x);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  6, sizeof(cl_mem),    (void *)&dev_dx);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  7, sizeof(cl_mem),    (void *)&dev_y);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  8, sizeof(cl_mem),    (void *)&dev_dy);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  9, sizeof(cl_mem),    (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 10, sizeof(cl_mem),    (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 11, sizeof(cl_mem),    (void *)&dev_j);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_spatial_coordinates, 1, NULL, &global_work_size, &local_work_size, &calc_spatial_coordinates_event);
+
+   // Block until the kernel has completed so the timer covers its execution.
+   ezcl_wait_for_events(1, &calc_spatial_coordinates_event);
+   ezcl_event_release(calc_spatial_coordinates_event);
+
+   gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+// Compute the bounding extents of the mesh: xmin/ymin are the smallest cell
+// origins, xmax/ymax the largest origin-plus-size values. The z extents are
+// computed only when ndim > TWO_DIMENSIONAL and otherwise keep their
+// +/-1.0e30 sentinels. In MPI builds with parallel set, the x and y extents
+// are reduced across MPI_COMM_WORLD; the z extents are not.
+void Mesh::calc_minmax(void)
+{
+   // Sentinels that any real coordinate will replace.
+   xmin = +1.0e30;
+   ymin = +1.0e30;
+   zmin = +1.0e30;
+   xmax = -1.0e30;
+   ymax = -1.0e30;
+   zmax = -1.0e30;
+
+   // One pass covers both the low and high edges in x and y.
+   for (uint cell=0; cell<ncells; cell++){
+      if (x[cell] < xmin) xmin = x[cell];
+      if (y[cell] < ymin) ymin = y[cell];
+
+      real_t x_upper = x[cell]+dx[cell];
+      if (x_upper > xmax) xmax = x_upper;
+
+      real_t y_upper = y[cell]+dy[cell];
+      if (y_upper > ymax) ymax = y_upper;
+   }
+
+   if (ndim > TWO_DIMENSIONAL) {
+      for (uint cell=0; cell<ncells; cell++){
+         if (z[cell] < zmin) zmin = z[cell];
+
+         real_t z_upper = z[cell]+dz[cell];
+         if (z_upper > zmax) zmax = z_upper;
+      }
+   }
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      // Combine the per-rank x/y extents into global ones.
+      real_t global_xmin, global_xmax, global_ymin, global_ymax;
+      MPI_Allreduce(&xmin, &global_xmin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&xmax, &global_xmax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      MPI_Allreduce(&ymin, &global_ymin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&ymax, &global_ymax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      xmin = global_xmin;
+      xmax = global_xmax;
+      ymin = global_ymin;
+      ymax = global_ymax;
+   }
+#endif
+
+}
+// Compute the smallest and largest cell-center coordinates, where a center
+// is the cell origin plus half its size. The z values are computed only when
+// ndim > TWO_DIMENSIONAL and otherwise keep their +/-1.0e30 sentinels. In
+// MPI builds with parallel set, the x and y values are reduced across
+// MPI_COMM_WORLD; the z values are not.
+void Mesh::calc_centerminmax(void)
+{
+   // Sentinels that any real coordinate will replace.
+   xcentermin = +1.0e30;
+   ycentermin = +1.0e30;
+   zcentermin = +1.0e30;
+   xcentermax = -1.0e30;
+   ycentermax = -1.0e30;
+   zcentermax = -1.0e30;
+
+   // One pass handles both the x and y centers.
+   for (uint cell=0; cell<ncells; cell++){
+      real_t x_center = x[cell]+0.5*dx[cell];
+      if (x_center < xcentermin) xcentermin = x_center;
+      if (x_center > xcentermax) xcentermax = x_center;
+
+      real_t y_center = y[cell]+0.5*dy[cell];
+      if (y_center < ycentermin) ycentermin = y_center;
+      if (y_center > ycentermax) ycentermax = y_center;
+   }
+
+   if (ndim > TWO_DIMENSIONAL) {
+      for (uint cell=0; cell<ncells; cell++){
+         real_t z_center = z[cell]+0.5*dz[cell];
+         if (z_center < zcentermin) zcentermin = z_center;
+         if (z_center > zcentermax) zcentermax = z_center;
+      }
+   }
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      // Combine the per-rank x/y center extents into global ones.
+      real_t global_xcmin, global_xcmax, global_ycmin, global_ycmax;
+      MPI_Allreduce(&xcentermin, &global_xcmin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&xcentermax, &global_xcmax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      MPI_Allreduce(&ycentermin, &global_ycmin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&ycentermax, &global_ycmax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      xcentermin = global_xcmin;
+      xcentermax = global_xcmax;
+      ycentermin = global_ycmin;
+      ycentermax = global_ycmax;
+   }
+#endif
+
+}
+
+void Mesh::rezone_all(int icount, int jcount, vector<int> mpot, int have_state, MallocPlus &state_memory)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   if (! do_rezone) {
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      index.clear();
+      index.resize(ncells);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (uint ic=0; ic<ncells; ic++){
+         index[ic]=ic;
+      }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+      cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
+
+   } else {
+
+// sign for jcount is different in GPU and CPU code -- abs is a quick fix
+   int add_ncells = icount - abs(jcount);
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_counters[MESH_COUNTER_REZONE]++;
+
+   static vector<int> celltype_save;
+
+   static int new_ncells;
+
+   static int *i_old, *j_old, *level_old;
+
+   static int ifirst;
+   static int ilast;
+   static int jfirst;
+   static int jlast;
+   static int level_first;
+   static int level_last;
+
+   static vector<int> new_ic;
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+      celltype_save.resize(ncells);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   if (have_state) {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (int ic = 0; ic < (int)ncells; ic++){
+         celltype_save[ic] = celltype[ic];
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   new_ncells = ncells + add_ncells;
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+// int ref_entry_count = 0;
+   if (have_state){
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (uint ic=0; ic<ncells; ic++) {
+//       if (mpot[ic] > 0) ref_entry_count++;
+         if (mpot[ic] < 0) {
+            // Normal cell coarsening
+            if (is_lower_left(i[ic],j[ic]) ) mpot[ic] = -2;
+            // Boundary cell case
+            if (celltype[ic] != REAL_CELL && is_upper_right(i[ic],j[ic]) ) mpot[ic] = -3;
+         }
+      }
+   }
+
+   //  Initialize new variables
+// int *i_old, *j_old, *level_old;
+
+   int flags = RESTART_DATA;
+#ifdef HAVE_J7
+   if (parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   i_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "i_old",     flags);
+   j_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "j_old",     flags);
+   level_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "level_old", flags);
+
+   mesh_memory.memory_swap(&i,     &i_old);
+   mesh_memory.memory_swap(&j,     &j_old);
+   mesh_memory.memory_swap(&level, &level_old);
+
+   index.clear();
+   index.resize(new_ncells);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   static vector<int> order; //  Vector of refined mesh traversal order; set to -1 to indicate errors.
+   //
+   //vector<int>  invorder(4, -1); //  Vector mapping location from base index.
+
+   //int ref_entry = 0;
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   //  Insert new cells into the mesh at the point of refinement.
+   order.resize(4,    -1); //  Vector of refined mesh traversal order; set to -1 to indicate errors.
+
+   ifirst      = 0;
+   ilast       = 0;
+   jfirst      = 0;
+   jlast       = 0;
+   level_first = 0;
+   level_last  = 0;
+
+   if (parallel) {
+#ifdef HAVE_MPI
+      MPI_Request req[12];
+      MPI_Status status[12];
+
+      static int prev     = MPI_PROC_NULL;
+      static int next     = MPI_PROC_NULL;
+
+      if (mype != 0)         prev = mype-1;
+      if (mype < numpe - 1)  next = mype+1;
+
+      MPI_Isend(&i_old[ncells-1],     1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
+      MPI_Irecv(&ifirst,              1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
+
+      MPI_Isend(&i_old[0],            1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
+      MPI_Irecv(&ilast,               1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
+
+      MPI_Isend(&j_old[ncells-1],     1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
+      MPI_Irecv(&jfirst,              1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
+
+      MPI_Isend(&j_old[0],            1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
+      MPI_Irecv(&jlast,               1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
+
+      MPI_Isend(&level_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
+      MPI_Irecv(&level_first,         1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
+
+      MPI_Isend(&level_old[0],        1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
+      MPI_Irecv(&level_last,          1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
+
+      MPI_Waitall(12, req, status);
+#endif
+   }
+
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef REZONE_NO_OPTIMIZATION
+   vector<int>  invorder(4, -1); //  Vector mapping location from base index.
+   for (int ic = 0, nc = 0; ic < (int)ncells; ic++)
+   {
+      if (mpot[ic] == 0 || mpot[ic] == -1000000)
+      {  //  No change is needed; copy the old cell straight to the new mesh at this location.
+         index[ic] = nc;
+         i[nc]     = i_old[ic];
+         j[nc]     = j_old[ic];
+         level[nc] = level_old[ic];
+         nc++;
+      } //  Complete no change needed.
+      
+      else if (mpot[ic] < 0)
+      {  //  Coarsening is needed; remove this cell and the other three and replace them with one.
+         index[ic] = nc;
+         if (mpot[ic] <= -2) {
+            //printf("                     %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
+            i[nc] = i_old[ic]/2;
+            j[nc] = j_old[ic]/2;
+            level[nc] = level_old[ic] - 1;
+            nc++;
+         }
+      } //  Coarsening complete.
+      
+      else if (mpot[ic] > 0)
+      {  //  Refinement is needed; insert four cells where once was one.
+         index[ic] = nc;
+         if (celltype[ic] == REAL_CELL)
+         {  
+            set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
+                                 level_first, level_last, i_old, j_old, level_old);
+
+            //  Create the cells in the correct order and orientation.
+            for (int ii = 0; ii < 4; ii++)
+            {  level[nc] = level_old[ic] + 1;
+               switch (order[ii])
+               {  case SW:
+                     // lower left
+                     invorder[SW] = ii;
+                     i[nc]     = i_old[ic]*2;
+                     j[nc]     = j_old[ic]*2;
+                     nc++;
+                     break;
+                     
+                  case SE:
+                     // lower right
+                     invorder[SE] = ii;
+                     i[nc]     = i_old[ic]*2 + 1;
+                     j[nc]     = j_old[ic]*2;
+                     nc++;
+                     break;
+                     
+                  case NW:
+                     // upper left
+                     invorder[NW] = ii;
+                     i[nc]     = i_old[ic]*2;
+                     j[nc]     = j_old[ic]*2 + 1;
+                     nc++;
+                     break;
+                     
+                  case NE:
+                     // upper right
+                     invorder[NE] = ii;
+                     i[nc]     = i_old[ic]*2 + 1;
+                     j[nc]     = j_old[ic]*2 + 1;
+                     nc++;
+                     break; } } //  Complete cell refinement.
+         }  //  Complete real cell refinement.
+         
+         else if (celltype[ic] == LEFT_BOUNDARY) {
+            // lower
+            i[nc]  = i_old[ic]*2 + 1;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // upper
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == RIGHT_BOUNDARY) {
+            // lower
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // upper
+            i[nc] = i_old[ic]*2;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == BOTTOM_BOUNDARY) {
+            // left
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // right
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == TOP_BOUNDARY) {
+            // right
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+
+            // left
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+      } //  Complete refinement needed.
+   } //  Complete addition of new cells to the mesh.
+
+   mesh_memory.memory_delete(i_old);
+   mesh_memory.memory_delete(j_old);
+   mesh_memory.memory_delete(level_old);
+
+   calc_celltype(new_ncells);
+
+   if (have_state){
+      flags = RESTART_DATA;
+      MallocPlus state_memory_old = state_memory;
+      malloc_plus_memory_entry *memory_item;
+
+      for (memory_item = state_memory_old.memory_entry_by_name_begin();
+           memory_item != state_memory_old.memory_entry_by_name_end();
+           memory_item = state_memory_old.memory_entry_by_name_next() ) {
+         //printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+         if (memory_item->mem_elsize == 8) {
+            double *state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
+                                                                             "state_temp_double", flags);
+
+            double *mem_ptr_double = (double *)memory_item->mem_ptr;
+
+            //ref_entry = 0;
+            for (int ic=0, nc=0; ic<(int)ncells; ic++) {
+
+               if (mpot[ic] == 0) {
+                  state_temp_double[nc] = mem_ptr_double[ic];
+                  nc++;
+               } else if (mpot[ic] < 0){
+                  if (mpot[ic] == -2) {
+                     int nr = nrht[ic];
+                     int nt = ntop[ic];
+                     int nrt = nrht[nt];
+                     state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
+                                              mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
+                     nc++;
+                  }
+                  if (mpot[ic] == -3) {
+                     int nl = nlft[ic];
+                     int nb = nbot[ic];
+                     int nlb = nlft[nb];
+                     state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
+                                              mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
+                     nc++;
+                  }
+               } else if (mpot[ic] > 0){
+                  // lower left
+                  state_temp_double[nc] = mem_ptr_double[ic];
+                  nc++;
+
+                  // lower right
+                  state_temp_double[nc] = mem_ptr_double[ic];
+                  nc++;
+
+                  if (celltype_save[ic] == REAL_CELL){
+                     // upper left
+                     state_temp_double[nc] = mem_ptr_double[ic];
+                     nc++;
+
+                     // upper right
+                     state_temp_double[nc] = mem_ptr_double[ic];
+                     nc++;
+                  }
+               }
+            }
+
+            state_memory.memory_replace(mem_ptr_double, state_temp_double);
+         } else if (memory_item->mem_elsize == 4) {
+            float *state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
+                                                                          "state_temp_float", flags);
+
+            float *mem_ptr_float = (float *)memory_item->mem_ptr;
+
+            for (int ic=0, nc=0; ic<(int)ncells; ic++) {
+
+               if (mpot[ic] == 0) {
+                  state_temp_float[nc] = mem_ptr_float[ic];
+                  nc++;
+               } else if (mpot[ic] < 0){
+                  if (mpot[ic] == -2) {
+                     int nr = nrht[ic];
+                     int nt = ntop[ic];
+                     int nrt = nrht[nt];
+                     state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
+                                             mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
+                     nc++;
+                  }
+                  if (mpot[ic] == -3) {
+                     int nl = nlft[ic];
+                     int nb = nbot[ic];
+                     int nlb = nlft[nb];
+                     state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
+                                             mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
+                     nc++;
+                  }
+               } else if (mpot[ic] > 0){
+                  // lower left
+                  state_temp_float[nc] = mem_ptr_float[ic];
+                  nc++;
+
+                  // lower right
+                  state_temp_float[nc] = mem_ptr_float[ic];
+                  nc++;
+
+                  if (celltype_save[ic] == REAL_CELL){
+                     // upper left
+                     state_temp_float[nc] = mem_ptr_float[ic];
+                     nc++;
+
+                     // upper right
+                     state_temp_float[nc] = mem_ptr_float[ic];
+                     nc++;
+                  }
+               }
+            }
+
+            state_memory.memory_replace(mem_ptr_float, state_temp_float);
+         }
+      }
+   }
+#else
+   // Data parallel optimizations for thread parallel -- slows down serial
+   // code by about 25%
+   static vector<int> add_count;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      add_count.resize(ncells);
+      new_ic.resize(ncells+1);
+#ifdef _OPENMP
+   } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (int ic = 0; ic < (int)ncells; ic++){
+         if (mpot[ic] == 0) {
+            add_count[ic] = 1;
+         } else if (mpot[ic] < 0) {
+            if (mpot[ic] == -2){
+               add_count[ic] = 1;
+            } else {
+               add_count[ic] = 0;
+            }
+         } else if (mpot[ic] > 0) {
+            if (celltype[ic] != REAL_CELL) {
+               add_count[ic] = 2;
+            } else {
+               add_count[ic] = 4;
+            }
+         }
+      }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+      scan (&add_count[0], &new_ic[0], ncells);
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+   for (int ic = 0; ic < (int)ncells; ic++) {
+   vector<int>  invorder(4, -1); //  Vector mapping location from base index.
+      int nc = new_ic[ic];
+      if (mpot[ic] == 0)
+      {  //  No change is needed; copy the old cell straight to the new mesh at this location.
+         index[ic] = nc;
+         i[nc]     = i_old[ic];
+         j[nc]     = j_old[ic];
+         level[nc] = level_old[ic];
+      } //  Complete no change needed.
+
+      else if (mpot[ic] < 0)
+      {  //  Coarsening is needed; remove this cell and the other three and replace them with one.
+         index[ic] = nc;
+         if (mpot[ic] <= -2) {
+            //printf("                     %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
+            i[nc] = i_old[ic]/2;
+            j[nc] = j_old[ic]/2;
+            level[nc] = level_old[ic] - 1;
+         }
+      } //  Coarsening complete.
+
+      else if (mpot[ic] > 0)
+      {  //  Refinement is needed; insert four cells where once was one.
+         index[ic] = nc;
+         if (celltype[ic] == REAL_CELL)
+         {  
+            int order[4];
+            set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
+                                 level_first, level_last, i_old, j_old, level_old);
+
+            //  Create the cells in the correct order and orientation.
+            for (int ii = 0; ii < 4; ii++) {
+               level[nc] = level_old[ic] + 1;
+               switch (order[ii]) {
+                  case SW:
+                     // lower left
+                     invorder[SW] = ii;
+                     i[nc]     = i_old[ic]*2;
+                     j[nc]     = j_old[ic]*2;
+                     nc++;
+                     break;
+                     
+                  case SE:
+                     // lower right
+                     invorder[SE] = ii;
+                     i[nc]     = i_old[ic]*2 + 1;
+                     j[nc]     = j_old[ic]*2;
+                     nc++;
+                     break;
+                     
+                  case NW:
+                     // upper left
+                     invorder[NW] = ii;
+                     i[nc]     = i_old[ic]*2;
+                     j[nc]     = j_old[ic]*2 + 1;
+                     nc++;
+                     break;
+                     
+                  case NE:
+                     // upper right
+                     invorder[NE] = ii;
+                     i[nc]     = i_old[ic]*2 + 1;
+                     j[nc]     = j_old[ic]*2 + 1;
+                     nc++;
+                     break;
+                  }
+               } //  Complete cell refinement.
+         }  //  Complete real cell refinement.
+         
+         else if (celltype[ic] == LEFT_BOUNDARY) {
+            // lower
+            i[nc]  = i_old[ic]*2 + 1;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // upper
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == RIGHT_BOUNDARY) {
+            // lower
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // upper
+            i[nc] = i_old[ic]*2;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == BOTTOM_BOUNDARY) {
+            // left
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+            
+            // right
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2 + 1;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+         else if (celltype[ic] == TOP_BOUNDARY) {
+            // right
+            i[nc] = i_old[ic]*2 + 1;
+            j[nc] = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+
+            // left
+            i[nc]  = i_old[ic]*2;
+            j[nc]  = j_old[ic]*2;
+            level[nc] = level_old[ic] + 1;
+            nc++;
+         }
+      } //  Complete refinement needed.
+   } //  Complete addition of new cells to the mesh.
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   mesh_memory.memory_delete(i_old);
+   mesh_memory.memory_delete(j_old);
+   mesh_memory.memory_delete(level_old);
+#ifdef _OPENMP
+   } // end master region
+#endif
+
+   calc_celltype_threaded(new_ncells);
+
+   if (have_state){
+
+      static MallocPlus state_memory_old;
+      static malloc_plus_memory_entry *memory_begin;
+      static malloc_plus_memory_entry *memory_end;
+      static malloc_plus_memory_entry *memory_next;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      state_memory_old = state_memory;
+
+      memory_begin = state_memory_old.memory_entry_by_name_begin();
+      memory_end   = state_memory_old.memory_entry_by_name_end();
+#ifdef _OPENMP
+      } // end master region
+#pragma omp barrier
+#endif
+
+      for (malloc_plus_memory_entry *memory_item = memory_begin;
+           memory_item != memory_end;
+           memory_item = memory_next ) {
+         //ref_entry = 0;
+         //printf("DEBUG -- memory_item->mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+         if (memory_item->mem_elsize == 8) {
+
+            static double *state_temp_double, *mem_ptr_double;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
+                                                                                "state_temp_double", flags);
+               mem_ptr_double = (double *)memory_item->mem_ptr;
+#ifdef _OPENMP
+            } // end master region
+#pragma omp barrier
+#endif
+
+            //ref_entry = 0;
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (int ic=0; ic<(int)ncells; ic++) {
+
+               int nc = new_ic[ic];
+               if (mpot[ic] == 0) {
+                  state_temp_double[nc] = mem_ptr_double[ic];
+               } else if (mpot[ic] < 0){
+                  if (mpot[ic] == -2) {
+                     int nr = nrht[ic];
+                     int nt = ntop[ic];
+                     int nrt = nrht[nt];
+                     state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
+                                              mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
+                  }
+                  if (mpot[ic] == -3) {
+                     int nl = nlft[ic];
+                     int nb = nbot[ic];
+                     int nlb = nlft[nb];
+                     state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
+                                              mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
+                  }
+               } else if (mpot[ic] > 0){
+                  // lower left
+                  state_temp_double[nc] = mem_ptr_double[ic];
+                  nc++;
+
+                  // lower right
+                  state_temp_double[nc] = mem_ptr_double[ic];
+                  nc++;
+
+                  if (celltype_save[ic] == REAL_CELL){
+                     // upper left
+                     state_temp_double[nc] = mem_ptr_double[ic];
+                     nc++;
+
+                     // upper right
+                     state_temp_double[nc] = mem_ptr_double[ic];
+                     nc++;
+                  }
+               }
+            } // end cell loop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+            state_memory.memory_replace(mem_ptr_double, state_temp_double);
+#ifdef _OPENMP
+            } // end master region
+#pragma omp barrier
+#endif
+
+         } else if (memory_item->mem_elsize == 4) {
+
+            static float *state_temp_float, *mem_ptr_float;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
+                                                                             "state_temp_float", flags);
+               mem_ptr_float = (float *)memory_item->mem_ptr;
+#ifdef _OPENMP
+            } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (int ic=0; ic<(int)ncells; ic++) {
+
+               int nc = new_ic[ic];
+               if (mpot[ic] == 0) {
+                  state_temp_float[nc] = mem_ptr_float[ic];
+               } else if (mpot[ic] < 0){
+                  if (mpot[ic] == -2) {
+                     int nr = nrht[ic];
+                     int nt = ntop[ic];
+                     int nrt = nrht[nt];
+                     state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
+                                             mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
+                  }
+                  if (mpot[ic] == -3) {
+                     int nl = nlft[ic];
+                     int nb = nbot[ic];
+                     int nlb = nlft[nb];
+                     state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
+                                             mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
+                  }
+               } else if (mpot[ic] > 0){
+                  // lower left
+                  state_temp_float[nc] = mem_ptr_float[ic];
+                  nc++;
+
+                  // lower right
+                  state_temp_float[nc] = mem_ptr_float[ic];
+                  nc++;
+
+                  if (celltype_save[ic] == REAL_CELL){
+                     // upper left
+                     state_temp_float[nc] = mem_ptr_float[ic];
+                     nc++;
+
+                     // upper right
+                     state_temp_float[nc] = mem_ptr_float[ic];
+                     nc++;
+                  }
+               }
+            } // end cell loop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               state_memory.memory_replace(mem_ptr_float, state_temp_float);
+#ifdef _OPENMP
+            } // end master region
+#pragma omp barrier
+#endif
+         } // mem elem size 4 bytes
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         memory_next = state_memory_old.memory_entry_by_name_next();
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+      } // memory item iteration
+
+   } // if have state
+   // End of data parallel optimizations
+#endif
+
+   if (neighbor_remap) {
+      int flags = 0;
+      static int *nlft_old, *nrht_old, *nbot_old, *ntop_old;
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      nlft_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nlft_old",  flags);
+      nrht_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nrht_old",  flags);
+      nbot_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nbot_old",  flags);
+      ntop_old     = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "ntop_old",  flags);
+#ifdef _OPENMP
+      } // end master region
+#pragma omp barrier
+#endif
+      flags = RESTART_DATA;
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (int ic = 0; ic < new_ncells; ic++){
+         nlft_old[ic] = -1;
+         nrht_old[ic] = -1;
+         nbot_old[ic] = -1;
+         ntop_old[ic] = -1;
+      }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      mesh_memory.memory_swap(&nlft,  &nlft_old);
+      mesh_memory.memory_swap(&nrht,  &nrht_old);
+      mesh_memory.memory_swap(&nbot,  &nbot_old);
+      mesh_memory.memory_swap(&ntop,  &ntop_old);
+#ifdef _OPENMP
+      } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (int ic = 0; ic < (int)ncells; ic++){
+         int nc = index[ic];
+
+         if (mpot[ic] == 0){
+            if (nlft_old[ic] < (int)ncells && nlft_old[ic] >= 0){
+               nlft[nc] = (mpot[nlft_old[ic]] == 0) ? index[nlft_old[ic]] : -1;
+            }
+            if (nrht_old[ic] < (int)ncells && nrht_old[ic] >= 0){
+               nrht[nc] = (mpot[nrht_old[ic]] == 0) ? index[nrht_old[ic]] : -1;
+            }
+            if (nbot_old[ic] < (int)ncells && nbot_old[ic] >= 0){
+               nbot[nc] = (mpot[nbot_old[ic]] == 0) ? index[nbot_old[ic]] : -1;
+            }
+            if (ntop_old[ic] < (int)ncells && ntop_old[ic] >= 0){
+               ntop[nc] = (mpot[ntop_old[ic]] == 0) ? index[ntop_old[ic]] : -1;
+            }
+         } else if (mpot[ic] <= -2) {
+            nlft[nc]  = -1;
+            nrht[nc]  = -1;
+            nbot[nc]  = -1;
+            ntop[nc]  = -1;
+         } else if (mpot[ic] > 0){
+            nlft[nc]    = -1;
+            nlft[nc+1]  = -1;
+            nrht[nc]    = -1;
+            nrht[nc+1]  = -1;
+            nbot[nc]    = -1;
+            nbot[nc+1]  = -1;
+            ntop[nc]    = -1;
+            ntop[nc+1]  = -1;
+            if (celltype[nc] == REAL_CELL){
+               nlft[nc+2]  = -1;
+               nlft[nc+3]  = -1;
+               nrht[nc+2]  = -1;
+               nrht[nc+3]  = -1;
+               nbot[nc+2]  = -1;
+               nbot[nc+3]  = -1;
+               ntop[nc+2]  = -1;
+               ntop[nc+3]  = -1;
+            }
+         }
+         if (mpot[ic] > 0){
+            nc++;
+            switch(celltype[nc]){
+            case LEFT_BOUNDARY:
+               nlft[nc] = nc;
+               break;
+            case RIGHT_BOUNDARY:
+               nrht[nc] = nc;
+               break;
+            case BOTTOM_BOUNDARY:
+               nbot[nc] = nc;
+               break;
+            case TOP_BOUNDARY:
+               ntop[nc] = nc;
+               break;
+            }
+         }
+      }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      nlft_old = (int *)mesh_memory.memory_delete(nlft_old);
+      nrht_old = (int *)mesh_memory.memory_delete(nrht_old);
+      nbot_old = (int *)mesh_memory.memory_delete(nbot_old);
+      ntop_old = (int *)mesh_memory.memory_delete(ntop_old);
+#ifdef _OPENMP
+      } // end master region
+#pragma omp barrier
+#endif
+   } else {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      nlft = (int *)mesh_memory.memory_delete(nlft);
+      nrht = (int *)mesh_memory.memory_delete(nrht);
+      nbot = (int *)mesh_memory.memory_delete(nbot);
+      ntop = (int *)mesh_memory.memory_delete(ntop);
+#ifdef _OPENMP
+      } // end master region
+#pragma omp barrier
+#endif
+   }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   //ncells = nc;
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      MPI_Allgather(&new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }  
+      noffset=ndispl[mype];
+      ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
+   }  
+#endif
+
+   cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
+#ifdef _OPENMP
+   } // end master region
+#pragma omp barrier
+#endif
+
+   } // if do_rezone
+
+}
+
+#ifdef HAVE_OPENCL
+// Rebuild the device-resident mesh arrays after the refinement/coarsening
+// flags in dev_mpot have been computed.
+//
+//   icount, jcount    -- cell-count tallies from the caller; the new local cell
+//                        count used here is ncells + (icount - jcount).
+//   dev_mpot          -- device buffer of per-cell rezone flags. It is handed to
+//                        the rezone kernels and released before this routine
+//                        returns, so the caller must not reuse the handle.
+//   gpu_state_memory  -- every entry whose element size is 4 or 8 bytes is
+//                        replaced by a freshly allocated buffer that is filled
+//                        by kernel_rezone_one_float / kernel_rezone_one_double.
+//
+// Returns immediately when gpu_do_rezone is false. Besides dev_mpot, the
+// dev_ioffset buffer is also released here. This routine does not assign
+// ncells itself.
+void Mesh::gpu_rezone_all(int icount, int jcount, cl_mem &dev_mpot, MallocPlus &gpu_state_memory)
+{
+   if (! gpu_do_rezone) return;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   gpu_counters[MESH_COUNTER_REZONE]++;
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   assert(dev_mpot);
+   assert(dev_level);
+   assert(dev_i);
+   assert(dev_j);
+   assert(dev_celltype);
+   assert(dev_ioffset);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   int add_ncells = icount - jcount;
+
+   size_t old_ncells = ncells;
+   size_t new_ncells = ncells + add_ncells;
+
+   // The kernels below are given the cell counts as cl_int arguments. Pass
+   // correctly typed copies instead of the leading bytes of a size_t, which
+   // would only hold the right value on a little-endian host.
+   cl_int old_ncells_arg = (cl_int)old_ncells;
+   cl_int new_ncells_arg = (cl_int)new_ncells;
+
+   // i, j and level of the cells adjacent to this rank's range on the
+   // neighbouring ranks; they stay zero unless filled in by the MPI exchange.
+   int ifirst      = 0;
+   int ilast       = 0;
+   int jfirst      = 0;
+   int jlast       = 0;
+   int level_first = 0;
+   int level_last  = 0;
+
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      int i_tmp_first, i_tmp_last;
+      int j_tmp_first, j_tmp_last;
+      int level_tmp_first, level_tmp_last;
+
+      // Fetch i, j and level of the first and last local cell. Only the final
+      // read is issued as blocking (CL_TRUE).
+      // NOTE(review): relies on the queue completing the earlier reads before
+      // the blocking one returns -- confirm the queue is in-order.
+      ezcl_enqueue_read_buffer(command_queue,  dev_i,     CL_FALSE, 0,                             1*sizeof(cl_int), &i_tmp_first,     NULL);
+      ezcl_enqueue_read_buffer(command_queue,  dev_j,     CL_FALSE, 0,                             1*sizeof(cl_int), &j_tmp_first,     NULL);
+      ezcl_enqueue_read_buffer(command_queue,  dev_level, CL_FALSE, 0,                             1*sizeof(cl_int), &level_tmp_first, NULL);
+      ezcl_enqueue_read_buffer(command_queue,  dev_i,     CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &i_tmp_last,      NULL);
+      ezcl_enqueue_read_buffer(command_queue,  dev_j,     CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &j_tmp_last,      NULL);
+      ezcl_enqueue_read_buffer(command_queue,  dev_level, CL_TRUE,  (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &level_tmp_last,  NULL);
+
+      MPI_Request req[12];
+      MPI_Status status[12];
+
+      // Ranks at either end of the range talk to MPI_PROC_NULL on the open side.
+      int prev = MPI_PROC_NULL;
+      int next = MPI_PROC_NULL;
+
+      if (mype != 0)         prev = mype-1;
+      if (mype < numpe - 1)  next = mype+1;
+
+      // Send our last cell to the next rank and our first cell to the previous
+      // rank, receiving the matching values from the opposite side.
+      MPI_Isend(&i_tmp_last,      1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
+      MPI_Irecv(&ifirst,          1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
+
+      MPI_Isend(&i_tmp_first,     1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
+      MPI_Irecv(&ilast,           1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
+
+      MPI_Isend(&j_tmp_last,      1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
+      MPI_Irecv(&jfirst,          1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
+
+      MPI_Isend(&j_tmp_first,     1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
+      MPI_Irecv(&jlast,           1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
+
+      MPI_Isend(&level_tmp_last,  1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
+      MPI_Irecv(&level_first,     1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
+
+      MPI_Isend(&level_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
+      MPI_Irecv(&level_last,      1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
+
+      MPI_Waitall(12, req, status);
+   }
+#endif
+
+   // Destination buffers for the rezoned mesh, sized with mem_factor headroom.
+   size_t mem_request = (int)((float)new_ncells*mem_factor);
+   cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_level_new    = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"),    &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_i_new        = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"),        &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_j_new        = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"),        &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+   cl_mem dev_ijadd;
+
+   // Six neighbour values for the kernel; left at zero on a single rank.
+   vector<int>ijadd(6);
+   if (numpe > 1) {
+      ijadd[0] = ifirst;
+      ijadd[1] = ilast;
+      ijadd[2] = jfirst;
+      ijadd[3] = jlast;
+      ijadd[4] = level_first;
+      ijadd[5] = level_last;
+   }
+
+   size_t six = 6;
+   dev_ijadd = ezcl_malloc(NULL, const_cast<char *>("dev_ijadd"), &six, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+   ezcl_enqueue_write_buffer(command_queue, dev_ijadd, CL_TRUE, 0, 6*sizeof(cl_int), (void*)&ijadd[0], NULL);
+
+   cl_mem dev_indexoffset = ezcl_malloc(NULL, const_cast<char *>("dev_indexoffset"), &old_ncells, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+
+   int stencil = 0;
+   if (localStencil) stencil = 1;
+
+   // One work item per old cell, rounded up to a whole number of work groups.
+   size_t local_work_size = 128;
+   size_t global_work_size = ((old_ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   ezcl_set_kernel_arg(kernel_rezone_all, 0,  sizeof(cl_int),  (void *)&old_ncells_arg);
+   ezcl_set_kernel_arg(kernel_rezone_all, 1,  sizeof(cl_int),  (void *)&stencil);
+   ezcl_set_kernel_arg(kernel_rezone_all, 2,  sizeof(cl_int),  (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_rezone_all, 3,  sizeof(cl_mem),  (void *)&dev_mpot);
+   ezcl_set_kernel_arg(kernel_rezone_all, 4,  sizeof(cl_mem),  (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_rezone_all, 5,  sizeof(cl_mem),  (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_rezone_all, 6,  sizeof(cl_mem),  (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_rezone_all, 7,  sizeof(cl_mem),  (void *)&dev_celltype);
+   ezcl_set_kernel_arg(kernel_rezone_all, 8,  sizeof(cl_mem),  (void *)&dev_level_new);
+   ezcl_set_kernel_arg(kernel_rezone_all, 9,  sizeof(cl_mem),  (void *)&dev_i_new);
+   ezcl_set_kernel_arg(kernel_rezone_all, 10, sizeof(cl_mem),  (void *)&dev_j_new);
+   ezcl_set_kernel_arg(kernel_rezone_all, 11, sizeof(cl_mem),  (void *)&dev_celltype_new);
+   ezcl_set_kernel_arg(kernel_rezone_all, 12, sizeof(cl_mem),  (void *)&dev_ioffset);
+   ezcl_set_kernel_arg(kernel_rezone_all, 13, sizeof(cl_mem),  (void *)&dev_indexoffset);
+   ezcl_set_kernel_arg(kernel_rezone_all, 14, sizeof(cl_mem),  (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_rezone_all, 15, sizeof(cl_mem),  (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_rezone_all, 16, sizeof(cl_mem),  (void *)&dev_levtable);
+   ezcl_set_kernel_arg(kernel_rezone_all, 17, sizeof(cl_mem),  (void *)&dev_ijadd);
+   ezcl_set_kernel_arg(kernel_rezone_all, 18, local_work_size * sizeof(cl_uint), NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_all,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+   // Walk a copy of the state database so that replacing entries in
+   // gpu_state_memory does not disturb the iteration.
+   MallocPlus gpu_state_memory_old = gpu_state_memory;
+   malloc_plus_memory_entry *memory_item;
+
+   for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
+        memory_item != gpu_state_memory_old.memory_entry_by_name_end();
+        memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
+      cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
+
+      if (memory_item->mem_elsize == 8){
+#ifndef MINIMUM_PRECISION
+         cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_double), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
+
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 0, sizeof(cl_int),  (void *)&old_ncells_arg);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 1, sizeof(cl_mem),  (void *)&dev_i);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 2, sizeof(cl_mem),  (void *)&dev_j);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 3, sizeof(cl_mem),  (void *)&dev_nlft);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 4, sizeof(cl_mem),  (void *)&dev_nrht);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 5, sizeof(cl_mem),  (void *)&dev_nbot);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 6, sizeof(cl_mem),  (void *)&dev_ntop);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 7, sizeof(cl_mem),  (void *)&dev_celltype);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 8, sizeof(cl_mem),  (void *)&dev_mpot);
+         ezcl_set_kernel_arg(kernel_rezone_one_double, 9, sizeof(cl_mem),  (void *)&dev_indexoffset);
+         ezcl_set_kernel_arg(kernel_rezone_one_double,10, sizeof(cl_mem),  (void *)&dev_state_mem_ptr);
+         ezcl_set_kernel_arg(kernel_rezone_one_double,11, sizeof(cl_mem),  (void *)&dev_state_var_new);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_double,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+         gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+#else
+         printf("ERROR -- can't have double type for state variable\n");
+         exit(1);
+#endif
+      } else if (memory_item->mem_elsize == 4){
+         cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_float), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
+
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 0, sizeof(cl_int),  (void *)&old_ncells_arg);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 1, sizeof(cl_mem),  (void *)&dev_i);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 2, sizeof(cl_mem),  (void *)&dev_j);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 3, sizeof(cl_mem),  (void *)&dev_nlft);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 4, sizeof(cl_mem),  (void *)&dev_nrht);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 5, sizeof(cl_mem),  (void *)&dev_nbot);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 6, sizeof(cl_mem),  (void *)&dev_ntop);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 7, sizeof(cl_mem),  (void *)&dev_celltype);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 8, sizeof(cl_mem),  (void *)&dev_mpot);
+         ezcl_set_kernel_arg(kernel_rezone_one_float, 9, sizeof(cl_mem),  (void *)&dev_indexoffset);
+         ezcl_set_kernel_arg(kernel_rezone_one_float,10, sizeof(cl_mem),  (void *)&dev_state_mem_ptr);
+         ezcl_set_kernel_arg(kernel_rezone_one_float,11, sizeof(cl_mem),  (void *)&dev_state_var_new);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_float,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+         gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+      }
+   }
+
+   // Remap the neighbour arrays on the device only for a non-parallel run with
+   // neighbor_remap enabled; otherwise drop them. This is a logical test, so
+   // use && rather than the bitwise &.
+   if (neighbor_remap && ! parallel) {
+      size_t nbr_mem_request = (int)((float)new_ncells*mem_factor);
+      cl_mem dev_nlft_new = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_new"), &nbr_mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_nrht_new = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_new"), &nbr_mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_nbot_new = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_new"), &nbr_mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_ntop_new = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_new"), &nbr_mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      // NOTE(review): this launch reuses global_work_size, which was derived
+      // from old_ncells, while the kernel is handed new_ncells as its count.
+      // If new_ncells exceeds global_work_size the tail of the new arrays may
+      // be left untouched by this kernel -- confirm against the kernel source.
+      ezcl_set_kernel_arg(kernel_neighbor_init,  0, sizeof(cl_int),   (void *)&new_ncells_arg);
+      ezcl_set_kernel_arg(kernel_neighbor_init,  1, sizeof(cl_mem),   (void *)&dev_nlft_new);
+      ezcl_set_kernel_arg(kernel_neighbor_init,  2, sizeof(cl_mem),   (void *)&dev_nrht_new);
+      ezcl_set_kernel_arg(kernel_neighbor_init,  3, sizeof(cl_mem),   (void *)&dev_nbot_new);
+      ezcl_set_kernel_arg(kernel_neighbor_init,  4, sizeof(cl_mem),   (void *)&dev_ntop_new);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_neighbor_init,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  0, sizeof(cl_int),  (void *)&old_ncells_arg);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  1, sizeof(cl_mem),  (void *)&dev_mpot);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  2, sizeof(cl_mem),  (void *)&dev_indexoffset);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  3, sizeof(cl_mem),  (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  4, sizeof(cl_mem),  (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  5, sizeof(cl_mem),  (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  6, sizeof(cl_mem),  (void *)&dev_ntop);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  7, sizeof(cl_mem),  (void *)&dev_celltype_new);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  8, sizeof(cl_mem),  (void *)&dev_nlft_new);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors,  9, sizeof(cl_mem),  (void *)&dev_nrht_new);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors, 10, sizeof(cl_mem),  (void *)&dev_nbot_new);
+      ezcl_set_kernel_arg(kernel_rezone_neighbors, 11, sizeof(cl_mem),  (void *)&dev_ntop_new);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_neighbors,   1, NULL, &global_work_size, &local_work_size, NULL);
+   
+      // After the swap the *_new handles refer to the old arrays, which are
+      // then released.
+      ezcl_device_memory_swap(&dev_nlft, &dev_nlft_new);
+      ezcl_device_memory_swap(&dev_nrht, &dev_nrht_new);
+      ezcl_device_memory_swap(&dev_nbot, &dev_nbot_new);
+      ezcl_device_memory_swap(&dev_ntop, &dev_ntop_new);
+
+      ezcl_device_memory_delete(dev_nlft_new);
+      ezcl_device_memory_delete(dev_nrht_new);
+      ezcl_device_memory_delete(dev_nbot_new);
+      ezcl_device_memory_delete(dev_ntop_new);
+   } else {
+      ezcl_device_memory_delete(dev_nlft);
+      ezcl_device_memory_delete(dev_nrht);
+      ezcl_device_memory_delete(dev_nbot);
+      ezcl_device_memory_delete(dev_ntop);
+      dev_nlft = NULL;
+      dev_nrht = NULL;
+      dev_nbot = NULL;
+      dev_ntop = NULL;
+   }
+
+   ezcl_device_memory_delete(dev_indexoffset);
+
+   if (new_ncells != old_ncells){
+      resize_old_device_memory(new_ncells);
+   }
+
+   // Make the rezoned arrays current, then release the buffers that are no
+   // longer needed (the *_new handles now hold the previous arrays).
+   ezcl_device_memory_swap(&dev_celltype, &dev_celltype_new);
+   ezcl_device_memory_swap(&dev_level, &dev_level_new);
+   ezcl_device_memory_swap(&dev_i, &dev_i_new);
+   ezcl_device_memory_swap(&dev_j, &dev_j_new);
+
+   ezcl_device_memory_delete(dev_mpot);
+   ezcl_device_memory_delete(dev_ijadd);
+   ezcl_device_memory_delete(dev_ioffset);
+
+   ezcl_device_memory_delete(dev_i_new);
+   ezcl_device_memory_delete(dev_j_new);
+   ezcl_device_memory_delete(dev_celltype_new);
+   ezcl_device_memory_delete(dev_level_new);
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      // MPI_INT-typed copy of the new local cell count for the gather; named
+      // distinctly so it no longer shadows the size_t new_ncells above.
+      int local_new_ncells = ncells + add_ncells;
+      MPI_Allgather(&local_new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+      // Prefix-sum the per-rank sizes into displacements.
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      noffset=ndispl[mype];
+      ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
+   }
+#endif
+
+   gpu_timers[MESH_TIMER_REZONE_ALL] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+/**
+ * Compute the left/right/bottom/top neighbor index arrays (nlft, nrht, nbot,
+ * ntop) for cells 0..ncells-1.
+ *
+ * The neighbor search only runs when do_rezone is set; otherwise just the
+ * MESH_TIMER_CALC_NEIGHBORS timer is updated. The neighbor arrays are
+ * (re)allocated when they are missing or hold fewer than ncells entries, and
+ * in that case the entries inside the get_bounds() range are reset to -1.
+ * Neighbors are then found either through a compact hash
+ * (calc_neighbor_type == HASH_TABLE) or through k-D tree point queries
+ * (calc_neighbor_type == KDTREE). Finally ncells_ghost is set to ncells.
+ *
+ * When built with OpenMP the pragmas below are orphaned master/for/barrier
+ * constructs; presumably every thread of an enclosing parallel region is
+ * expected to call this routine -- confirm against the callers.
+ *
+ * @param ncells  number of cells whose neighbors are computed
+ */
+void Mesh::calc_neighbors(int ncells)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   if (do_rezone) {
+
+      int flags = INDEX_ARRAY_MEMORY;
+
+#if defined (HAVE_J7)
+      if (parallel) flags |= LOAD_BALANCE_MEMORY;
+#endif
+
+      // Static: written by the master thread below and read by every thread
+      // after the barrier to decide whether the arrays need initializing.
+      static int nlft_size = 0;
+
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      cpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+      if (nlft != NULL){
+         nlft_size = mesh_memory.get_memory_size(nlft);
+      } else {
+         // The arrays are not allocated, so the size remembered from an
+         // earlier call must not be reused: a stale value >= ncells would
+         // skip the allocation below and leave nlft NULL for the search.
+         nlft_size = 0;
+      }
+
+      if (nlft_size < ncells){
+         if (nlft != NULL){
+            nlft = (int *)mesh_memory.memory_delete(nlft);
+            nrht = (int *)mesh_memory.memory_delete(nrht);
+            nbot = (int *)mesh_memory.memory_delete(nbot);
+            ntop = (int *)mesh_memory.memory_delete(ntop);
+         }
+
+         nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
+         nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
+         nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
+         ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
+      }
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      // nlft_size still holds the size from before any reallocation above, so
+      // this test is true exactly when new arrays were just allocated. Mark
+      // the new entries as unknown (-1).
+      // NOTE(review): the range comes from get_bounds(); this routine does not
+      // call set_bounds(ncells) itself -- confirm the caller has set it.
+      if (nlft_size < ncells){
+         int lowerBounds, upperBounds;
+         get_bounds(lowerBounds, upperBounds);
+
+         for(int ic=lowerBounds; ic<upperBounds; ic++){
+            nlft[ic] = -1;
+            nrht[ic] = -1;
+            nbot[ic] = -1;
+            ntop[ic] = -1;
+         }
+      }
+
+      if (calc_neighbor_type == HASH_TABLE) {
+
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         // Hash extents: the (imax+1) x (jmax+1) index range scaled by
+         // IPOW2(levmx).
+         int jmaxsize = (jmax+1)*IPOW2(levmx);
+         int imaxsize = (imax+1)*IPOW2(levmx);
+
+         int *hash;
+
+#ifdef _OPENMP
+         hash = compact_hash_init_openmp(ncells, imaxsize, jmaxsize, 0);
+#else
+         hash = compact_hash_init(ncells, imaxsize, jmaxsize, 0);
+#endif
+
+         // Hash setup: only cells that may take part in a lookup are written.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for(int ic=0; ic<ncells; ic++){
+            int lev = level[ic];
+
+            // A cell goes into the hash when one of its own neighbors is
+            // still unknown ...
+            bool need_hash = (nlft[ic] == -1 || nrht[ic] == -1 || nbot[ic] == -1 || ntop[ic] == -1);
+
+            // ... or when a neighbor at a finer level has an unknown top
+            // (left/right neighbors) or right (bottom/top neighbors) link.
+            if (! need_hash){
+               if ( (level[nlft[ic]] > lev && ntop[nlft[ic]] == -1) ||
+                    (level[nrht[ic]] > lev && ntop[nrht[ic]] == -1) ||
+                    (level[nbot[ic]] > lev && nrht[nbot[ic]] == -1) ||
+                    (level[ntop[ic]] > lev && nrht[ntop[ic]] == -1) ) need_hash = true;
+            }
+
+            if (need_hash) {
+               // Key is the cell's (i,j) scaled by IPOW2(levmx-lev).
+               int levmult = IPOW2(levmx-lev);
+               int ii = i[ic]*levmult;
+               int jj = j[ic]*levmult;
+
+               write_hash(ic,jj*imaxsize+ii,hash);
+            }
+         }
+
+         if (TIMING_LEVEL >= 2) {
+            // The master pragma covers only the accumulation statement; each
+            // thread restarts its own local tstart_lev2.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         // Hash query: resolve every neighbor that is still negative.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for (int ic=0; ic<ncells; ic++){
+            int ii = i[ic];
+            int jj = j[ic];
+            int lev = level[ic];
+            int levmult = IPOW2(levmx-lev);
+            int iicur = ii*levmult;
+            int iilft = max( (ii-1)*levmult, 0         );
+            int iirht = min( (ii+1)*levmult, imaxsize-1);
+            int jjcur = jj*levmult;
+            int jjbot = max( (jj-1)*levmult, 0         );
+            int jjtop = min( (jj+1)*levmult, jmaxsize-1);
+
+            // Start from the values already stored; only negative entries
+            // are looked up below.
+            int nlftval = nlft[ic];
+            int nrhtval = nrht[ic];
+            int nbotval = nbot[ic];
+            int ntopval = ntop[ic];
+
+            // Taking care of boundary cells
+            // Force each boundary cell to point to itself on its boundary direction
+            if (nlftval < 0 && iicur <    1*IPOW2(levmx)  ) nlftval = ic;
+            if (nbotval < 0 && jjcur <    1*IPOW2(levmx)  ) nbotval = ic;
+            if (nrhtval < 0 && iicur > imax*IPOW2(levmx)-1) nrhtval = ic;
+            if (ntopval < 0 && jjcur > jmax*IPOW2(levmx)-1) ntopval = ic;
+            // Boundary cells next to corner boundary need special checks
+            if (nlftval < 0 && iicur ==    1*IPOW2(levmx) &&  (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nlftval = ic;
+            if (nbotval < 0 && jjcur ==    1*IPOW2(levmx) &&  (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) nbotval = ic;
+            if (nrhtval < 0 && iirht == imax*IPOW2(levmx) &&  (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nrhtval = ic;
+            if (ntopval < 0 && jjtop == jmax*IPOW2(levmx) &&  (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) ntopval = ic;
+
+            // need to check for finer neighbor first
+            // Right and top neighbor don't change for finer, so drop through to same size
+            // Left and bottom need to be half of same size index for finer test
+            if (lev != levmx) {
+               int iilftfiner = iicur-(iicur-iilft)/2;
+               int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+               if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilftfiner, hash);
+               if (nbotval < 0) nbotval = read_hash(jjbotfiner*imaxsize+iicur, hash);
+            }
+
+            // same size neighbor
+            if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilft, hash);
+            if (nrhtval < 0) nrhtval = read_hash(jjcur*imaxsize+iirht, hash);
+            if (nbotval < 0) nbotval = read_hash(jjbot*imaxsize+iicur, hash);
+            if (ntopval < 0) ntopval = read_hash(jjtop*imaxsize+iicur, hash);
+
+            // Now we need to take care of special case where bottom and left boundary need adjustment since
+            // expected cell doesn't exist on these boundaries if it is finer than current cell
+            if (lev != levmx) {
+               if (jjcur < 1*IPOW2(levmx)) {
+                  if (nrhtval < 0) {
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     nrhtval = read_hash(jjtopfiner*imaxsize+iirht, hash);
+                  }
+                  if (nlftval < 0) {
+                     int iilftfiner = iicur-(iicur-iilft)/2;
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     nlftval = read_hash(jjtopfiner*imaxsize+iilftfiner, hash);
+                  }
+               }
+
+               if (iicur < 1*IPOW2(levmx)) {
+                  if (ntopval < 0) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     ntopval = read_hash(jjtop*imaxsize+iirhtfiner, hash);
+                  }
+                  if (nbotval < 0) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                     nbotval = read_hash(jjbotfiner*imaxsize+iirhtfiner, hash);
+                  }
+               }
+            }
+
+            // coarser neighbor
+            if (lev != 0){
+               if (nlftval < 0) {
+                  // Step one more cell width to the left and align j to the
+                  // even index, i.e. the origin of the 2x2 parent block.
+                  iilft -= iicur-iilft;
+                  int jjlft = (jj/2)*2*levmult;
+                  nlftval = read_hash(jjlft*imaxsize+iilft, hash);
+               }
+               if (nrhtval < 0) {
+                  int jjrht = (jj/2)*2*levmult;
+                  nrhtval = read_hash(jjrht*imaxsize+iirht, hash);
+               }
+               if (nbotval < 0) {
+                  jjbot -= jjcur-jjbot;
+                  int iibot = (ii/2)*2*levmult;
+                  nbotval = read_hash(jjbot*imaxsize+iibot, hash);
+               }
+               if (ntopval < 0) {
+                  int iitop = (ii/2)*2*levmult;
+                  ntopval = read_hash(jjtop*imaxsize+iitop, hash);
+               }
+            }
+
+            nlft[ic] = nlftval;
+            nrht[ic] = nrhtval;
+            nbot[ic] = nbotval;
+            ntop[ic] = ntopval;
+         }
+
+         // Reporting, hash release and query timing are done once, by the
+         // master thread, after all threads have finished their lookups.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         write_hash_collision_report();
+         read_hash_collision_report();
+
+         compact_hash_delete(hash);
+
+         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
+#ifdef _OPENMP
+         } // master block
+#endif
+
+      } else if (calc_neighbor_type == KDTREE) {
+
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         // The whole k-D tree path runs on the master thread only.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         TBounds box;
+         // NOTE(review): the result buffer holds IPOW2(levmx*levmx) entries;
+         // confirm this bounds what KDTree_QueryBoxIntersect can return.
+         vector<int> index_list(IPOW2(levmx*levmx) );
+
+         int num;
+
+         ibase = 0;
+         calc_spatial_coordinates(ibase);
+
+         kdtree_setup();
+
+         if (TIMING_LEVEL >= 2) {
+            cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         // Each direction queries a degenerate box (min == max, i.e. a single
+         // point) offset from (x[ic], y[ic]) by fractions of dx[ic]/dy[ic].
+         // The neighbor defaults to the cell itself and is replaced only when
+         // the query reports exactly one hit.
+         for (int ic=0; ic<ncells; ic++) {
+
+            //left
+            nlft[ic]  = ic;
+            box.min.x = x[ic]-0.25*dx[ic];
+            box.max.x = x[ic]-0.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nlft[ic]=index_list[0];
+
+            //right
+            nrht[ic]  = ic;
+            box.min.x = x[ic]+1.25*dx[ic];
+            box.max.x = x[ic]+1.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nrht[ic]=index_list[0];
+
+            //bot
+            nbot[ic]  = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]-0.25*dy[ic];
+            box.max.y = y[ic]-0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nbot[ic]=index_list[0];
+
+            //top
+            ntop[ic]  = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]+1.25*dy[ic];
+            box.max.y = y[ic]+1.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) ntop[ic]=index_list[0];
+         }  //  End main loop over cells.
+
+         KDTree_Destroy(&tree);
+
+         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+      } // calc_neighbor_type
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+      ncells_ghost = ncells;
+
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
+}
+
+void Mesh::calc_neighbors_local(void)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   if (do_rezone) {
+
+      int flags = INDEX_ARRAY_MEMORY;
+
+#if defined (HAVE_J7)
+      if (parallel) flags |= LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      cpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+      if (mesh_memory.get_memory_size(nlft) < ncells){
+         if (nlft != NULL) nlft = (int *)mesh_memory.memory_delete(nlft);
+         if (nrht != NULL) nrht = (int *)mesh_memory.memory_delete(nrht);
+         if (nbot != NULL) nbot = (int *)mesh_memory.memory_delete(nbot);
+         if (ntop != NULL) ntop = (int *)mesh_memory.memory_delete(ntop);
+         nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
+         nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
+         nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
+         ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
+      }
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      int lowerBound, upperBound;
+      set_bounds(ncells);
+      get_bounds(lowerBound, upperBound);
+      for (int ic = lowerBound; ic < upperBound; ic++){
+         nlft[ic] = -98;
+         nrht[ic] = -98;
+         nbot[ic] = -98;
+         ntop[ic] = -98;
+      }
+
+      if (calc_neighbor_type == HASH_TABLE) {
+
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         ncells_ghost = ncells;
+
+         // Find maximum i column and j row for this processor
+         static int jmintile, imintile, jmaxtile, imaxtile;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         jmintile = (jmax+1)*IPOW2(levmx);
+         imintile = (imax+1)*IPOW2(levmx);
+         jmaxtile = 0;
+         imaxtile = 0;
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         int my_jmintile = jmintile;
+         int my_imintile = imintile;
+         int my_jmaxtile = 0;
+         int my_imaxtile = 0;
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for(uint ic=0; ic<ncells; ic++){
+            int lev = level[ic];
+//          if (lev < 0 || lev > levmx) printf("DEBUG -- cell %d lev %d\n",ic,level[ic]);
+            if ( j[ic]   *IPOW2(levmx-lev)   < my_jmintile) my_jmintile =  j[ic]   *IPOW2(levmx-lev)  ;
+            if ((j[ic]+1)*IPOW2(levmx-lev)-1 > my_jmaxtile) my_jmaxtile = (j[ic]+1)*IPOW2(levmx-lev)-1;
+            if ( i[ic]   *IPOW2(levmx-lev)   < my_imintile) my_imintile =  i[ic]   *IPOW2(levmx-lev)  ;
+            if ((i[ic]+1)*IPOW2(levmx-lev)-1 > my_imaxtile) my_imaxtile = (i[ic]+1)*IPOW2(levmx-lev)-1;
+         }
+#ifdef _OPENMP
+#pragma omp critical
+         {
+#endif
+            if (my_jmintile < jmintile) jmintile = my_jmintile;
+            if (my_imintile < imintile) imintile = my_imintile;
+            if (my_jmaxtile > jmaxtile) jmaxtile = my_jmaxtile;
+            if (my_imaxtile > imaxtile) imaxtile = my_imaxtile;
+#ifdef _OPENMP
+         } // end critical region
+#pragma omp barrier
+#endif
+
+      //if (DEBUG) fprintf(fp,"%d: Tile Sizes are imin %d imax %d jmin %d jmax %d\n",mype,imintile,imaxtile,jmintile,jmaxtile);
+
+      // Expand size by 2*coarse_cells for ghost cells
+      int jminsize = max(jmintile-2*IPOW2(levmx),0);
+      int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
+      int iminsize = max(imintile-2*IPOW2(levmx),0);
+      int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
+      //if (DEBUG) fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+
+      //fprintf(fp,"DEBUG -- ncells %lu\n",ncells);
+
+      static int *hash;
+
+#ifdef _OPENMP
+      hash = compact_hash_init_openmp(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
+#else
+      hash = compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
+#endif
+
+      //printf("%d: DEBUG -- noffset %d cells %d\n",mype,noffset,ncells);
+
+      if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+         fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+      }
+
+      static int imaxcalc, jmaxcalc;
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for(uint ic=0; ic<ncells; ic++){
+         int cellnumber = ic+noffset;
+         int lev = level[ic];
+         int levmult = IPOW2(levmx-lev);
+         int ii = i[ic]*levmult-iminsize;
+         int jj = j[ic]*levmult-jminsize;
+
+         write_hash(cellnumber, jj*(imaxsize-iminsize)+ii, hash);
+      } // end for loop
+
+      if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+         cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      // Set neighbors to global cell numbers from hash
+      jmaxcalc = (jmax+1)*IPOW2(levmx);
+      imaxcalc = (imax+1)*IPOW2(levmx);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+      for (uint ic=0; ic<ncells; ic++){
+         int ii = i[ic];
+         int jj = j[ic];
+         int lev = level[ic];
+         int levmult = IPOW2(levmx-lev);
+
+         int iicur = ii*levmult-iminsize;
+         int iilft = max( (ii-1)*levmult, 0         )-iminsize;
+         int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;   
+         int jjcur = jj*levmult-jminsize;
+         int jjbot = max( (jj-1)*levmult, 0         )-jminsize;
+         int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;   
+
+         int nlftval = -1;
+         int nrhtval = -1;
+         int nbotval = -1;
+         int ntopval = -1;
+
+         // Taking care of boundary cells
+         // Force each boundary cell to point to itself on its boundary direction
+         if (iicur <    1*IPOW2(levmx)  -iminsize) nlftval = ic+noffset;
+         if (jjcur <    1*IPOW2(levmx)  -jminsize) nbotval = ic+noffset;
+         if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = ic+noffset;
+         if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = ic+noffset;
+         // Boundary cells next to corner boundary need special checks
+         if (iicur ==    1*IPOW2(levmx)-iminsize &&  (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = ic+noffset;
+         if (jjcur ==    1*IPOW2(levmx)-jminsize &&  (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = ic+noffset;
+         if (iirht == imax*IPOW2(levmx)-iminsize &&  (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = ic+noffset;
+         if (jjtop == jmax*IPOW2(levmx)-jminsize &&  (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = ic+noffset;
+
+         // need to check for finer neighbor first
+         // Right and top neighbor don't change for finer, so drop through to same size
+         // Left and bottom need to be half of same size index for finer test
+         if (lev != levmx) {
+            int iilftfiner = iicur-(iicur-iilft)/2;
+            int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+            if (nlftval < 0) nlftval = read_hash(jjcur     *(imaxsize-iminsize)+iilftfiner, hash);
+            if (nbotval < 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur,      hash);
+         }
+
+         // same size neighbor
+         if (nlftval < 0) {
+            int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+            if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev) nlftval = nlfttry;
+         }
+         if (nrhtval < 0) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+         if (nbotval < 0) {
+            int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+            if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev) nbotval = nbottry;
+         }
+         if (ntopval < 0) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+              
+         // Now we need to take care of special case where bottom and left boundary need adjustment since
+         // expected cell doesn't exist on these boundaries if it is finer than current cell
+         if (lev != levmx) {
+            if (jjcur < 1*IPOW2(levmx)) {
+               if (nrhtval < 0) {
+                  int jjtopfiner = (jjcur+jjtop)/2;
+                  nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+               }
+               if (nlftval < 0) {
+                  int iilftfiner = iicur-(iicur-iilft)/2;
+                  int jjtopfiner = (jjcur+jjtop)/2;
+                  nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+               }
+            }
+
+            if (iicur < 1*IPOW2(levmx)) {
+               if (ntopval < 0) {
+                  int iirhtfiner = (iicur+iirht)/2;
+                  ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+               }
+               if (nbotval < 0) {
+                  int iirhtfiner = (iicur+iirht)/2;
+                  int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                  nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+               }
+            }
+         }
+
+         // coarser neighbor
+         if (lev != 0){
+            if (nlftval < 0) {
+               iilft -= iicur-iilft;
+               int jjlft = (jj/2)*2*levmult-jminsize;
+               int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+               if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev-1) nlftval = nlfttry;
+            }       
+            if (nrhtval < 0) {
+               int jjrht = (jj/2)*2*levmult-jminsize;
+               int nrhttry = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
+               if (nrhttry >= 0 && nrhttry < (int)ncells && level[nrhttry] == lev-1) nrhtval = nrhttry;
+            }       
+            if (nbotval < 0) {
+               jjbot -= jjcur-jjbot;
+               int iibot = (ii/2)*2*levmult-iminsize;
+               int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+               if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev-1) nbotval = nbottry;
+            }       
+            if (ntopval < 0) {
+               int iitop = (ii/2)*2*levmult-iminsize;
+               int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
+               if (ntoptry >= 0 && ntoptry < (int)ncells && level[ntoptry] == lev-1) ntopval = ntoptry;
+            }       
+         }       
+
+         nlft[ic] = nlftval;
+         nrht[ic] = nrhtval;
+         nbot[ic] = nbotval;
+         ntop[ic] = ntopval;
+
+         //fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+      }
+
+      if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         print_local();
+
+         int jmaxglobal = (jmax+1)*IPOW2(levmx);
+         int imaxglobal = (imax+1)*IPOW2(levmx);
+         fprintf(fp,"\n                                    HASH 0 numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    nlft numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+                     if (hashval >= 0 && hashval < (int)ncells) {
+                        fprintf(fp,"%5d",nlft[hashval]);
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+      
+         fprintf(fp,"\n                                    nrht numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+                     if (hashval >= 0 && hashval < (int)ncells) {
+                        fprintf(fp,"%5d",nrht[hashval]);
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    nbot numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+                     if (hashval >= 0 && hashval < (int)ncells) {
+                        fprintf(fp,"%5d",nbot[hashval]);
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    ntop numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+                     if (hashval >= 0 && hashval < (int)ncells) {
+                        fprintf(fp,"%5d",ntop[hashval]);
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+      }
+
+      if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+         cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+#ifdef HAVE_MPI
+      if (numpe > 1) {
+         static int num_comm_partners;
+
+         static vector<int> iminsize_global;
+         static vector<int> imaxsize_global;
+         static vector<int> jminsize_global;
+         static vector<int> jmaxsize_global;
+         static vector<int> comm_partner;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         iminsize_global.resize(numpe);
+         imaxsize_global.resize(numpe);
+         jminsize_global.resize(numpe);
+         jmaxsize_global.resize(numpe);
+         comm_partner.resize(numpe,-1);
+
+         MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+         num_comm_partners = 0;
+         for (int ip = 0; ip < numpe; ip++){
+            if (ip == mype) continue;
+            if (iminsize_global[ip] > imaxtile) continue;
+            if (imaxsize_global[ip] < imintile) continue;
+            if (jminsize_global[ip] > jmaxtile) continue;
+            if (jmaxsize_global[ip] < jmintile) continue;
+            comm_partner[num_comm_partners] = ip;
+            num_comm_partners++;
+            //if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
+         }
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         static vector<int> border_cell;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         border_cell.resize(ncells);
+
+#ifdef BOUNDS_CHECK
+         for (uint ic=0; ic<ncells; ic++){
+            int nl = nlft[ic];
+            if (nl != -1){
+               nl -= noffset;
+               if (nl<0 || nl>= (int)ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+            }
+            int nr = nrht[ic];
+            if (nr != -1){
+               nr -= noffset;
+               if (nr<0 || nr>= (int)ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+            }
+            int nb = nbot[ic];
+            if (nb != -1){
+               nb -= noffset;
+               if (nb<0 || nb>= (int)ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+            }
+            int nt = ntop[ic];
+            if (nt != -1){
+               nt -= noffset;
+               if (nt<0 || nt>= (int)ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+            }
+         }
+#endif
+
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         static vector<int> border_cell_out;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         border_cell_out.resize(ncells);
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (uint ic=0; ic<ncells; ic++){
+               int iborder_cell = 0;
+
+               // left neighbor is undefined -- or -- if left is at finer level check left top for undefined
+               if (nlft[ic] == -1 || (level[nlft[ic]-noffset] > level[ic] && ntop[nlft[ic]-noffset] == -1) ){
+                  iborder_cell |= 0x0001;
+               }
+               if (nrht[ic] == -1 || (level[nrht[ic]-noffset] > level[ic] && ntop[nrht[ic]-noffset] == -1) ){
+                  iborder_cell |= 0x0002;
+               }
+               if (nbot[ic] == -1 || (level[nbot[ic]-noffset] > level[ic] && nrht[nbot[ic]-noffset] == -1) ) {
+                  iborder_cell |= 0x0004;
+               }
+               if (ntop[ic] == -1 || (level[ntop[ic]-noffset] > level[ic] && nrht[ntop[ic]-noffset] == -1) ) {
+                  iborder_cell |= 0x0008;
+               }
+   
+               border_cell[ic] = iborder_cell;
+            }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (uint ic=0; ic<ncells; ic++){
+               int iborder_cell = border_cell[ic];
+
+               if (iborder_cell == 0) {
+
+                  int nl = nlft[ic]-noffset;
+                  if (nl >= 0 && nl < (int)ncells) {
+                     if ((border_cell[nl] & 0x0001) == 0x0001) {
+                        iborder_cell |= 0x0016;
+                     } else if (level[nl] > level[ic]){
+                        int ntl = ntop[nl]-noffset;
+                        if (ntl >= 0 && ntl < (int)ncells && (border_cell[ntl] & 0x0001) == 0x0001) {
+                           iborder_cell |= 0x0016;
+                        }
+                     }
+                  }
+                  int nr = nrht[ic]-noffset;
+                  if (nr >= 0 && nr < (int)ncells) {
+                     if ((border_cell[nrht[ic]-noffset] & 0x0002) == 0x0002) {
+                        iborder_cell |= 0x0032;
+                     } else if (level[nr] > level[ic]){
+                        int ntr = ntop[nr]-noffset;
+                        if (ntr >= 0 && ntr < (int)ncells && (border_cell[ntr] & 0x0002) == 0x0002) {
+                           iborder_cell |= 0x0032;
+                        }
+                     }
+                  }
+                  int nb = nbot[ic]-noffset;
+                  if (nb >= 0 && nb < (int)ncells) {
+                     if ((border_cell[nb] & 0x0004) == 0x0004) {
+                        iborder_cell |= 0x0064;
+                     } else if (level[nb] > level[ic]){
+                        int nrb = nrht[nb]-noffset;
+                        if (nrb >= 0 && nrb < (int)ncells && (border_cell[nrb] & 0x0004) == 0x0004) {
+                           iborder_cell |= 0x0064;
+                        }
+                     }
+                  }
+                  int nt = ntop[ic]-noffset;
+                  if (nt >= 0 && nt < (int)ncells) {
+                     if ((border_cell[nt] & 0x0008) == 0x0008) {
+                        iborder_cell |= 0x0128;
+                     } else if (level[nt] > level[ic]){
+                        int nrt = nrht[nt]-noffset;
+                        if (nrt >= 0 && nrt < (int)ncells && (border_cell[nrt] & 0x0008) == 0x0008) {
+                           iborder_cell |= 0x0128;
+                        }
+                     }
+                  }
+               }
+
+               border_cell_out[ic] = iborder_cell;
+            }
+// indent offset
+
+         vector<int> border_cell_num;
+
+         static int nbsize_local;
+
+         static vector<int> border_cell_i;
+         static vector<int> border_cell_j;
+         static vector<int> border_cell_level;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         for (int ic=0; ic<(int)ncells; ic++){
+            if (border_cell_out[ic] > 0) border_cell_num.push_back(ic+noffset);
+         }
+         //printf("%d: border cell size is %d\n",mype,border_cell_num.size());
+
+         nbsize_local = border_cell_num.size();
+
+         border_cell_i.resize(nbsize_local);
+         border_cell_j.resize(nbsize_local);
+         border_cell_level.resize(nbsize_local);
+
+         for (int ic = 0; ic <nbsize_local; ic++){
+            int cell_num = border_cell_num[ic]-noffset;
+            border_cell_i[ic] = i[cell_num]; 
+            border_cell_j[ic] = j[cell_num]; 
+            border_cell_level[ic] = level[cell_num]; 
+         }
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            fprintf(fp,"%d: Border cell size is %d\n",mype,nbsize_local);
+            for (int ib = 0; ib <nbsize_local; ib++){
+               fprintf(fp,"%d: Border cell %d is %d i %d j %d level %d\n",mype,ib,border_cell_num[ib],
+                  border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+            }
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_FIND_BOUNDARY] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         // Allocate push database
+
+         static int **send_database;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         send_database = (int**)malloc(num_comm_partners*sizeof(int *));
+         for (int ip = 0; ip < num_comm_partners; ip++){
+            send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
+         }
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         // Compute the overlap between processor bounding boxes and set up push database
+
+         static vector<int> send_buffer_count;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         send_buffer_count.resize(num_comm_partners);
+         for (int ip = 0; ip < num_comm_partners; ip++){
+            int icount = 0;
+            for (int ib = 0; ib <nbsize_local; ib++){
+               int lev = border_cell_level[ib];
+               int levmult = IPOW2(levmx-lev);
+               if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] && 
+                   border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] && 
+                   border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] && 
+                   border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
+                  //   border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+                  send_database[ip][icount] = ib;
+                  icount++;
+               }
+            }
+            send_buffer_count[ip]=icount;
+         }
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         // Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and 
+         // send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
+         // It will return receive_count_total for use in allocations
+
+         static int receive_count_total;
+         int i_push_handle = 0;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         i_push_handle = 0;
+         L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
+                       send_database, &receive_count_total, &i_push_handle);
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
+            for (int ip = 0; ip < num_comm_partners; ip++){
+               fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
+               for (int ic = 0; ic < send_buffer_count[ip]; ic++){
+                  int ib = send_database[ip][ic];
+                  fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
+                     border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+               }
+            }
+#ifdef _OPENMP
+         }
+#endif
+         }
+
+         // Can now free the send database. Other arrays are vectors and will automatically 
+         // deallocate
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         for (int ip = 0; ip < num_comm_partners; ip++){
+            free(send_database[ip]);
+         }
+         free(send_database);
+#ifdef _OPENMP
+         }
+#endif
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_PUSH_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         // Push the data needed to the adjacent processors
+         static int *border_cell_num_local;
+         static int *border_cell_i_local;
+         static int *border_cell_j_local;
+         static int *border_cell_level_local;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
+         border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
+         border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
+         border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
+
+         L7_Push_Update(&border_cell_num[0],   border_cell_num_local,   i_push_handle);
+         L7_Push_Update(&border_cell_i[0],     border_cell_i_local,     i_push_handle);
+         L7_Push_Update(&border_cell_j[0],     border_cell_j_local,     i_push_handle);
+         L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
+
+         L7_Push_Free(&i_push_handle);
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         nbsize_local = receive_count_total; 
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            for (int ic = 0; ic < nbsize_local; ic++) {
+               fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
+                  border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
+            }
+#ifdef _OPENMP
+         }
+#endif
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_PUSH_BOUNDARY] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_LOCAL_LIST] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            int jmaxglobal = (jmax+1)*IPOW2(levmx);
+            int imaxglobal = (imax+1)*IPOW2(levmx);
+            fprintf(fp,"\n                                    HASH numbering before layer 1\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if (ii >= iminsize && ii < imaxsize) {
+                        fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+#ifdef _OPENMP
+         }
+#endif
+         }
+
+         vector<int> border_cell_needed_local;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         border_cell_needed_local.resize(nbsize_local, 0);
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+
+         // Layer 1
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         for (int ic =0; ic<nbsize_local; ic++){
+            int jj = border_cell_j_local[ic];
+            int ii = border_cell_i_local[ic];
+            int lev = border_cell_level_local[ic];
+            int levmult = IPOW2(levmx-lev);
+
+            int iicur = ii*levmult-iminsize;
+            int iilft = max( (ii-1)*levmult, 0         )-iminsize;
+            int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+            int jjcur = jj*levmult-jminsize;
+            int jjbot = max( (jj-1)*levmult, 0         )-jminsize;
+            int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+            //fprintf(fp,"DEBUG layer ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
+
+            int iborder = 0;
+
+            // Test for cell to left
+            if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 && (jjcur+jjtop)/2 < jmaxsize-jminsize){
+               int nlftval = -1;
+               // Check for finer cell left and bottom side
+               if (lev != levmx){                                // finer neighbor
+                  int iilftfiner = iicur-(iicur-iilft)/2;
+                  nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+                  // Also check for finer cell left and top side
+                  if (nlftval < 0) {
+                     int jjtopfiner = (jjcur+jjtop)/2; 
+                     nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+                  }
+               }
+
+               if (nlftval < 0 && iilft >= 0) {  // same size
+                  int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+                  // we have to test for same level or it could be a finer cell one cell away that it is matching
+                  if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev) {
+                     nlftval = nlfttry;
+                  }
+               }
+    
+               if (lev != 0 && nlftval < 0 && iilft-(iicur-iilft) >= 0){      // coarser neighbor
+                  iilft -= iicur-iilft;
+                  int jjlft = (jj/2)*2*levmult-jminsize;
+                  int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+                  // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+                  if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev-1) {
+                    nlftval = nlfttry;
+                  }
+               }
+               if (nlftval >= 0) iborder |= 0x0001;
+            }
+
+            // Test for cell to right
+            if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
+               int nrhtval = -1;
+               // right neighbor -- finer, same size and coarser
+               nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+               // right neighbor -- finer right top test
+               if (nrhtval < 0 && lev != levmx){
+                  int jjtopfiner = (jjcur+jjtop)/2;
+                  nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+               }
+               if (nrhtval < 0 && lev != 0) { // test for coarser, but not directly right
+                  int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
+                  if (jjrhtcoarser != jjcur) {
+                     int nrhttry = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
+                     if (nrhttry-noffset >= 0 && nrhttry-noffset < (int)ncells && level[nrhttry-noffset] == lev-1) {
+                        nrhtval = nrhttry;
+                     }
+                  }
+               }
+               if (nrhtval > 0)  iborder |= 0x0002;
+            }
+
+            // Test for cell to bottom
+            if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
+               int nbotval = -1;
+               // Check for finer cell below and left side
+               if (lev != levmx){                                // finer neighbor
+                  int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                  nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+                  // Also check for finer cell below and right side
+                  if (nbotval < 0) {
+                     int iirhtfiner = (iicur+iirht)/2; 
+                     nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+                  }
+               }
+
+               if (nbotval < 0 && jjbot >= 0) {  // same size
+                  int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+                  // we have to test for same level or it could be a finer cell one cell away that it is matching
+                  if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev) {
+                     nbotval = nbottry;
+                  }
+               }
+    
+               if (lev != 0 && nbotval < 0 && jjbot-(jjcur-jjbot) >= 0){      // coarser neighbor
+                  jjbot -= jjcur-jjbot;
+                  int iibot = (ii/2)*2*levmult-iminsize;
+                  int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+                  // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+                  if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev-1) {
+                    nbotval = nbottry;
+                  }
+               }
+               if (nbotval >= 0) iborder |= 0x0004;
+            }
+
+            // Test for cell to top
+            if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
+               int ntopval = -1;
+               // top neighbor -- finer, same size and coarser
+               ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+               // top neighbor -- finer top right test
+               if (ntopval < 0 && lev != levmx){
+                  int iirhtfiner = (iicur+iirht)/2;
+                  ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+               }
+               if (ntopval < 0 && lev != 0) { // test for coarser, but not directly above
+                  int iitopcoarser = (ii/2)*2*levmult-iminsize;
+                  if (iitopcoarser != iicur) {
+                     int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
+                     if (ntoptry-noffset >= 0 && ntoptry-noffset < (int)ncells && level[ntoptry-noffset] == lev-1) {
+                        ntopval = ntoptry;
+                     }
+                  }
+               }
+               if (ntopval > 0)  iborder |= 0x0008;
+            }
+
+            if (iborder) border_cell_needed_local[ic] = iborder;
+         }
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            for(int ic=0; ic<nbsize_local; ic++){
+               if (border_cell_needed_local[ic] == 0) continue;
+               fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+            }
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+         }
+
+         // Walk through cell array and set hash to border local index plus ncells+noffset for next pass
+         //fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         for(int ic=0; ic<nbsize_local; ic++){
+            if (border_cell_needed_local[ic] == 0) continue;
+            //fprintf(fp,"%d: index %d cell %d i %d j %d\n",mype,ic,border_cell_num_local[ic],border_cell_i_local[ic],border_cell_j_local[ic]);
+            int lev = border_cell_level_local[ic];
+            int levmult = IPOW2(levmx-lev);
+            int ii = border_cell_i_local[ic]*levmult-iminsize;
+            int jj = border_cell_j_local[ic]*levmult-jminsize;
+
+            write_hash(ncells+noffset+ic, jj*(imaxsize-iminsize)+ii, hash);
+         }
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_LAYER1] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            print_local();
+
+            int jmaxglobal = (jmax+1)*IPOW2(levmx);
+            int imaxglobal = (imax+1)*IPOW2(levmx);
+            fprintf(fp,"\n                                    HASH numbering for 1 layer\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if (ii >= iminsize && ii < imaxsize) {
+                        fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+         }
+
+         // Layer 2
+#ifdef _OPENMP
+#pragma omp master
+         {
+#endif
+         for (int ic =0; ic<nbsize_local; ic++){
+            if (border_cell_needed_local[ic] > 0) continue;
+            int jj = border_cell_j_local[ic];
+            int ii = border_cell_i_local[ic];
+            int lev = border_cell_level_local[ic];
+            int levmult = IPOW2(levmx-lev);
+
+            int iicur = ii*levmult-iminsize;
+            int iilft = max( (ii-1)*levmult, 0         )-iminsize;
+            int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+            int jjcur = jj*levmult-jminsize;
+            int jjbot = max( (jj-1)*levmult, 0         )-jminsize;
+            int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+            //fprintf(fp,"            DEBUG layer2 ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
+   
+            int iborder = 0;
+
+            // Test for cell to left
+            if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 &&      (jjcur+jjtop)/2 < jmaxsize-jminsize){
+               // Check for finer cell left and bottom side
+               if (lev != levmx){                                // finer neighbor
+                  int iilftfiner = iicur-(iicur-iilft)/2;
+                  int nl = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+                  if (nl >= (int)(ncells+noffset) && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+                     iborder = 0x0001;
+                  } else {
+                     // Also check for finer cell left and top side
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     int nlt = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+                     if ( nlt >= (int)(ncells+noffset) && (border_cell_needed_local[nlt-ncells-noffset] & 0x0001) == 0x0001) {
+                        iborder = 0x0001;
+                     }
+                  }
+               }
+               if ( (iborder & 0x0001) == 0 && iilft >= 0) { //same size
+                  int nl = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+                  int levcheck = -1;
+                  if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
+                     levcheck = level[nl-noffset];
+                  } else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
+                     levcheck = border_cell_level_local[nl-ncells-noffset];
+                  }
+                  if (nl >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+                     iborder = 0x0001;
+                  } else if (lev != 0 && iilft-(iicur-iilft) >= 0){      // coarser neighbor
+                     iilft -= iicur-iilft;
+                     int jjlft = (jj/2)*2*levmult-jminsize;
+                     nl = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+                     levcheck = -1;
+                     if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
+                        levcheck = level[nl-noffset];
+                     } else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
+                        levcheck = border_cell_level_local[nl-ncells-noffset];
+                     }
+                     // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+                     if (nl  >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+                        iborder = 0x0001;
+                     }
+                  }
+               }
+            }
+
+            // Test for cell to right
+            if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
+               // right neighbor -- finer, same size and coarser
+               int nr = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+               if (nr >= (int)(ncells+noffset) && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
+                  iborder = 0x0002;
+               } else if (lev != levmx){
+                  // right neighbor -- finer right top test
+                  int jjtopfiner = (jjcur+jjtop)/2;
+                  int nrt = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+                  if (nrt >= (int)(ncells+noffset) && (border_cell_needed_local[nrt-ncells-noffset] & 0x0002) == 0x0002) {
+                     iborder = 0x0002;
+                  }
+               }
+               if ( (iborder & 0x0002) == 0  && lev != 0) { // test for coarser, but not directly right
+                  int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
+                  if (jjrhtcoarser != jjcur) {
+                     int nr = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
+                     int levcheck = -1;
+                     if (nr-noffset >= 0 && nr-noffset < (int)ncells) {
+                        levcheck = level[nr-noffset];
+                     } else if (nr >= 0 && (int)(nr-ncells-noffset) >= 0 && (int)(nr-ncells-noffset) < nbsize_local) {
+                        levcheck = border_cell_level_local[nr-ncells-noffset];
+                     }
+                     if (nr >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
+                        iborder = 0x0002;
+                     }
+                  }
+               }
+            }
+
+            // Test for cell to bottom
+            if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
+               // Check for finer cell below and left side
+               if (lev != levmx){                                // finer neighbor
+                  int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                  int nb = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+                  if (nb >= (int)(ncells+noffset) && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+                     iborder = 0x0004;
+                  } else {
+                     // Also check for finer cell below and right side
+                     int iirhtfiner = (iicur+iirht)/2;
+                     int nbr = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+                     if (nbr >= (int)(ncells+noffset) && (border_cell_needed_local[nbr-ncells-noffset] & 0x0004) == 0x0004) {
+                        iborder = 0x0004;
+                     }
+                  }
+               }
+               if ( (iborder & 0x0004) == 0 && jjbot >= 0) { //same size
+                  int nb = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+                  int levcheck = -1;
+                  if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+                     levcheck = level[nb-noffset];
+                  } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+                     levcheck = border_cell_level_local[nb-ncells-noffset];
+                  }
+                  if (nb >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+                     iborder = 0x0004;
+                  } else if (lev != 0 && jjbot-(jjcur-jjbot) >= 0){      // coarser neighbor
+                     jjbot -= jjcur-jjbot;
+                     int iibot = (ii/2)*2*levmult-iminsize;
+                     nb = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+                     levcheck = -1;
+                     if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+                        levcheck = level[nb-noffset];
+                     } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+                        levcheck = border_cell_level_local[nb-ncells-noffset];
+                     }
+                     // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+                     if (nb >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+                        iborder = 0x0004;
+                     }
+                  }
+               }
+            }
+
+            // Test for cell to top
+            if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
+               // top neighbor -- finer, same size and coarser
+               int nt = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+               if (nt  >= (int)(ncells+noffset) && (border_cell_needed_local[nt-ncells-noffset] & 0x0008) == 0x0008) {
+                  iborder = 0x0008;
+               } else if (lev != levmx){
+                  int iirhtfiner = (iicur+iirht)/2;
+                  int ntr = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+                  if ( ntr >= (int)(ncells+noffset) && (border_cell_needed_local[ntr-ncells-noffset] & 0x0008) == 0x0008) {
+                     iborder = 0x0008;
+                  }
+               }
+               if ( (iborder & 0x0008) == 0  && lev != 0) { // test for coarser, but not directly above
+                  int iitopcoarser = (ii/2)*2*levmult-iminsize;
+                  if (iitopcoarser != iicur) {
+                     int nb = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
+                     int levcheck = -1;
+                     if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+                        levcheck = level[nb-noffset];
+                     } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+                        levcheck = border_cell_level_local[nb-ncells-noffset];
+                     }
+                     if (nb-noffset >= (int)(ncells-noffset) && levcheck == lev-1 && (border_cell_needed_local[nb-ncells-noffset] & 0x0008) == 0x0008) {
+                        iborder = 0x0008;
+                     }
+                  }
+               }
+            }
+
+            if (iborder) border_cell_needed_local[ic] = iborder |= 0x0016;
+         }
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+         vector<int> indices_needed;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         if (DEBUG) {
+            for(int ic=0; ic<nbsize_local; ic++){
+               if (border_cell_needed_local[ic] <  0x0016) fprintf(fp,"%d: First  set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+               if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+            }
+         }
+
+         int inew = 0;
+         for(int ic=0; ic<nbsize_local; ic++){
+            if (border_cell_needed_local[ic] <= 0) continue;
+            indices_needed.push_back(border_cell_num_local[ic]);
+
+            border_cell_num_local[inew]    = border_cell_num_local[ic];
+            border_cell_i_local[inew]      = border_cell_i_local[ic];
+            border_cell_j_local[inew]      = border_cell_j_local[ic];
+            border_cell_level_local[inew]  = border_cell_level_local[ic];
+            // border_cell_num_local is not used after -- could be commented out?
+            // border_cell_needed_local[inew] = 1;
+
+            inew++;
+         }
+         nbsize_local = inew;
+
+         free(border_cell_num_local);
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+         // Walk through cell array and set hash to global cell values
+         //fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for(int ic=0; ic<nbsize_local; ic++){
+            int lev = border_cell_level_local[ic];
+            int levmult = IPOW2(levmx-lev);
+
+            int ii = border_cell_i_local[ic]*levmult-iminsize;
+            int jj = border_cell_j_local[ic]*levmult-jminsize;
+
+            write_hash(-(ncells+ic), jj*(imaxsize-iminsize)+ii, hash);
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_LAYER2] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            print_local();
+
+            int jmaxglobal = (jmax+1)*IPOW2(levmx);
+            int imaxglobal = (imax+1)*IPOW2(levmx);
+            fprintf(fp,"\n                                    HASH numbering for 2 layer\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if (ii >= iminsize && ii < imaxsize) {
+                        fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+#ifdef _OPENMP
+         } // end master region
+#endif
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_LAYER_LIST] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         int nghost = nbsize_local;
+         ncells_ghost = ncells + nghost;
+
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         celltype = (int *)mesh_memory.memory_realloc(ncells_ghost, celltype);
+         i        = (int *)mesh_memory.memory_realloc(ncells_ghost, i);
+         j        = (int *)mesh_memory.memory_realloc(ncells_ghost, j);
+         level    = (int *)mesh_memory.memory_realloc(ncells_ghost, level);
+         nlft     = (int *)mesh_memory.memory_realloc(ncells_ghost, nlft);
+         nrht     = (int *)mesh_memory.memory_realloc(ncells_ghost, nrht);
+         nbot     = (int *)mesh_memory.memory_realloc(ncells_ghost, nbot);
+         ntop     = (int *)mesh_memory.memory_realloc(ncells_ghost, ntop);
+         memory_reset_ptrs();
+#ifdef _OPENMP
+         } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for (int ic = ncells; ic < (int)ncells_ghost; ic++){
+            nlft[ic] = -1;
+            nrht[ic] = -1;
+            nbot[ic] = -1;
+            ntop[ic] = -1;
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_COPY_MESH_DATA] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for(int ic=0; ic<nbsize_local; ic++){
+            int ii = border_cell_i_local[ic];
+            int jj = border_cell_j_local[ic];
+            int lev = border_cell_level_local[ic];
+            if (ii < lev_ibegin[lev]) celltype[ncells+ic] = LEFT_BOUNDARY;
+            if (ii > lev_iend[lev])   celltype[ncells+ic] = RIGHT_BOUNDARY;
+            if (jj < lev_jbegin[lev]) celltype[ncells+ic] = BOTTOM_BOUNDARY;
+            if (jj > lev_jend[lev])   celltype[ncells+ic] = TOP_BOUNDARY;
+            i[ncells+ic]     = ii;
+            j[ncells+ic]     = jj;
+            level[ncells+ic] = lev;
+         }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            free(border_cell_i_local);
+            free(border_cell_j_local);
+            free(border_cell_level_local);
+#ifdef _OPENMP
+         } // end master region
+#endif
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_FILL_MESH_GHOST] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               fprintf(fp,"After copying i,j, level to ghost cells\n");
+               print_local();
+#ifdef _OPENMP
+            } // end master region
+#endif
+         }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for (uint ic=0; ic<ncells_ghost; ic++){
+            int ii = i[ic];
+            int jj = j[ic];
+            int lev = level[ic];
+            int levmult = IPOW2(levmx-lev);
+
+            int iicur = ii*levmult-iminsize;
+            int iilft = max( (ii-1)*levmult, 0         )-iminsize;
+            int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+            int jjcur = jj*levmult-jminsize;
+            int jjbot = max( (jj-1)*levmult, 0         )-jminsize;
+            int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+            //fprintf(fp,"DEBUG neigh ic %d nlft %d ii %d levmult %d iminsize %d icheck %d\n",ic,nlft[ic],ii,levmult,iminsize,(max(  ii   *levmult-1, 0))-iminsize);
+
+            int nlftval = nlft[ic];
+            int nrhtval = nrht[ic];
+            int nbotval = nbot[ic];
+            int ntopval = ntop[ic];
+
+            if (nlftval == -1){
+               // Taking care of boundary cells
+               // Force each boundary cell to point to itself on its boundary direction
+               if (iicur <    1*IPOW2(levmx)  -iminsize) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // Boundary cells next to corner boundary need special checks
+               if (iicur ==    1*IPOW2(levmx)-iminsize &&  (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // need to check for finer neighbor first
+               // Right and top neighbor don't change for finer, so drop through to same size
+               // Left and bottom need to be half of same size index for finer test
+               if (lev != levmx) {
+                  int iilftfiner = iicur-(iicur-iilft)/2;
+                  if (nlftval == -1 && iilftfiner >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+               }
+
+               // same size neighbor
+               if (nlftval == -1 && iilft >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+
+               // Now take care of the special case where the bottom and left boundaries need adjustment, since
+               // the expected cell doesn't exist on these boundaries if it is finer than the current cell
+               if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
+                  if (nlftval == -1) {
+                     int iilftfiner = iicur-(iicur-iilft)/2;
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     if (jjtopfiner < jmaxsize-jminsize && iilftfiner >= 0) nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+                  }
+               }
+
+               // coarser neighbor
+               if (lev != 0){
+                  if (nlftval == -1) {
+                     int iilftcoarser = iilft - (iicur-iilft);
+                     int jjlft = (jj/2)*2*levmult-jminsize;
+                     if (iilftcoarser >=0) nlftval = read_hash(jjlft*(imaxsize-iminsize)+iilftcoarser, hash);
+                  }
+               }
+
+               if (nlftval != -1) nlft[ic] = nlftval;
+            }
+
+            if (nrhtval == -1) {
+               // Taking care of boundary cells
+               // Force each boundary cell to point to itself on its boundary direction
+               if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // Boundary cells next to corner boundary need special checks
+               if (iirht == imax*IPOW2(levmx)-iminsize &&  (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // same size neighbor
+               if (nrhtval == -1 && iirht < imaxsize-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+
+               // Now take care of the special case where the bottom and left boundaries need adjustment, since
+               // the expected cell doesn't exist on these boundaries if it is finer than the current cell
+               if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
+                  if (nrhtval == -1) {
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     if (jjtopfiner < jmaxsize-jminsize && iirht < imaxsize-iminsize) nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+                  }
+               }
+
+               // coarser neighbor
+               if (lev != 0){
+                  if (nrhtval == -1) {
+                     int jjrht = (jj/2)*2*levmult-jminsize;
+                     if (iirht < imaxsize-iminsize) nrhtval = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
+                  }
+               }
+               if (nrhtval != -1) nrht[ic] = nrhtval;
+            }
+ 
+            if (nbotval == -1) {
+               // Taking care of boundary cells
+               // Force each boundary cell to point to itself on its boundary direction
+               if (jjcur <    1*IPOW2(levmx)  -jminsize) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+               // Boundary cells next to corner boundary need special checks
+               if (jjcur ==    1*IPOW2(levmx)-jminsize &&  (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // need to check for finer neighbor first
+               // Right and top neighbor don't change for finer, so drop through to same size
+               // Left and bottom need to be half of same size index for finer test
+               if (lev != levmx) {
+                  int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                  if (nbotval == -1 && jjbotfiner >= 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+               }
+
+               // same size neighbor
+               if (nbotval == -1 && jjbot >=0) nbotval = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+
+               // Now take care of the special case where the bottom and left boundaries need adjustment, since
+               // the expected cell doesn't exist on these boundaries if it is finer than the current cell
+               if (iicur < 1*IPOW2(levmx) && lev != levmx) {
+                  if (nbotval == -1) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                     if (jjbotfiner >= 0 && iirhtfiner < imaxsize-iminsize) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+                  }
+               }
+
+               // coarser neighbor
+               if (lev != 0){
+                  if (nbotval == -1) {
+                     int jjbotcoarser = jjbot - (jjcur-jjbot);
+                     int iibot = (ii/2)*2*levmult-iminsize;
+                     if (jjbotcoarser >= 0 && iibot >= 0) nbotval = read_hash(jjbotcoarser*(imaxsize-iminsize)+iibot, hash);
+                  }
+               }
+               if (nbotval != -1) nbot[ic] = nbotval;
+            }
+    
+            if (ntopval == -1) {
+               // Taking care of boundary cells
+               // Force each boundary cell to point to itself on its boundary direction
+               if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+               // Boundary cells next to corner boundary need special checks
+               if (jjtop == jmax*IPOW2(levmx)-jminsize &&  (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+               // same size neighbor
+               if (ntopval == -1 && jjtop < jmaxsize-jminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+   
+               if (iicur < 1*IPOW2(levmx)) {
+                  if (ntopval == -1) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     if (jjtop < jmaxsize-jminsize && iirhtfiner < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+                  }
+               }
+   
+               // coarser neighbor
+               if (lev != 0){
+                  if (ntopval == -1) {
+                     int iitop = (ii/2)*2*levmult-iminsize;
+                     if (jjtop < jmaxsize-jminsize && iitop < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
+                  }
+               }
+               if (ntopval != -1) ntop[ic] = ntopval;
+            }
+ 
+            //fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+         }
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+            fprintf(fp,"After setting neighbors through ghost cells\n");
+            print_local();
+#ifdef _OPENMP
+            } // end master region
+#endif
+         }
+
+/*
+         // Set neighbors to global cell numbers from hash
+         for (uint ic=0; ic<ncells; ic++){
+            ii = i[ic];
+            jj = j[ic];
+            lev = level[ic];
+            levmult = IPOW2(levmx-lev);
+            //fprintf(fp,"%d:Neighbors input for ic %d ii %d jj %d levmult %d lev %d\n",mype,ic, ii, jj, levmult,lev);
+            //fprintf(fp,"%d:Neighbors befor ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+            if (nlft[ic] == -1) nlft[ic] = hash[(      jj   *levmult               )-jminsize][(max(  ii   *levmult-1, 0         ))-iminsize];
+            if (celltype[ic] == BOTTOM_BOUNDARY && nlft[ic] == -1){
+               if (nlft[ic] == -1) nlft[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult,   imaxcalc-1))-iminsize];
+            }
+            if (nrht[ic] == -1) nrht[ic] = hash[(      jj   *levmult               )-jminsize][(min( (ii+1)*levmult,   imaxcalc-1))-iminsize];
+            if (celltype[ic] == BOTTOM_BOUNDARY && nrht[ic] == -1){
+               if (nrht[ic] == -1) nrht[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult,   imaxcalc-1))-iminsize];
+               //if (ic == 3 && mype == 0) printf("DEBUG line %d -- ic %d celltype %d nrht %d\n",__line__,ic,celltype[ic],nrht[ic]);
+               //printf("DEBUG line %d -- ic %d celltype %d nrht %d jj %d ii %d\n",__line__,ic,celltype[ic],nrht[ic],(jj+1)*levmult-jminsize,(min( (ii+1)*levmult,   imaxcalc-1))-iminsize);
+            }
+            if (nbot[ic] == -1) nbot[ic] = hash[(max(  jj   *levmult-1, 0)         )-jminsize][(      ii   *levmult               )-iminsize];
+            if (celltype[ic] == LEFT_BOUNDARY && nbot[ic] == -1){
+               if (nbot[ic] == -1) nbot[ic] = hash[(max(  jj   *levmult-1, 0)         )-jminsize][(      ii   *levmult+1             )-iminsize];
+            }
+            if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult,   jmaxcalc-1))-jminsize][(      ii   *levmult               )-iminsize];
+            if (celltype[ic] == LEFT_BOUNDARY && ntop[ic] == -1){
+               if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult,   jmaxcalc-1))-jminsize][(      ii   *levmult+1             )-iminsize];
+            }
+            //fprintf(fp,"%d:Neighbors after ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+         }
+*/
+
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               fprintf(fp,"After setting corner neighbors\n");
+               print_local();
+#ifdef _OPENMP
+            } // end master region
+#endif
+         }
+
+         // Adjusting neighbors to local indices
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for (uint ic=0; ic<ncells_ghost; ic++){
+            //fprintf(fp,"%d: ic %d nlft %d noffset %d ncells %ld\n",mype,ic,nlft[ic],noffset,ncells);
+            if (nlft[ic] <= -(int)ncells && nlft[ic] > -(int)ncells_ghost){
+               nlft[ic] = abs(nlft[ic]);
+            } else if (nlft[ic] >= noffset && nlft[ic] < (int)(noffset+ncells)) {
+               nlft[ic] -= noffset;
+            }
+            if (nrht[ic] <= -(int)ncells && nrht[ic] > -(int)ncells_ghost){
+               nrht[ic] = abs(nrht[ic]);
+            } else if (nrht[ic] >= noffset && nrht[ic] < (int)(noffset+ncells)) {
+               nrht[ic] -= noffset;
+            }
+            if (nbot[ic] <= -(int)ncells && nbot[ic] > -(int)ncells_ghost){
+               nbot[ic] = abs(nbot[ic]);
+            } else if (nbot[ic] >= noffset && nbot[ic] < (int)(noffset+ncells)) {
+               nbot[ic] -= noffset;
+            }
+            if (ntop[ic] <= -(int)ncells && ntop[ic] > -(int)ncells_ghost){
+               ntop[ic] = abs(ntop[ic]);
+            } else if (ntop[ic] >= noffset && ntop[ic] < (int)(noffset+ncells)) {
+               ntop[ic] -= noffset;
+            }
+         }
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+               fprintf(fp,"After adjusting neighbors to local indices\n");
+               print_local();
+#ifdef _OPENMP
+            } // end master region
+#endif
+         }
+         
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_NEIGH_ADJUST] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+            offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
+            offtile_local_count++;
+            offtile_ratio_local /= offtile_local_count;
+            //printf("%d ncells size is %ld ncells_ghost size is %ld nghost %d\n",mype,ncells,ncells_ghost,nghost);
+            //fprintf(fp,"%d ncells_ghost size is %ld nghost %d\n",mype,ncells_ghost,nghost);
+
+            if (cell_handle) L7_Free(&cell_handle);
+            cell_handle=0;
+
+            if (DEBUG) {
+               fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
+               for (int ig = 0; ig<nghost; ig++){
+                  fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ig,indices_needed[ig]);
+               }
+            }
+            L7_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
+
+            if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_SETUP_COMM] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+         } // end master region
+#endif
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+            print_local();
+
+            int jmaxglobal = (jmax+1)*IPOW2(levmx);
+            int imaxglobal = (imax+1)*IPOW2(levmx);
+            fprintf(fp,"\n                                    HASH numbering\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if (ii >= iminsize && ii < imaxsize) {
+                        fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+                     } else {
+                        fprintf(fp,"     ");
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+
+            fprintf(fp,"\n                                    nlft numbering\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if (ii >= iminsize && ii < imaxsize) {
+                        int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+                        if ( (hashval >= 0 && hashval < (int)ncells) ) {
+                              fprintf(fp,"%5d",nlft[hashval]);
+                        } else {
+                              fprintf(fp,"     ");
+                        }
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+      
+            fprintf(fp,"\n                                    nrht numbering\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if ( ii >= iminsize && ii < imaxsize ) {
+                        int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+                        if ( hashval >= 0 && hashval < (int)ncells ) {
+                           fprintf(fp,"%5d",nrht[hashval]);
+                        } else {
+                           fprintf(fp,"     ");
+                        }
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+
+            fprintf(fp,"\n                                    nbot numbering\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if ( ii >= iminsize && ii < imaxsize ) {
+                        int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+                        if ( hashval >= 0 && hashval < (int)ncells ) {
+                           fprintf(fp,"%5d",nbot[hashval]);
+                        } else {
+                           fprintf(fp,"     ");
+                        }
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+
+            fprintf(fp,"\n                                    ntop numbering\n");
+            for (int jj = jmaxglobal-1; jj>=0; jj--){
+               fprintf(fp,"%2d: %4d:",mype,jj);
+               if (jj >= jminsize && jj < jmaxsize) {
+                  for (int ii = 0; ii<imaxglobal; ii++){
+                     if ( ii >= iminsize && ii < imaxsize ) {
+                        int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+                        if ( hashval >= 0 && hashval < (int)ncells ) {
+                           fprintf(fp,"%5d",ntop[hashval]);
+                        } else {
+                           fprintf(fp,"     ");
+                        }
+                     }
+                  }
+               }
+               fprintf(fp,"\n");
+            }
+            fprintf(fp,"%2d:      ",mype);
+            for (int ii = 0; ii<imaxglobal; ii++){
+               fprintf(fp,"%4d:",ii);
+            }
+            fprintf(fp,"\n");
+      
+#ifdef _OPENMP
+            } // end master region
+#endif
+         } // end DEBUG
+
+         if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+            print_local();
+
+            for (uint ic=0; ic<ncells; ic++){
+               fprintf(fp,"%d: before update ic %d        i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+                   mype,ic,i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+            }
+            int ig=0;
+            for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
+               fprintf(fp,"%d: after  update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+                   mype,ic,indices_needed[ig],i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+            }
+#ifdef _OPENMP
+            } // end master region
+#endif
+         } // end DEBUG
+
+      } // if numpe > 1
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+            {
+#endif
+      write_hash_collision_report();
+      read_hash_collision_report();
+      compact_hash_delete(hash);
+
+#ifdef BOUNDS_CHECK
+      {
+         for (uint ic=0; ic<ncells; ic++){
+            int nl = nlft[ic];
+            if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+            if (level[nl] > level[ic]){
+               int ntl = ntop[nl];
+               if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
+            }
+            int nr = nrht[ic];
+            if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+            if (level[nr] > level[ic]){
+               int ntr = ntop[nr];
+               if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
+            }
+            int nb = nbot[ic];
+            if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+            if (level[nb] > level[ic]){
+               int nrb = nrht[nb];
+               if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
+            }
+            int nt = ntop[ic];
+            if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+            if (level[nt] > level[ic]){
+               int nrt = nrht[nt];
+               if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
+            }
+         }
+      }
+#endif
+
+#ifdef _OPENMP
+            } // end master region
+#pragma omp barrier
+#endif
+
+      } else if (calc_neighbor_type == KDTREE) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         TBounds box;
+         vector<int> index_list(IPOW2(levmx*levmx) );
+
+         int num;
+
+         ibase = 0;
+         calc_spatial_coordinates(ibase);
+
+         kdtree_setup();
+
+         if (TIMING_LEVEL >= 2) {
+            cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         for (uint ic=0; ic<ncells; ic++) {
+
+            //left
+            nlft[ic]  = ic;
+            box.min.x = x[ic]-0.25*dx[ic];
+            box.max.x = x[ic]-0.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nlft[ic]=index_list[0];
+
+            //right
+            nrht[ic]  = ic;
+            box.min.x = x[ic]+1.25*dx[ic];
+            box.max.x = x[ic]+1.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nrht[ic]=index_list[0];
+
+            //bot
+            nbot[ic]  = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]-0.25*dy[ic];
+            box.max.y = y[ic]-0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nbot[ic]=index_list[0];
+
+            //top
+            ntop[ic]  = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]+1.25*dy[ic];
+            box.max.y = y[ic]+1.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) ntop[ic]=index_list[0];
+         }  //  End main loop over cells.
+
+         KDTree_Destroy(&tree);
+
+         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+      } // calc_neighbor_type
+
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Mesh::gpu_calc_neighbors
+ *
+ * Device-side neighbor calculation. Returns immediately when gpu_do_rezone
+ * is not set. Otherwise the routine
+ *   1. (re)creates dev_nlft/dev_nrht/dev_nbot/dev_ntop and runs
+ *      kernel_neighbor_init on them when dev_nlft is NULL or reports fewer
+ *      than ncells elements,
+ *   2. creates a device hash through gpu_compact_hash_init and fills it
+ *      with kernel_hash_setup,
+ *   3. runs kernel_calc_neighbors against that hash,
+ *   4. releases the hash with gpu_compact_hash_delete.
+ * Both hash kernels are waited on through their events before continuing.
+ * Elapsed wall time is added to gpu_timers (converted with a factor of
+ * 1.0e9), and gpu_counters[MESH_COUNTER_CALC_NEIGH] is incremented once
+ * per call.
+ */
+void Mesh::gpu_calc_neighbors(void)
+{
+   if (!gpu_do_rezone) {
+      return;
+   }
+
+   // Host-side description of one kernel argument: its byte size and the
+   // address of the value that is handed to ezcl_set_kernel_arg.
+   struct KernelArg {
+      size_t  nbytes;
+      void   *addr;
+   };
+
+   // timer_total spans the whole routine; timer_phase is restarted between
+   // the hash setup and hash query phases.
+   struct timeval timer_total;
+   struct timeval timer_phase;
+   cpu_timer_start(&timer_total);
+   cpu_timer_start(&timer_phase);
+
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   ++gpu_counters[MESH_COUNTER_CALC_NEIGH];
+
+   // These device buffers are read by the kernels below and must already exist.
+   assert(dev_levtable);
+   assert(dev_level);
+   assert(dev_i);
+   assert(dev_j);
+
+   // Element count requested for each neighbor buffer (ncells scaled by mem_factor).
+   size_t nelem_request = (int)((float)ncells*mem_factor);
+
+   // Work-group size, and the global size rounded up to a whole number of groups.
+   size_t lws = MIN(ncells, TILE_SIZE);
+   size_t gws = ((ncells + lws - 1) / lws) * lws;
+
+   // NOTE(review): ncells is handed to the kernels below as sizeof(cl_int)
+   // bytes taken from its address; it is also cast to int in this routine,
+   // so confirm that its declared type is compatible with that.
+
+   // Allocate when the neighbor buffers are missing or too small for ncells.
+   bool need_neighbor_buffers = (dev_nlft == NULL);
+   if (!need_neighbor_buffers) {
+      need_neighbor_buffers = (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells);
+   }
+
+   if (need_neighbor_buffers) {
+      dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      // Arguments of kernel_neighbor_init, in positional order.
+      KernelArg init_args[] = {
+         { sizeof(cl_int), (void *)&ncells   },   // 0
+         { sizeof(cl_mem), (void *)&dev_nlft },   // 1
+         { sizeof(cl_mem), (void *)&dev_nrht },   // 2
+         { sizeof(cl_mem), (void *)&dev_nbot },   // 3
+         { sizeof(cl_mem), (void *)&dev_ntop },   // 4
+      };
+      const int num_init_args = (int)(sizeof(init_args)/sizeof(init_args[0]));
+      for (int iarg = 0; iarg < num_init_args; iarg++) {
+         ezcl_set_kernel_arg(kernel_neighbor_init, iarg, init_args[iarg].nbytes, init_args[iarg].addr);
+      }
+      ezcl_enqueue_ndrange_kernel(queue, kernel_neighbor_init, 1, NULL, &gws, &lws, NULL);
+   }
+
+   // Extents handed to the hash: (imax+1) and (jmax+1) scaled by IPOW2(levmx).
+   // They appear as imaxsize/jmaxsize in the kernel argument tables below.
+   int iextent = (imax+1)*IPOW2(levmx);
+   int jextent = (jmax+1)*IPOW2(levmx);
+
+   // allow input.c to control hash types and methods
+   int hash_method = (choose_hash_method != METHOD_UNSET) ? choose_hash_method : METHOD_UNSET;
+
+   ulong  hash_table_size = 0;
+   size_t hash_nelem;
+   uint   report_level    = 1;
+   cl_mem dev_hash_header = NULL;
+   cl_mem dev_hash = gpu_compact_hash_init(ncells, iextent, jextent, hash_method, report_level,
+      &hash_table_size, &hash_nelem, &dev_hash_header);
+
+   // ---- Phase 1: fill the hash ------------------------------------------
+   // Arguments of kernel_hash_setup, in positional order.
+   KernelArg setup_args[] = {
+      { sizeof(cl_int), (void *)&ncells          },   //  0  isize
+      { sizeof(cl_int), (void *)&levmx           },   //  1  levmx
+      { sizeof(cl_int), (void *)&iextent         },   //  2  imaxsize
+      { sizeof(cl_mem), (void *)&dev_levtable    },   //  3  levtable
+      { sizeof(cl_mem), (void *)&dev_level       },   //  4  level
+      { sizeof(cl_mem), (void *)&dev_i           },   //  5  i
+      { sizeof(cl_mem), (void *)&dev_j           },   //  6  j
+      { sizeof(cl_mem), (void *)&dev_nlft        },   //  7  nlft
+      { sizeof(cl_mem), (void *)&dev_nrht        },   //  8  nrht
+      { sizeof(cl_mem), (void *)&dev_nbot        },   //  9  nbot
+      { sizeof(cl_mem), (void *)&dev_ntop        },   // 10  ntop
+      { sizeof(cl_mem), (void *)&dev_hash_header },   // 11  hash_header
+      { sizeof(cl_mem), (void *)&dev_hash        },   // 12  hash
+   };
+   const int num_setup_args = (int)(sizeof(setup_args)/sizeof(setup_args[0]));
+   for (int iarg = 0; iarg < num_setup_args; iarg++) {
+      ezcl_set_kernel_arg(kernel_hash_setup, iarg, setup_args[iarg].nbytes, setup_args[iarg].addr);
+   }
+
+   cl_event setup_done;
+   ezcl_enqueue_ndrange_kernel(queue, kernel_hash_setup, 1, NULL, &gws, &lws, &setup_done);
+
+   // Block until the hash has been written, then drop the event.
+   ezcl_wait_for_events(1, &setup_done);
+   ezcl_event_release(setup_done);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(timer_phase)*1.0e9);
+      cpu_timer_start(&timer_phase);
+   }
+
+   // ---- Phase 2: query the hash for neighbors -----------------------------
+   // Arguments of kernel_calc_neighbors, in positional order.
+   KernelArg query_args[] = {
+      { sizeof(cl_int), (void *)&ncells          },   //  0  isize
+      { sizeof(cl_int), (void *)&levmx           },   //  1  levmx
+      { sizeof(cl_int), (void *)&imax            },   //  2  imax
+      { sizeof(cl_int), (void *)&jmax            },   //  3  jmax
+      { sizeof(cl_int), (void *)&iextent         },   //  4  imaxsize
+      { sizeof(cl_int), (void *)&jextent         },   //  5  jmaxsize
+      { sizeof(cl_mem), (void *)&dev_levtable    },   //  6  levtable
+      { sizeof(cl_mem), (void *)&dev_level       },   //  7  level
+      { sizeof(cl_mem), (void *)&dev_i           },   //  8  i
+      { sizeof(cl_mem), (void *)&dev_j           },   //  9  j
+      { sizeof(cl_mem), (void *)&dev_nlft        },   // 10  nlft
+      { sizeof(cl_mem), (void *)&dev_nrht        },   // 11  nrht
+      { sizeof(cl_mem), (void *)&dev_nbot        },   // 12  nbot
+      { sizeof(cl_mem), (void *)&dev_ntop        },   // 13  ntop
+      { sizeof(cl_mem), (void *)&dev_hash_header },   // 14  hash_header
+      { sizeof(cl_mem), (void *)&dev_hash        },   // 15  hash
+   };
+   const int num_query_args = (int)(sizeof(query_args)/sizeof(query_args[0]));
+   for (int iarg = 0; iarg < num_query_args; iarg++) {
+      ezcl_set_kernel_arg(kernel_calc_neighbors, iarg, query_args[iarg].nbytes, query_args[iarg].addr);
+   }
+
+   cl_event query_done;
+   ezcl_enqueue_ndrange_kernel(queue, kernel_calc_neighbors, 1, NULL, &gws, &lws, &query_done);
+
+   ezcl_wait_for_events(1, &query_done);
+   ezcl_event_release(query_done);
+
+   // The hash is only needed for this call; release it and its header.
+   gpu_compact_hash_delete(dev_hash, dev_hash_header);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(timer_phase)*1.0e9);
+   }
+
+   gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(timer_total) * 1.0e9);
+}
+
+
+void Mesh::gpu_calc_neighbors_local(void)
+{
+   if (! gpu_do_rezone) return;
+
+   ulong gpu_hash_table_size =  0;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   struct timeval tstart_lev2;
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+   ncells_ghost = ncells;
+
+   assert(dev_levtable);
+   assert(dev_level);
+   assert(dev_i);
+   assert(dev_j);
+
+   size_t one = 1;
+   cl_mem dev_check = ezcl_malloc(NULL, const_cast<char *>("dev_check"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+   size_t mem_request = (int)((float)ncells*mem_factor);
+   dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+   dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+   dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+   dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+   size_t local_work_size =  64;
+   size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+   size_t block_size     = global_work_size/local_work_size;
+
+   //printf("DEBUG file %s line %d lws = %d gws %d bs %d ncells %d\n",__FILE__,__LINE__,
+   //   local_work_size, global_work_size, block_size, ncells);
+   cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_sizes = ezcl_malloc(NULL, const_cast<char *>("dev_sizes"), &one, sizeof(cl_int4),  CL_MEM_READ_WRITE, 0);
+
+#ifdef BOUNDS_CHECK
+   if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells || 
+       ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
+       ezcl_get_device_mem_nelements(dev_level) < (int)ncells ){
+      printf("%d: Warning ncells %ld size dev_i %d dev_j %d dev_level %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
+   }
+#endif
+
+      /*
+       __kernel void calc_hash_size_cl(
+                          const int   ncells,      // 0
+                          const int   levmx,       // 1
+                 __global       int   *levtable,   // 2
+                 __global       int   *level,      // 3
+                 __global       int   *i,          // 4
+                 __global       int   *j,          // 5
+                 __global       int4  *redscratch, // 6
+                 __global       int4  *sizes,      // 7
+                 __local        int4  *tile)       // 8
+      */
+
+   ezcl_set_kernel_arg(kernel_hash_size, 0, sizeof(cl_int), (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_hash_size, 1, sizeof(cl_int), (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_hash_size, 2, sizeof(cl_mem), (void *)&dev_levtable);
+   ezcl_set_kernel_arg(kernel_hash_size, 3, sizeof(cl_mem), (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_hash_size, 4, sizeof(cl_mem), (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_hash_size, 5, sizeof(cl_mem), (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_hash_size, 6, sizeof(cl_mem), (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_hash_size, 7, sizeof(cl_mem), (void *)&dev_sizes);
+   ezcl_set_kernel_arg(kernel_hash_size, 8, local_work_size*sizeof(cl_int4), NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_size,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+   if (block_size > 1) {
+         /*
+         __kernel void finish_reduction_minmax4_cl(
+           const    int    isize,            // 0
+           __global int4  *redscratch,       // 1
+           __global int4  *sizes,            // 2
+           __local  int4  *tile)             // 3
+         */
+      ezcl_set_kernel_arg(kernel_finish_hash_size, 0, sizeof(cl_int), (void *)&block_size);
+      ezcl_set_kernel_arg(kernel_finish_hash_size, 1, sizeof(cl_mem), (void *)&dev_redscratch);
+      ezcl_set_kernel_arg(kernel_finish_hash_size, 2, sizeof(cl_mem), (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_finish_hash_size, 3, local_work_size*sizeof(cl_int4), NULL);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_hash_size,   1, NULL, &local_work_size, &local_work_size, NULL);
+   }
+
+   ezcl_device_memory_delete(dev_redscratch);
+
+   cl_int sizes[4];
+   ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE,  0, 1*sizeof(cl_int4), &sizes, NULL);
+
+   int imintile = sizes[0];
+   int imaxtile = sizes[1];
+   int jmintile = sizes[2];
+   int jmaxtile = sizes[3];
+
+   // Expand size by 2*coarse_cells for ghost cells
+   // TODO: May want to get fancier here and calc based on cell level
+   int jminsize = max(jmintile-2*IPOW2(levmx),0);
+   int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
+   int iminsize = max(imintile-2*IPOW2(levmx),0);
+   int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
+   //fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+
+   //ezcl_enqueue_write_buffer(command_queue, dev_sizes, CL_TRUE,  0, 1*sizeof(cl_int4), &sizes, NULL);
+
+   int gpu_hash_method       = METHOD_UNSET;
+// allow input.c to control hash types and methods
+   if (choose_hash_method != METHOD_UNSET) gpu_hash_method = choose_hash_method;
+//=========
+
+   size_t hashsize;
+
+   uint hash_report_level = 1;
+   cl_mem dev_hash_header = NULL;
+   cl_mem dev_hash = gpu_compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, gpu_hash_method, hash_report_level, &gpu_hash_table_size, &hashsize, &dev_hash_header);
+
+   int csize = corners_i.size();
+#ifdef BOUNDS_CHECK
+   for (int ic=0; ic<csize; ic++){
+      if (corners_i[ic] >= iminsize) continue;
+      if (corners_j[ic] >= jminsize) continue;
+      if (corners_i[ic] <  imaxsize) continue;
+      if (corners_j[ic] <  jmaxsize) continue;
+      if ( (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) < 0 ||
+           (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) > (int)hashsize){
+         printf("%d: Warning corners i %d j %d hash %d\n",mype,corners_i[ic],corners_j[ic],
+            (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize));
+      }
+   }
+#endif
+
+   size_t corners_local_work_size  = MIN(csize, TILE_SIZE);
+   size_t corners_global_work_size = ((csize+corners_local_work_size - 1) /corners_local_work_size) * corners_local_work_size;
+
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 0, sizeof(cl_int), (void *)&csize);
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 1, sizeof(cl_int), (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 2, sizeof(cl_int), (void *)&imax);
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 3, sizeof(cl_int), (void *)&jmax);
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 4, sizeof(cl_mem), (void *)&dev_levtable);
+   ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 5, sizeof(cl_mem), (void *)&dev_sizes);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_adjust_sizes,   1, NULL, &corners_global_work_size, &corners_local_work_size, NULL);
+
+   if (DEBUG){
+      vector<int> sizes_tmp(4);
+      ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE,  0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
+      int iminsize_tmp = sizes_tmp[0];
+      int imaxsize_tmp = sizes_tmp[1];
+      int jminsize_tmp = sizes_tmp[2];
+      int jmaxsize_tmp = sizes_tmp[3];
+      fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
+   }
+
+   local_work_size = 128;
+   global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+
+#ifdef BOUNDS_CHECK
+   {
+      vector<int> i_tmp(ncells);
+      vector<int> j_tmp(ncells);
+      vector<int> level_tmp(ncells);
+      ezcl_enqueue_read_buffer(command_queue, dev_i,     CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_j,     CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE,  0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+      for (int ic=0; ic<(int)ncells; ic++){
+         int lev = level_tmp[ic];
+         for (   int jj = j_tmp[ic]*IPOW2(levmx-lev)-jminsize; jj < (j_tmp[ic]+1)*IPOW2(levmx-lev)-jminsize; jj++) {
+            for (int ii = i_tmp[ic]*IPOW2(levmx-lev)-iminsize; ii < (i_tmp[ic]+1)*IPOW2(levmx-lev)-iminsize; ii++) {
+               if (jj < 0 || jj >= (jmaxsize-jminsize) || ii < 0 || ii >= (imaxsize-iminsize) ) {
+                  printf("%d: Warning ncell %d writes to hash out-of-bounds at line %d ii %d jj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,ic,__LINE__,ii,jj,iminsize,imaxsize,jminsize,jmaxsize);
+               }
+            }
+         }
+      }
+   }
+#endif
+
+   //printf("%d: lws %d gws %d \n",mype,local_work_size,global_work_size);
+   cl_event hash_setup_local_event;
+
+      /*
+                    const int   isize,           // 0
+                    const int   levmx,           // 1
+                    const int   imax,            // 2
+                    const int   jmax,            // 3
+                    const int   noffset,         // 4
+           __global       int   *sizes,          // 5
+           __global       int   *levtable,       // 6
+           __global       int   *level,          // 7
+           __global       int   *i,              // 8
+           __global       int   *j,              // 9
+           __global const ulong *hash_heaer,     // 10
+           __global       int   *hash)           // 11
+      */
+
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  0, sizeof(cl_int),   (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  1, sizeof(cl_int),   (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  2, sizeof(cl_int),   (void *)&imax);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  3, sizeof(cl_int),   (void *)&jmax);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  4, sizeof(cl_int),   (void *)&noffset);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  5, sizeof(cl_mem),   (void *)&dev_sizes);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  6, sizeof(cl_mem),   (void *)&dev_levtable);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  7, sizeof(cl_mem),   (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  8, sizeof(cl_mem),   (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_hash_setup_local,  9, sizeof(cl_mem),   (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_hash_setup_local, 10, sizeof(cl_mem),   (void *)&dev_hash_header);
+   ezcl_set_kernel_arg(kernel_hash_setup_local, 11, sizeof(cl_mem),   (void *)&dev_hash);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_setup_local,   1, NULL, &global_work_size, &local_work_size, &hash_setup_local_event);
+
+   ezcl_wait_for_events(1, &hash_setup_local_event);
+   ezcl_event_release(hash_setup_local_event);
+
+   if (DEBUG){
+      vector<int> sizes_tmp(4);
+      ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE,  0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
+      int iminsize_tmp = sizes_tmp[0];
+      int imaxsize_tmp = sizes_tmp[1];
+      int jminsize_tmp = sizes_tmp[2];
+      int jmaxsize_tmp = sizes_tmp[3];
+      fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
+   }
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+      cpu_timer_start(&tstart_lev2);
+   }
+
+#ifdef BOUNDS_CHECK
+   {
+      if (ezcl_get_device_mem_nelements(dev_nlft)  < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_nrht)  < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_nbot)  < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_ntop)  < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_i)     < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_j)     < (int)ncells ||
+          ezcl_get_device_mem_nelements(dev_level) < (int)ncells ) {
+         printf("%d: Warning -- sizes for dev_neigh too small ncells %ld neigh %d %d %d %d %d %d %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_nlft),ezcl_get_device_mem_nelements(dev_nrht),ezcl_get_device_mem_nelements(dev_nbot),ezcl_get_device_mem_nelements(dev_ntop),ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
+      }
+      vector<int> level_tmp(ncells);
+      ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+      int iflag = 0;
+      for (int ic=0; ic<ncells; ic++){
+         if (levmx-level_tmp[ic] < 0 || levmx-level_tmp[ic] > levmx) {
+            printf("%d: Warning level value bad ic %d level %d ncells %d\n",mype,ic,level_tmp[ic],ncells);
+            iflag++;
+         }
+      }
+      if (ezcl_get_device_mem_nelements(dev_levtable) < levmx+1) printf("%d Warning levtable too small levmx is %d devtable size is %d\n",mype,levmx,ezcl_get_device_mem_nelements(dev_levtable));
+#ifdef HAVE_MPI
+      if (iflag > 20) {fflush(stdout); L7_Terminate(); exit(0);}
+#endif
+   }
+#endif
+
+#ifdef BOUNDS_CHECK
+   {
+      int jmaxcalc = (jmax+1)*IPOW2(levmx);
+      int imaxcalc = (imax+1)*IPOW2(levmx);
+      vector<int> i_tmp(ncells);
+      vector<int> j_tmp(ncells);
+      vector<int> level_tmp(ncells);
+      vector<int> hash_tmp(hashsize);
+      ezcl_enqueue_read_buffer(command_queue, dev_i,     CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_j,     CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE,  0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_hash,  CL_TRUE,  0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+      for (int ic=0; ic<(int)ncells; ic++){
+         int ii  = i_tmp[ic];
+         int jj  = j_tmp[ic];
+         int lev = level_tmp[ic];
+         int levmult = IPOW2(levmx-lev);
+         int jjj=jj   *levmult-jminsize;
+         int iii=max(  ii   *levmult-1, 0         )-iminsize;
+         if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         jjj=jj   *levmult-jminsize;
+         iii=min( (ii+1)*levmult,   imaxcalc-1)-iminsize;
+         if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         jjj=max(  jj   *levmult-1, 0) -jminsize;
+         iii=ii   *levmult   -iminsize;
+         if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         jjj=min( (jj+1)*levmult,   jmaxcalc-1)-jminsize;
+         iii=ii   *levmult   -iminsize;
+         if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         int nlftval = hash_tmp[((      jj   *levmult               )-jminsize)*(imaxsize-iminsize)+((max(  ii   *levmult-1, 0         ))-iminsize)];
+         int nrhtval = hash_tmp[((      jj   *levmult               )-jminsize)*(imaxsize-iminsize)+((min( (ii+1)*levmult,   imaxcalc-1))-iminsize)];
+         int nbotval = hash_tmp[((max(  jj   *levmult-1, 0)         )-jminsize)*(imaxsize-iminsize)+((      ii   *levmult               )-iminsize)];
+         int ntopval = hash_tmp[((min( (jj+1)*levmult,   jmaxcalc-1))-jminsize)*(imaxsize-iminsize)+((      ii   *levmult               )-iminsize)];
+
+         if (nlftval == INT_MIN){
+            jjj = jj*levmult-jminsize;
+            iii = ii*levmult-iminsize;
+            if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         }
+         if (nrhtval == INT_MIN){
+            jjj = jj*levmult-jminsize;
+            iii = ii*levmult-iminsize;
+            if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         }
+         if (nbotval == INT_MIN) {
+            iii = ii*levmult-iminsize;
+            jjj = jj*levmult-jminsize;
+            if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         }
+         if (ntopval == INT_MIN) {
+            iii = ii*levmult-iminsize;
+            jjj = jj*levmult-jminsize;
+            if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+         }
+      }
+   }
+#endif
+
+   cl_event calc_neighbors_local_event;
+
+      /*
+                    const int   isize,       // 0
+                    const int   levmx,       // 1
+                    const int   imaxsize,    // 2
+                    const int   jmaxsize,    // 3
+                    const int   noffset,     // 4
+           __global       int   *sizes,      // 5
+           __global       int   *levtable,   // 6
+           __global       int   *level,      // 7
+           __global       int   *i,          // 8
+           __global       int   *j,          // 9
+           __global       int   *nlft,       // 10
+           __global       int   *nrht,       // 11
+           __global       int   *nbot,       // 12
+           __global       int   *ntop,       // 13
+           __global const ulong *hash_heaer, // 14
+           __global       int   *hash)       // 15
+      */
+
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 0,  sizeof(cl_int),   (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 1,  sizeof(cl_int),   (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 2,  sizeof(cl_int),   (void *)&imax);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 3,  sizeof(cl_int),   (void *)&jmax);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 4,  sizeof(cl_int),   (void *)&noffset);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 5,  sizeof(cl_mem),   (void *)&dev_sizes);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 6,  sizeof(cl_mem),   (void *)&dev_levtable);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 7,  sizeof(cl_mem),   (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 8,  sizeof(cl_mem),   (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 9,  sizeof(cl_mem),   (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 10, sizeof(cl_mem),   (void *)&dev_nlft);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 11, sizeof(cl_mem),   (void *)&dev_nrht);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 12, sizeof(cl_mem),   (void *)&dev_nbot);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 13, sizeof(cl_mem),   (void *)&dev_ntop);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 14, sizeof(cl_mem),   (void *)&dev_hash_header);
+   ezcl_set_kernel_arg(kernel_calc_neighbors_local, 15, sizeof(cl_mem),   (void *)&dev_hash);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_neighbors_local,   1, NULL, &global_work_size, &local_work_size, &calc_neighbors_local_event);
+
+   ezcl_wait_for_events(1, &calc_neighbors_local_event);
+   ezcl_event_release(calc_neighbors_local_event);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+      cpu_timer_start(&tstart_lev2);
+   }
+
+   if (DEBUG) {
+      print_dev_local();
+
+      vector<int> hash_tmp(hashsize);
+      ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+      cl_mem dev_hash_header_check = gpu_get_hash_header();
+      vector<ulong> hash_header_check(hash_header_size);
+      ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+      int   gpu_hash_method     = (int)hash_header_check[0];
+      ulong gpu_hash_table_size =      hash_header_check[1];
+      ulong gpu_AA              =      hash_header_check[2];
+      ulong gpu_BB              =      hash_header_check[3];
+
+      vector<int> nlft_tmp(ncells_ghost);
+      vector<int> nrht_tmp(ncells_ghost);
+      vector<int> nbot_tmp(ncells_ghost);
+      vector<int> ntop_tmp(ncells_ghost);
+      ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+      int jmaxglobal = (jmax+1)*IPOW2(levmx);
+      int imaxglobal = (imax+1)*IPOW2(levmx);
+      fprintf(fp,"\n                                    HASH 0 numbering\n");
+      for (int jj = jmaxglobal-1; jj>=0; jj--){
+         fprintf(fp,"%2d: %4d:",mype,jj);
+         if (jj >= jminsize && jj < jmaxsize) {
+            for (int ii = 0; ii<imaxglobal; ii++){
+               if (ii >= iminsize && ii < imaxsize) {
+                  fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+               } else {
+                  fprintf(fp,"     ");
+               }
+            }
+         }
+         fprintf(fp,"\n");
+      }
+      fprintf(fp,"%2d:      ",mype);
+      for (int ii = 0; ii<imaxglobal; ii++){
+         fprintf(fp,"%4d:",ii);
+      }
+      fprintf(fp,"\n");
+
+      fprintf(fp,"\n                                    nlft numbering\n");
+      for (int jj = jmaxglobal-1; jj>=0; jj--){
+         fprintf(fp,"%2d: %4d:",mype,jj);
+         if (jj >= jminsize && jj < jmaxsize) {
+            for (int ii = 0; ii<imaxglobal; ii++){
+               if (ii >= iminsize && ii < imaxsize) {
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+                  if (hashval >= 0 && hashval < (int)ncells) {
+                     fprintf(fp,"%5d",nlft_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               } else {
+                  fprintf(fp,"     ");
+               }
+            }
+         }
+         fprintf(fp,"\n");
+      }
+      fprintf(fp,"%2d:      ",mype);
+      for (int ii = 0; ii<imaxglobal; ii++){
+         fprintf(fp,"%4d:",ii);
+      }
+      fprintf(fp,"\n");
+   
+      fprintf(fp,"\n                                    nrht numbering\n");
+      for (int jj = jmaxglobal-1; jj>=0; jj--){
+         fprintf(fp,"%2d: %4d:",mype,jj);
+         if (jj >= jminsize && jj < jmaxsize) {
+            for (int ii = 0; ii<imaxglobal; ii++){
+               if (ii >= iminsize && ii < imaxsize) {
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+                  if (hashval >= 0 && hashval < (int)ncells) {
+                     fprintf(fp,"%5d",nrht_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               } else {
+                  fprintf(fp,"     ");
+               }
+            }
+         }
+         fprintf(fp,"\n");
+      }
+      fprintf(fp,"%2d:      ",mype);
+      for (int ii = 0; ii<imaxglobal; ii++){
+         fprintf(fp,"%4d:",ii);
+      }
+      fprintf(fp,"\n");
+
+      fprintf(fp,"\n                                    nbot numbering\n");
+      for (int jj = jmaxglobal-1; jj>=0; jj--){
+         fprintf(fp,"%2d: %4d:",mype,jj);
+         if (jj >= jminsize && jj < jmaxsize) {
+            for (int ii = 0; ii<imaxglobal; ii++){
+               if (ii >= iminsize && ii < imaxsize) {
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+                  if (hashval >= 0 && hashval < (int)ncells) {
+                     fprintf(fp,"%5d",nbot_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               } else {
+                  fprintf(fp,"     ");
+               }
+            }
+         }
+         fprintf(fp,"\n");
+      }
+      fprintf(fp,"%2d:      ",mype);
+      for (int ii = 0; ii<imaxglobal; ii++){
+         fprintf(fp,"%4d:",ii);
+      }
+      fprintf(fp,"\n");
+
+      fprintf(fp,"\n                                    ntop numbering\n");
+      for (int jj = jmaxglobal-1; jj>=0; jj--){
+         fprintf(fp,"%2d: %4d:",mype,jj);
+         if (jj >= jminsize && jj < jmaxsize) {
+            for (int ii = 0; ii<imaxglobal; ii++){
+               if (ii >= iminsize && ii < imaxsize) {
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+                  if (hashval >= 0 && hashval < (int)ncells) {
+                     fprintf(fp,"%5d",ntop_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               } else {
+                  fprintf(fp,"     ");
+               }
+            }
+         }
+         fprintf(fp,"\n");
+      }
+      fprintf(fp,"%2d:      ",mype);
+      for (int ii = 0; ii<imaxglobal; ii++){
+         fprintf(fp,"%4d:",ii);
+      }
+      fprintf(fp,"\n");
+   }
+
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+         vector<int> iminsize_global(numpe);
+         vector<int> imaxsize_global(numpe);
+         vector<int> jminsize_global(numpe);
+         vector<int> jmaxsize_global(numpe);
+         vector<int> comm_partner(numpe,-1);
+
+         MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+         MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+         int num_comm_partners = 0; 
+         for (int ip = 0; ip < numpe; ip++){
+            if (ip == mype) continue;
+            if (iminsize_global[ip] > imaxtile) continue;
+            if (imaxsize_global[ip] < imintile) continue;
+            if (jminsize_global[ip] > jmaxtile) continue;
+            if (jmaxsize_global[ip] < jmintile) continue;
+            comm_partner[num_comm_partners] = ip;
+            num_comm_partners++;
+            //if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
+         }    
+
+#ifdef BOUNDS_CHECK
+      {
+         vector<int> nlft_tmp(ncells_ghost);
+         vector<int> nrht_tmp(ncells_ghost);
+         vector<int> nbot_tmp(ncells_ghost);
+         vector<int> ntop_tmp(ncells_ghost);
+         ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells*sizeof(cl_int), &nlft_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells*sizeof(cl_int), &nrht_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells*sizeof(cl_int), &nbot_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells*sizeof(cl_int), &ntop_tmp[0], NULL);
+         for (uint ic=0; ic<ncells; ic++){
+            int nl = nlft_tmp[ic];
+            if (nl != -1){
+               nl -= noffset;
+               if (nl<0 || nl>= ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+            }
+            int nr = nrht_tmp[ic];
+            if (nr != -1){
+               nr -= noffset;
+               if (nr<0 || nr>= ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+            }
+            int nb = nbot_tmp[ic];
+            if (nb != -1){
+               nb -= noffset;
+               if (nb<0 || nb>= ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+            }
+            int nt = ntop_tmp[ic];
+            if (nt != -1){
+               nt -= noffset;
+               if (nt<0 || nt>= ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+            }
+         }
+      }
+#endif
+
+      cl_mem dev_border_cell = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell1"), &ncells, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 0,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 1,  sizeof(cl_int), (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 2,  sizeof(cl_mem), (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 3,  sizeof(cl_mem), (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 4,  sizeof(cl_mem), (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 5,  sizeof(cl_mem), (void *)&dev_ntop);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 6,  sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_calc_border_cells, 7,  sizeof(cl_mem), (void *)&dev_border_cell);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells, 1, NULL, &global_work_size, &local_work_size, NULL); 
+
+      cl_mem dev_border_cell_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell2"), &ncells, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+      size_t one = 1;
+      cl_mem dev_nbsize = ezcl_malloc(NULL, const_cast<char *>("dev_nbsize"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  0,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  1,  sizeof(cl_int), (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  2,  sizeof(cl_mem), (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  3,  sizeof(cl_mem), (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  4,  sizeof(cl_mem), (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  5,  sizeof(cl_mem), (void *)&dev_ntop);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  6,  sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  7,  sizeof(cl_mem), (void *)&dev_border_cell);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  8,  sizeof(cl_mem), (void *)&dev_border_cell_new);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2,  9,  sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2, 10,  sizeof(cl_mem), (void *)&dev_nbsize);
+      ezcl_set_kernel_arg(kernel_calc_border_cells2, 11,  local_work_size*sizeof(cl_int), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells2, 1, NULL, &global_work_size, &local_work_size, NULL); 
+
+      ezcl_device_memory_swap(&dev_border_cell, &dev_border_cell_new);
+      ezcl_device_memory_delete(dev_border_cell_new);
+
+      int group_size = (int)(global_work_size/local_work_size);
+
+      ezcl_set_kernel_arg(kernel_finish_scan, 0,  sizeof(cl_int), (void *)&group_size);
+      ezcl_set_kernel_arg(kernel_finish_scan, 1,  sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_finish_scan, 2,  sizeof(cl_mem), (void *)&dev_nbsize);
+      ezcl_set_kernel_arg(kernel_finish_scan, 3,  local_work_size*sizeof(cl_int), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &local_work_size, &local_work_size, NULL); 
+
+      int nbsize_local;
+      ezcl_enqueue_read_buffer(command_queue, dev_nbsize, CL_TRUE,  0, 1*sizeof(cl_int), &nbsize_local, NULL);
+      ezcl_device_memory_delete(dev_nbsize);
+
+      //printf("%d: border cell size is %d global is %ld\n",mype,nbsize_local,nbsize_global);
+
+      vector<int> border_cell_num(nbsize_local);
+      vector<int> border_cell_i(nbsize_local);
+      vector<int> border_cell_j(nbsize_local);
+      vector<int> border_cell_level(nbsize_local);
+    
+      // allocate new border memory
+      size_t nbsize_long = nbsize_local;
+      cl_mem dev_border_cell_i     = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"),     &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_j     = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"),     &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_num   = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"),   &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      ezcl_set_kernel_arg(kernel_get_border_data,  0,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_get_border_data,  1,  sizeof(cl_int), (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_get_border_data,  2,  sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_get_border_data,  3,  sizeof(cl_mem), (void *)&dev_border_cell);
+      ezcl_set_kernel_arg(kernel_get_border_data,  4,  sizeof(cl_mem), (void *)&dev_i);
+      ezcl_set_kernel_arg(kernel_get_border_data,  5,  sizeof(cl_mem), (void *)&dev_j);
+      ezcl_set_kernel_arg(kernel_get_border_data,  6,  sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_get_border_data,  7,  sizeof(cl_mem), (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_get_border_data,  8,  sizeof(cl_mem), (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_get_border_data,  9,  sizeof(cl_mem), (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_get_border_data, 10,  sizeof(cl_mem), (void *)&dev_border_cell_num);
+      ezcl_set_kernel_arg(kernel_get_border_data, 11,  local_work_size*sizeof(cl_uint), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data, 1, NULL, &global_work_size, &local_work_size, NULL); 
+
+      ezcl_device_memory_delete(dev_ioffset);
+      ezcl_device_memory_delete(dev_border_cell);
+
+      // read gpu border cell data
+      ezcl_enqueue_read_buffer(command_queue, dev_border_cell_i,     CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i[0],     NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_border_cell_j,     CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j[0],     NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_border_cell_level, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_level[0], NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_border_cell_num,   CL_TRUE,  0, nbsize_local*sizeof(cl_int), &border_cell_num[0],   NULL);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_FIND_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      // Allocate push database
+
+      int **send_database = (int**)malloc(num_comm_partners*sizeof(int *));
+      for (int ip = 0; ip < num_comm_partners; ip++){
+         send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
+      }
+
+      // Compute the overlap between processor bounding boxes and set up push database
+
+      vector<int> send_buffer_count(num_comm_partners);
+      for (int ip = 0; ip < num_comm_partners; ip++){
+         int icount = 0;
+         for (int ib = 0; ib <nbsize_local; ib++){
+            int lev = border_cell_level[ib];
+            int levmult = IPOW2(levmx-lev);
+            if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] && 
+                border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] && 
+                border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] && 
+                border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
+               send_database[ip][icount] = ib;
+               icount++;
+            }
+         }
+         send_buffer_count[ip]=icount;
+      }
+
+      // Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and 
+      // send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
+      // It will return receive_count_total for use in allocations
+
+      int receive_count_total;
+      int i_push_handle = 0;
+      L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
+                    send_database, &receive_count_total, &i_push_handle);
+
+      if (DEBUG) {
+         fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
+         for (int ip = 0; ip < num_comm_partners; ip++){
+            fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
+            for (int ic = 0; ic < send_buffer_count[ip]; ic++){
+               int ib = send_database[ip][ic];
+               fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
+                  border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+            }
+         }
+      }
+
+      // Can now free the send database. Other arrays are vectors and will automatically 
+      // deallocate
+
+      for (int ip = 0; ip < num_comm_partners; ip++){
+         free(send_database[ip]);
+      }
+      free(send_database);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_PUSH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+      // Push the data needed to the adjacent processors
+
+      int *border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
+      int *border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
+      int *border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
+      int *border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
+      L7_Push_Update(&border_cell_num[0],   border_cell_num_local,   i_push_handle);
+      L7_Push_Update(&border_cell_i[0],     border_cell_i_local,     i_push_handle);
+      L7_Push_Update(&border_cell_j[0],     border_cell_j_local,     i_push_handle);
+      L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
+
+      L7_Push_Free(&i_push_handle);
+
+      ezcl_device_memory_delete(dev_border_cell_i);
+      ezcl_device_memory_delete(dev_border_cell_j);
+      ezcl_device_memory_delete(dev_border_cell_level);
+      ezcl_device_memory_delete(dev_border_cell_num);
+
+      nbsize_local = receive_count_total;
+
+      if (DEBUG) {
+         for (int ic = 0; ic < nbsize_local; ic++) {
+            fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
+               border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
+         }
+      }
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_PUSH_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      nbsize_long = nbsize_local;
+
+      dev_border_cell_num        = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"),        &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      dev_border_cell_i          = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"),          &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      dev_border_cell_j          = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"),          &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      dev_border_cell_level      = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"),      &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_needed     = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed"),     &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_needed_out = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed_out"), &nbsize_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+      ezcl_enqueue_write_buffer(command_queue, dev_border_cell_num,    CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_num_local[0], NULL);
+      ezcl_enqueue_write_buffer(command_queue, dev_border_cell_i,      CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i_local[0],   NULL);
+      ezcl_enqueue_write_buffer(command_queue, dev_border_cell_j,      CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j_local[0],   NULL);
+      ezcl_enqueue_write_buffer(command_queue, dev_border_cell_level,  CL_TRUE,  0, nbsize_local*sizeof(cl_int), &border_cell_level_local[0],   NULL);
+
+      //ezcl_enqueue_write_buffer(command_queue, dev_border_cell_needed, CL_TRUE,  0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0],   NULL);
+
+      free(border_cell_i_local);
+      free(border_cell_j_local);
+      free(border_cell_level_local);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_LOCAL_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      if (DEBUG) {
+         vector<int> hash_tmp(hashsize);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE,  0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+         cl_mem dev_hash_header_check = gpu_get_hash_header();
+         vector<ulong> hash_header_check(hash_header_size);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+         int   gpu_hash_method     = (int)hash_header_check[0];
+         ulong gpu_hash_table_size =      hash_header_check[1];
+         ulong gpu_AA              =      hash_header_check[2];
+         ulong gpu_BB              =      hash_header_check[3];
+
+         int jmaxglobal = (jmax+1)*IPOW2(levmx);
+         int imaxglobal = (imax+1)*IPOW2(levmx);
+         fprintf(fp,"\n                                    HASH numbering before layer 1\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+      }
+
+      size_t nb_local_work_size = 128;
+      size_t nb_global_work_size = ((nbsize_local + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
+
+      ezcl_set_kernel_arg(kernel_calc_layer1,  0,  sizeof(cl_int),   (void *)&nbsize_local);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  1,  sizeof(cl_int),   (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  2,  sizeof(cl_int),   (void *)&levmx);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  3,  sizeof(cl_int),   (void *)&imax);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  4,  sizeof(cl_int),   (void *)&jmax);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  5,  sizeof(cl_int),   (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  6,  sizeof(cl_mem),   (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  7,  sizeof(cl_mem),   (void *)&dev_levtable);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  8,  sizeof(cl_mem),   (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_calc_layer1,  9,  sizeof(cl_mem),   (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_calc_layer1, 10,  sizeof(cl_mem),   (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_calc_layer1, 11,  sizeof(cl_mem),   (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_calc_layer1, 12,  sizeof(cl_mem),   (void *)&dev_border_cell_needed);
+      ezcl_set_kernel_arg(kernel_calc_layer1, 13,  sizeof(cl_mem),   (void *)&dev_hash_header);
+      ezcl_set_kernel_arg(kernel_calc_layer1, 14,  sizeof(cl_mem),   (void *)&dev_hash);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL); 
+
+      if (DEBUG){
+         vector<int> border_cell_needed_local(nbsize_local);
+
+         ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed, CL_TRUE,  0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0],   NULL);
+
+         for(int ic=0; ic<nbsize_local; ic++){
+            if (border_cell_needed_local[ic] == 0) continue;
+            fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+         }
+      }
+
+      cl_event calc_layer1_sethash_event;
+
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  0,  sizeof(cl_int),   (void *)&nbsize_local);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  1,  sizeof(cl_int),   (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  2,  sizeof(cl_int),   (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  3,  sizeof(cl_int),   (void *)&levmx);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  4,  sizeof(cl_mem),   (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  5,  sizeof(cl_mem),   (void *)&dev_levtable);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  6,  sizeof(cl_mem),   (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  7,  sizeof(cl_mem),   (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  8,  sizeof(cl_mem),   (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash,  9,  sizeof(cl_mem),   (void *)&dev_border_cell_needed);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 10,  sizeof(cl_mem),   (void *)&dev_hash_header);
+      ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 11,  sizeof(cl_mem),   (void *)&dev_hash);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1_sethash, 1, NULL, &nb_global_work_size, &nb_local_work_size, &calc_layer1_sethash_event); 
+
+      ezcl_wait_for_events(1, &calc_layer1_sethash_event);
+      ezcl_event_release(calc_layer1_sethash_event);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_LAYER1] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      if (DEBUG) {
+         print_dev_local();
+
+         vector<int> hash_tmp(hashsize);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE,  0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+         cl_mem dev_hash_header_check = gpu_get_hash_header();
+         vector<ulong> hash_header_check(hash_header_size);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+         int   gpu_hash_method     = (int)hash_header_check[0];
+         ulong gpu_hash_table_size =      hash_header_check[1];
+         ulong gpu_AA              =      hash_header_check[2];
+         ulong gpu_BB              =      hash_header_check[3];
+
+         int jmaxglobal = (jmax+1)*IPOW2(levmx);
+         int imaxglobal = (imax+1)*IPOW2(levmx);
+         fprintf(fp,"\n                                    HASH numbering for 1 layer\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+      }
+
+      group_size = (int)(nb_global_work_size/nb_local_work_size);
+
+      cl_mem dev_nbpacked = ezcl_malloc(NULL, const_cast<char *>("dev_nbpacked"), &one, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      size_t group_size_long = group_size;
+      dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &group_size_long, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+      ezcl_set_kernel_arg(kernel_calc_layer2,  0,  sizeof(cl_int),   (void *)&nbsize_local);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  1,  sizeof(cl_int),   (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  2,  sizeof(cl_int),   (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  3,  sizeof(cl_int),   (void *)&levmx);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  4,  sizeof(cl_int),   (void *)&imax);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  5,  sizeof(cl_int),   (void *)&jmax);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  6,  sizeof(cl_mem),   (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  7,  sizeof(cl_mem),   (void *)&dev_levtable);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  8,  sizeof(cl_mem),   (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_calc_layer2,  9,  sizeof(cl_mem),   (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 10,  sizeof(cl_mem),   (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 11,  sizeof(cl_mem),   (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 12,  sizeof(cl_mem),   (void *)&dev_border_cell_needed);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 13,  sizeof(cl_mem),   (void *)&dev_border_cell_needed_out);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 14,  sizeof(cl_mem),   (void *)&dev_hash_header);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 15,  sizeof(cl_mem),   (void *)&dev_hash);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 16,  sizeof(cl_mem),   (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 17,  sizeof(cl_mem),   (void *)&dev_nbpacked);
+      ezcl_set_kernel_arg(kernel_calc_layer2, 18,  nb_local_work_size*sizeof(cl_mem), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL); 
+
+      if (DEBUG){
+         vector<int> border_cell_needed_local(nbsize_local);
+
+         ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed_out, CL_TRUE,  0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0],   NULL);
+         for(int ic=0; ic<nbsize_local; ic++){
+            if (border_cell_needed_local[ic] <= 0) continue;
+            if (border_cell_needed_local[ic] <  0x0016) fprintf(fp,"%d: First  set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+            if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+         }
+      }
+
+      free(border_cell_num_local);
+
+      ezcl_device_memory_delete(dev_border_cell_needed);
+
+      ezcl_set_kernel_arg(kernel_finish_scan, 0,  sizeof(cl_int), (void *)&group_size);
+      ezcl_set_kernel_arg(kernel_finish_scan, 1,  sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_finish_scan, 2,  sizeof(cl_mem), (void *)&dev_nbpacked);
+      ezcl_set_kernel_arg(kernel_finish_scan, 3,  nb_local_work_size*sizeof(cl_int), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &nb_local_work_size, &nb_local_work_size, NULL); 
+
+      int nbpacked;
+      ezcl_enqueue_read_buffer(command_queue, dev_nbpacked, CL_TRUE,  0, 1*sizeof(cl_int), &nbpacked, NULL);
+      ezcl_device_memory_delete(dev_nbpacked);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_LAYER2] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      nbsize_long = nbsize_local;
+      cl_mem dev_border_cell_i_new     = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i_new"),     &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_j_new     = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j_new"),     &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_border_cell_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      cl_mem dev_indices_needed    = ezcl_malloc(NULL, const_cast<char *>("dev_indices_needed"),    &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      cl_event get_border_data2_event;
+
+      ezcl_set_kernel_arg(kernel_get_border_data2,  0,  sizeof(cl_int), (void *)&nbsize_local);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  1,  sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  2,  sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  3,  sizeof(cl_mem), (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  4,  sizeof(cl_mem), (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  5,  sizeof(cl_mem), (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  6,  sizeof(cl_mem), (void *)&dev_border_cell_num);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  7,  sizeof(cl_mem), (void *)&dev_border_cell_i_new);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  8,  sizeof(cl_mem), (void *)&dev_border_cell_j_new);
+      ezcl_set_kernel_arg(kernel_get_border_data2,  9,  sizeof(cl_mem), (void *)&dev_border_cell_level_new);
+      ezcl_set_kernel_arg(kernel_get_border_data2, 10,  sizeof(cl_mem), (void *)&dev_indices_needed);
+      ezcl_set_kernel_arg(kernel_get_border_data2, 11,  local_work_size*sizeof(cl_uint), NULL);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data2, 1, NULL, &nb_global_work_size, &nb_local_work_size, &get_border_data2_event);
+
+      ezcl_device_memory_delete(dev_border_cell_num);
+
+      ezcl_device_memory_swap(&dev_border_cell_i,     &dev_border_cell_i_new);
+      ezcl_device_memory_swap(&dev_border_cell_j,     &dev_border_cell_j_new);
+      ezcl_device_memory_swap(&dev_border_cell_level, &dev_border_cell_level_new);
+
+      size_t nbp_local_work_size = 128;
+      size_t nbp_global_work_size = ((nbpacked + nbp_local_work_size - 1) /nbp_local_work_size) * nbp_local_work_size;
+
+      cl_event calc_layer2_sethash_event;
+
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  0,  sizeof(cl_int),   (void *)&nbpacked);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  1,  sizeof(cl_int),   (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  2,  sizeof(cl_int),   (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  3,  sizeof(cl_int),   (void *)&levmx);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  4,  sizeof(cl_int),   (void *)&imax);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  5,  sizeof(cl_int),   (void *)&jmax);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  6,  sizeof(cl_mem),   (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  7,  sizeof(cl_mem),   (void *)&dev_levtable);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  8,  sizeof(cl_mem),   (void *)&dev_levibeg);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash,  9,  sizeof(cl_mem),   (void *)&dev_leviend);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 10,  sizeof(cl_mem),   (void *)&dev_levjbeg);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 11,  sizeof(cl_mem),   (void *)&dev_levjend);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 12,  sizeof(cl_mem),   (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 13,  sizeof(cl_mem),   (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 14,  sizeof(cl_mem),   (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 15,  sizeof(cl_mem),   (void *)&dev_indices_needed);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 16,  sizeof(cl_mem),   (void *)&dev_border_cell_needed_out);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 17,  sizeof(cl_mem),   (void *)&dev_hash_header);
+      ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 18,  sizeof(cl_mem),   (void *)&dev_hash);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2_sethash, 1, NULL, &nbp_global_work_size, &nbp_local_work_size, &calc_layer2_sethash_event); 
+
+      ezcl_wait_for_events(1, &calc_layer2_sethash_event);
+      ezcl_event_release(calc_layer2_sethash_event);
+
+      ezcl_device_memory_delete(dev_ioffset);
+
+      ezcl_wait_for_events(1, &get_border_data2_event);
+      ezcl_event_release(get_border_data2_event);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_LAYER_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      vector<int> indices_needed(nbpacked);
+
+      // read gpu border cell data 
+      ezcl_enqueue_read_buffer(command_queue, dev_indices_needed,    CL_TRUE,  0, nbpacked*sizeof(cl_int), &indices_needed[0],    NULL);
+
+      ezcl_device_memory_delete(dev_border_cell_i_new);
+      ezcl_device_memory_delete(dev_border_cell_j_new);
+      ezcl_device_memory_delete(dev_border_cell_level_new);
+
+      if (DEBUG) {
+         print_dev_local();
+
+         vector<int> hash_tmp(hashsize);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE,  0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+         cl_mem dev_hash_header_check = gpu_get_hash_header();
+         vector<ulong> hash_header_check(hash_header_size);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+         int   gpu_hash_method     = (int)hash_header_check[0];
+         ulong gpu_hash_table_size =      hash_header_check[1];
+         ulong gpu_AA              =      hash_header_check[2];
+         ulong gpu_BB              =      hash_header_check[3];
+
+         int jmaxglobal = (jmax+1)*IPOW2(levmx);
+         int imaxglobal = (imax+1)*IPOW2(levmx);
+         fprintf(fp,"\n                                    HASH numbering for 2 layer\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+         fflush(fp);
+      }
+
+      ezcl_device_memory_delete(dev_border_cell_needed_out);
+
+      int nghost = nbpacked;
+      ncells_ghost = ncells + nghost;
+
+      //if (mype == 1) printf("%d: DEBUG before expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+      if (ezcl_get_device_mem_capacity(dev_celltype) < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_i)        < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_j)        < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_level)    < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_nlft)     < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_nrht)     < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_nbot)     < ncells_ghost ||
+          ezcl_get_device_mem_capacity(dev_ntop)     < ncells_ghost ) {
+
+         //if (mype == 0) printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+         //printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+         mem_factor = (float)(ncells_ghost/ncells);
+         cl_mem dev_celltype_old = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_i_old        = ezcl_malloc(NULL, const_cast<char *>("dev_i_old"),        &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_j_old        = ezcl_malloc(NULL, const_cast<char *>("dev_j_old"),        &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_level_old    = ezcl_malloc(NULL, const_cast<char *>("dev_level_old"),    &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_nlft_old     = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_old"),     &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_nrht_old     = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_old"),     &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_nbot_old     = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_old"),     &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         cl_mem dev_ntop_old     = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_old"),     &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+         ezcl_device_memory_swap(&dev_celltype_old, &dev_celltype);
+         ezcl_device_memory_swap(&dev_i_old,        &dev_i       );
+         ezcl_device_memory_swap(&dev_j_old,        &dev_j       );
+         ezcl_device_memory_swap(&dev_level_old,    &dev_level   );
+         ezcl_device_memory_swap(&dev_nlft_old,     &dev_nlft    );
+         ezcl_device_memory_swap(&dev_nrht_old,     &dev_nrht    );
+         ezcl_device_memory_swap(&dev_nbot_old,     &dev_nbot    );
+         ezcl_device_memory_swap(&dev_ntop_old,     &dev_ntop    );
+
+         cl_event copy_mesh_data_event;
+
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 0,  sizeof(cl_int), (void *)&ncells);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 1,  sizeof(cl_mem), (void *)&dev_celltype_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 2,  sizeof(cl_mem), (void *)&dev_celltype);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 3,  sizeof(cl_mem), (void *)&dev_i_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 4,  sizeof(cl_mem), (void *)&dev_i);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 5,  sizeof(cl_mem), (void *)&dev_j_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 6,  sizeof(cl_mem), (void *)&dev_j);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 7,  sizeof(cl_mem), (void *)&dev_level_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 8,  sizeof(cl_mem), (void *)&dev_level);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 9,  sizeof(cl_mem), (void *)&dev_nlft_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 10, sizeof(cl_mem), (void *)&dev_nlft);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 11, sizeof(cl_mem), (void *)&dev_nrht_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 12, sizeof(cl_mem), (void *)&dev_nrht);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 13, sizeof(cl_mem), (void *)&dev_nbot_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 14, sizeof(cl_mem), (void *)&dev_nbot);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 15, sizeof(cl_mem), (void *)&dev_ntop_old);
+         ezcl_set_kernel_arg(kernel_copy_mesh_data, 16, sizeof(cl_mem), (void *)&dev_ntop);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_mesh_data,   1, NULL, &global_work_size, &local_work_size, &copy_mesh_data_event);
+
+         ezcl_device_memory_delete(dev_celltype_old);
+         ezcl_device_memory_delete(dev_i_old);
+         ezcl_device_memory_delete(dev_j_old);
+         ezcl_device_memory_delete(dev_level_old);
+         ezcl_device_memory_delete(dev_nlft_old);
+         ezcl_device_memory_delete(dev_nrht_old);
+         ezcl_device_memory_delete(dev_nbot_old);
+         ezcl_device_memory_delete(dev_ntop_old);
+
+         ezcl_wait_for_events(1, &copy_mesh_data_event);
+         ezcl_event_release(copy_mesh_data_event);
+      }
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_COPY_MESH_DATA] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      nb_global_work_size = ((nbpacked + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
+
+#ifdef BOUNDS_CHECK
+      if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_j) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_level) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_celltype) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+             printf("DEBUG size issue at %d\n",__LINE__);
+      }
+      if (ezcl_get_device_mem_nelements(dev_border_cell_i) < nbpacked || 
+          ezcl_get_device_mem_nelements(dev_border_cell_j) < nbpacked || 
+          ezcl_get_device_mem_nelements(dev_border_cell_level) < nbpacked ){
+             printf("DEBUG size issue at %d\n",__LINE__);
+      }
+#endif
+ 
+      cl_event fill_mesh_ghost_event;
+
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  0,  sizeof(cl_int), (void *)&nbpacked);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  1,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  2,  sizeof(cl_mem), (void *)&dev_levibeg);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  3,  sizeof(cl_mem), (void *)&dev_leviend);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  4,  sizeof(cl_mem), (void *)&dev_levjbeg);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  5,  sizeof(cl_mem), (void *)&dev_levjend);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  6,  sizeof(cl_mem), (void *)&dev_border_cell_i);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  7,  sizeof(cl_mem), (void *)&dev_border_cell_j);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  8,  sizeof(cl_mem), (void *)&dev_border_cell_level);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost,  9,  sizeof(cl_mem), (void *)&dev_i);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 10,  sizeof(cl_mem), (void *)&dev_j);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 11,  sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 12,  sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 13,  sizeof(cl_mem), (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 14,  sizeof(cl_mem), (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 15,  sizeof(cl_mem), (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 16,  sizeof(cl_mem), (void *)&dev_ntop);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_mesh_ghost, 1, NULL, &nb_global_work_size, &nb_local_work_size, &fill_mesh_ghost_event); 
+
+      ezcl_wait_for_events(1, &fill_mesh_ghost_event);
+      ezcl_event_release(fill_mesh_ghost_event);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_FILL_MESH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      if (DEBUG){
+         fprintf(fp,"After copying i,j, level to ghost cells\n");
+         print_dev_local();
+      }
+
+      ezcl_device_memory_delete(dev_border_cell_i);
+      ezcl_device_memory_delete(dev_border_cell_j);
+      ezcl_device_memory_delete(dev_border_cell_level);
+
+      size_t ghost_local_work_size = 128;
+      size_t ghost_global_work_size = ((ncells_ghost + ghost_local_work_size - 1) /ghost_local_work_size) * ghost_local_work_size;
+
+      cl_event fill_neighbor_ghost_event;
+
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  0,  sizeof(cl_int),   (void *)&ncells_ghost);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  1,  sizeof(cl_int),   (void *)&levmx);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  2,  sizeof(cl_int),   (void *)&imax);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  3,  sizeof(cl_int),   (void *)&jmax);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  4,  sizeof(cl_mem),   (void *)&dev_sizes);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  5,  sizeof(cl_mem),   (void *)&dev_levtable);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  6,  sizeof(cl_mem),   (void *)&dev_i);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  7,  sizeof(cl_mem),   (void *)&dev_j);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  8,  sizeof(cl_mem),   (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost,  9,  sizeof(cl_mem),   (void *)&dev_hash_header);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 10,  sizeof(cl_mem),   (void *)&dev_hash);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 11,  sizeof(cl_mem),   (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 12,  sizeof(cl_mem),   (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 13,  sizeof(cl_mem),   (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 14,  sizeof(cl_mem),   (void *)&dev_ntop);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_neighbor_ghost, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &fill_neighbor_ghost_event); 
+
+      ezcl_wait_for_events(1, &fill_neighbor_ghost_event);
+      ezcl_event_release(fill_neighbor_ghost_event);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      if (DEBUG){
+         fprintf(fp,"After setting neighbors through ghost cells\n");
+         print_dev_local();
+      }
+
+#ifdef BOUNDS_CHECK
+      if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
+          ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
+          ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+         printf("%d: Warning sizes for set_corner_neighbor not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
+      }
+#endif
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      if (DEBUG){
+         fprintf(fp,"After setting corner neighbors\n");
+         print_dev_local();
+      }
+
+#ifdef BOUNDS_CHECK
+      if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost || 
+          ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
+          ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
+          ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+         printf("%d: Warning sizes for adjust neighbors not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
+      }
+      if (ezcl_get_device_mem_nelements(dev_indices_needed) < (int)(ncells_ghost-ncells) ){
+         printf("%d: Warning indices size wrong nghost %d size indices_needed\n",mype,ncells_ghost-ncells,ezcl_get_device_mem_nelements(dev_indices_needed));
+      }
+#endif
+
+      cl_event adjust_neighbors_local_event;
+
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  0,  sizeof(cl_int), (void *)&ncells_ghost);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  1,  sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  2,  sizeof(cl_int), (void *)&noffset);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  3,  sizeof(cl_mem), (void *)&dev_indices_needed);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  4,  sizeof(cl_mem), (void *)&dev_nlft);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  5,  sizeof(cl_mem), (void *)&dev_nrht);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  6,  sizeof(cl_mem), (void *)&dev_nbot);
+      ezcl_set_kernel_arg(kernel_adjust_neighbors_local,  7,  sizeof(cl_mem), (void *)&dev_ntop);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_adjust_neighbors_local, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &adjust_neighbors_local_event); 
+
+      ezcl_device_memory_delete(dev_indices_needed);
+
+      if (DEBUG){
+         fprintf(fp,"After adjusting neighbors to local indices\n");
+         print_dev_local();
+      }
+
+      ezcl_wait_for_events(1, &adjust_neighbors_local_event);
+      ezcl_event_release(adjust_neighbors_local_event);
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_NEIGH_ADJUST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+         cpu_timer_start(&tstart_lev2);
+      }
+
+      offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
+      offtile_local_count++;
+      offtile_ratio_local /= offtile_local_count;
+
+      if (cell_handle) L7_Free(&cell_handle);
+      cell_handle=0;
+
+      if (DEBUG){
+         fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
+         for (int ic=0; ic<nghost; ic++){
+            fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ic,indices_needed[ic]);
+         }
+      }
+
+      L7_Dev_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
+
+#ifdef BOUNDS_CHECK
+      {
+         vector<int> nlft_tmp(ncells_ghost);
+         vector<int> nrht_tmp(ncells_ghost);
+         vector<int> nbot_tmp(ncells_ghost);
+         vector<int> ntop_tmp(ncells_ghost);
+         vector<int> level_tmp(ncells_ghost);
+         vector<real_t> H_tmp(ncells_ghost);
+         ezcl_enqueue_read_buffer(command_queue, dev_nlft,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nrht,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nbot,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_ntop,  CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+         for (uint ic=0; ic<ncells; ic++){
+            int nl = nlft_tmp[ic];
+            if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+            if (level_tmp[nl] > level_tmp[ic]){
+               int ntl = ntop_tmp[nl];
+               if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
+            }
+            int nr = nrht_tmp[ic];
+            if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+            if (level_tmp[nr] > level_tmp[ic]){
+               int ntr = ntop_tmp[nr];
+               if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
+            }
+            int nb = nbot_tmp[ic];
+            if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+            if (level_tmp[nb] > level_tmp[ic]){
+               int nrb = nrht_tmp[nb];
+               if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
+            }
+            int nt = ntop_tmp[ic];
+            if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d ntop %d ncells %ld ncells_ghost %ld\n",mype,__LINE__,ic,ic+noffset,nt,ncells,ncells_ghost);
+            if (level_tmp[nt] > level_tmp[ic]){
+               int nrt = nrht_tmp[nt];
+               if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
+            }
+         }
+      }
+#endif
+
+      if (TIMING_LEVEL >= 2) {
+         gpu_timers[MESH_TIMER_SETUP_COMM] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+      }
+
+      if (DEBUG) {
+         print_dev_local();
+
+         vector<int> hash_tmp(hashsize);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+         cl_mem dev_hash_header_check = gpu_get_hash_header();
+         vector<ulong> hash_header_check(hash_header_size);
+         ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+         int   gpu_hash_method     = (int)hash_header_check[0];
+         ulong gpu_hash_table_size =      hash_header_check[1];
+         ulong gpu_AA              =      hash_header_check[2];
+         ulong gpu_BB              =      hash_header_check[3];
+
+         vector<int> nlft_tmp(ncells_ghost);
+         vector<int> nrht_tmp(ncells_ghost);
+         vector<int> nbot_tmp(ncells_ghost);
+         vector<int> ntop_tmp(ncells_ghost);
+         ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+         int jmaxglobal = (jmax+1)*IPOW2(levmx);
+         int imaxglobal = (imax+1)*IPOW2(levmx);
+         fprintf(fp,"\n                                    HASH numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  if (ii >= iminsize && ii < imaxsize) {
+                     fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    nlft numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+                  if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+                        fprintf(fp,"%5d",nlft_tmp[hashval]);
+                  } else {
+                        fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+      
+         fprintf(fp,"\n                                    nrht numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+                  if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+                     fprintf(fp,"%5d",nrht_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    nbot numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+                  if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+                     fprintf(fp,"%5d",nbot_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+
+         fprintf(fp,"\n                                    ntop numbering\n");
+         for (int jj = jmaxglobal-1; jj>=0; jj--){
+            fprintf(fp,"%2d: %4d:",mype,jj);
+            if (jj >= jminsize && jj < jmaxsize) {
+               for (int ii = 0; ii<imaxglobal; ii++){
+                  int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+                  if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+                     fprintf(fp,"%5d",ntop_tmp[hashval]);
+                  } else {
+                     fprintf(fp,"     ");
+                  }
+               }
+            }
+            fprintf(fp,"\n");
+         }
+         fprintf(fp,"%2d:      ",mype);
+         for (int ii = 0; ii<imaxglobal; ii++){
+            fprintf(fp,"%4d:",ii);
+         }
+         fprintf(fp,"\n");
+      }
+
+      if (DEBUG) {
+         print_dev_local();
+
+         vector<int> i_tmp(ncells_ghost);
+         vector<int> j_tmp(ncells_ghost);
+         vector<int> level_tmp(ncells_ghost);
+         vector<int> nlft_tmp(ncells_ghost);
+         vector<int> nrht_tmp(ncells_ghost);
+         vector<int> nbot_tmp(ncells_ghost);
+         vector<int> ntop_tmp(ncells_ghost);
+         ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE,  0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+         for (uint ic=0; ic<ncells; ic++){
+            fprintf(fp,"%d: before update ic %d        i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+                mype,ic,i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
+         }
+         int ig=0;
+         for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
+            fprintf(fp,"%d: after  update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+                mype,ic,indices_needed[ig],i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
+         }
+      }
+   }
+#endif
+
+   ezcl_device_memory_delete(dev_sizes);
+   ezcl_device_memory_delete(dev_check);
+
+   gpu_compact_hash_delete(dev_hash, dev_hash_header);
+
+   gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+/*
+ * Print to stdout which neighbor-calculation method is selected.
+ *
+ * When calc_neighbor_type == HASH_TABLE, the process with mype == 0 prints the
+ * method name and, if numpe == 1 as well, calls final_hash_collision_report().
+ * Otherwise every caller prints the value ncells*(int)log(ncells)*sizeof(int)
+ * and the process with mype == 0 prints that the k-D tree is used.
+ */
+void Mesh::print_calc_neighbor_type(void)
+{
+   if ( calc_neighbor_type == HASH_TABLE ) {
+      if (mype == 0) printf("Using hash tables to calculate neighbors\n");
+      if (mype == 0 && numpe == 1) final_hash_collision_report();
+   } else {
+      // The sizeof operand makes this product unsigned, so "%ld" did not match
+      // the argument type. Convert explicitly to the type that "%lu" expects.
+      printf("hash table size %lu\n",(unsigned long)(ncells*(int)log(ncells)*sizeof(int)));
+      if (mype == 0) printf("Using k-D tree to calculate neighbors\n");
+   }
+}
+
+// Accessor: hands back the current value of calc_neighbor_type as an int.
+int Mesh::get_calc_neighbor_type(void)
+{
+   return calc_neighbor_type;
+}
+
+/*
+ * Fill celltype[0..ncells-1] with a classification for every cell.
+ *
+ * The OpenMP directives used here are barrier / master / for without a
+ * "parallel" clause, so this routine is presumably meant to be called from
+ * inside an already-active parallel region (confirm against the callers).
+ *
+ * The celltype array is (re)allocated through mesh_memory by the master
+ * thread only, fenced by barriers, whenever it is NULL or its recorded size
+ * is smaller than ncells.
+ *
+ * Each cell starts as REAL_CELL; the left, right, bottom and top boundary
+ * predicates are then evaluated in that order and each one that holds
+ * overwrites the value, so the last matching predicate decides the result.
+ */
+void Mesh::calc_celltype_threaded(size_t ncells)
+{
+#ifdef HAVE_J7
+   const int flags = parallel ? LOAD_BALANCE_MEMORY : 0;
+#else
+   const int flags = 0;
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   // True only for an existing allocation that cannot hold ncells entries;
+   // the size query is not made when there is no allocation at all.
+   const bool undersized = (celltype != NULL) &&
+                           (mesh_memory.get_memory_size(celltype) < ncells);
+   if (celltype == NULL || undersized) {
+      if (undersized) {
+         celltype = (int *)mesh_memory.memory_delete(celltype);
+      }
+      celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", flags);
+   }
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+   for (uint cell=0; cell<ncells; ++cell) {
+      int &kind = celltype[cell];
+
+      kind = REAL_CELL;
+      if (is_left_boundary(cell)) {
+         kind = LEFT_BOUNDARY;
+      }
+      if (is_right_boundary(cell)) {
+         kind = RIGHT_BOUNDARY;
+      }
+      if (is_bottom_boundary(cell)) {
+         kind = BOTTOM_BOUNDARY;
+      }
+      if (is_top_boundary(cell)) {
+         kind = TOP_BOUNDARY;
+      }
+   }
+}
+
+// Classify the first ncells cells as REAL_CELL or one of the four boundary
+// types, storing the result in celltype.  The celltype array is
+// (re)allocated first when it is missing or holds fewer than ncells entries.
+// With OpenMP enabled, the classification loop runs as "omp parallel for".
+void Mesh::calc_celltype(size_t ncells)
+{
+   int alloc_flags = 0;
+#ifdef HAVE_J7
+   if (parallel) {
+      alloc_flags = LOAD_BALANCE_MEMORY;
+   }
+#endif
+
+   // Keep the existing array only if it exists and already holds ncells entries
+   const bool big_enough = (celltype != NULL) && !(mesh_memory.get_memory_size(celltype) < ncells);
+   if (! big_enough) {
+      if (celltype != NULL) {
+         celltype = (int *)mesh_memory.memory_delete(celltype);
+      }
+      celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", alloc_flags);
+   }
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+   for (uint cell=0; cell<ncells; ++cell) {
+      // Later tests override earlier ones, so the precedence is
+      // top > bottom > right > left > real cell.
+      celltype[cell] = REAL_CELL;
+      if (is_left_boundary(cell)) {
+         celltype[cell] = LEFT_BOUNDARY;
+      }
+      if (is_right_boundary(cell)) {
+         celltype[cell] = RIGHT_BOUNDARY;
+      }
+      if (is_bottom_boundary(cell)) {
+         celltype[cell] = BOTTOM_BOUNDARY;
+      }
+      if (is_top_boundary(cell)) {
+         celltype[cell] = TOP_BOUNDARY;
+      }
+   }
+}
+
+// For every cell, look up the cells found at three mirrored positions using
+// the k-D tree.
+//
+// The reference point of a cell is (x+0.5*dx, y+0.5*dy).  Three point
+// queries (box with min == max) are issued per cell:
+//    dsym -- both coordinates negated
+//    xsym -- x negated
+//    ysym -- y negated
+// Each output entry defaults to the cell's own index and is replaced only
+// when the query reports exactly one hit.
+// NOTE(review): the outputs are indexed up to ncells-1, so callers presumably
+// pre-size them to at least ncells -- confirm.
+void Mesh::calc_symmetry(vector<int> &dsym, vector<int> &xsym, vector<int> &ysym)
+{
+   TBounds query;
+   vector<int> hits( IPOW2(levmx*levmx) );
+
+   int nhits;
+   for (uint cell=0; cell<ncells; cell++) {
+      dsym[cell]=cell;
+      xsym[cell]=cell;
+      ysym[cell]=cell;
+
+      // Reference point of this cell and its negated coordinates
+      const double xpos = x[cell]+0.5*dx[cell];
+      const double ypos = y[cell]+0.5*dy[cell];
+      const double xneg = -1.0*xpos;
+      const double yneg = -1.0*ypos;
+
+      // diagonal symmetry: (-x, -y)
+      query.min.x = xneg;
+      query.max.x = xneg;
+      query.min.y = yneg;
+      query.max.y = yneg;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &query);
+      if (nhits == 1) {
+         dsym[cell]=hits[0];
+      }
+
+      // x symmetry: (-x, y)
+      query.min.x = xneg;
+      query.max.x = xneg;
+      query.min.y = ypos;
+      query.max.y = ypos;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &query);
+      if (nhits == 1) {
+         xsym[cell]=hits[0];
+      }
+
+      // y symmetry: (x, -y)
+      query.min.x = xpos;
+      query.max.x = xpos;
+      query.min.y = yneg;
+      query.max.y = yneg;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &query);
+      if (nhits == 1) {
+         ysym[cell]=hits[0];
+      }
+   }
+}
+
+#ifdef HAVE_MPI
+// Gather one redistributed array into its new local layout.
+//
+// src holds the ncells_old original local entries followed by the entries
+// fetched from other ranks (lower block first, then upper block), src_count
+// entries in total.  At most ncells_new entries are written to dest, in
+// three passes:
+//    1. the fetched lower block,
+//    2. the retained part of the old local data, starting at middle_start,
+//    3. the fetched upper block, starting upper_skip entries into it.
+template <typename T>
+static void load_balance_repack(T *dest, const T *src, int ncells_new, int ncells_old,
+                                int lower_block_size, int middle_start,
+                                int upper_block_size, int upper_skip, int src_count)
+{
+   int in = 0;
+
+   if (lower_block_size > 0) {
+      for (; in < MIN(lower_block_size, ncells_new); in++) {
+         dest[in] = src[ncells_old + in];
+      }
+   }
+
+   for (int ic = middle_start; (ic < ncells_old) && (in < ncells_new); ic++, in++) {
+      dest[in] = src[ic];
+   }
+
+   if (upper_block_size > 0) {
+      int ic = ncells_old + lower_block_size;
+      for (int k = upper_skip; ((k+ic) < src_count) && (in < ncells_new); k++, in++) {
+         dest[in] = src[ic+k];
+      }
+   }
+}
+
+// Redistribute cells so that every rank owns an even share of ncells_global.
+//
+//    numcells     -- number of cells this rank holds on entry (old local count)
+//    weight       -- currently only echoed under DEBUG; not used for balancing
+//    state_memory -- registry of state arrays to redistribute with the mesh
+//
+// The new per-rank sizes are ncells_global/numpe, with the first
+// ncells_global%numpe ranks taking one extra cell.  If any rank's size
+// changes, nsizes, ndispl, ncells and noffset are updated, every 8- and
+// 4-byte array in state_memory and every array in mesh_memory is exchanged
+// through L7 and repacked, pointers are reset with memory_reset_ptrs(), and
+// celltype is recomputed.  Elapsed time is added to
+// cpu_timers[MESH_TIMER_LOAD_BALANCE] in all cases.
+void Mesh::do_load_balance_local(size_t numcells, float *weight, MallocPlus &state_memory)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // To get rid of compiler warning
+   if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
+
+   int ncells_old = numcells;
+   int noffset_old = ndispl[mype];
+
+// Need to add weight array to load balance if it is not NULL
+// Need to add tolerance to when load balance is done
+
+   int do_load_balance_global = 0;
+   int nsizes_old = 0;
+
+   for (int ip=0; ip<numpe; ip++){
+      nsizes_old = nsizes[ip];
+
+      // Calc new,even partition of data across processors
+      nsizes[ip] = ncells_global/numpe;
+      // Account for leftover cells
+      if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+
+      if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
+   }
+
+   if (do_load_balance_global) {
+      cpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
+
+      mesh_memory.memory_delete(celltype);
+      mesh_memory.memory_delete(nlft);
+      mesh_memory.memory_delete(nrht);
+      mesh_memory.memory_delete(nbot);
+      mesh_memory.memory_delete(ntop);
+
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      ncells = nsizes[mype];
+      noffset=ndispl[mype];
+
+      // Indices of blocks to be added to load balance
+      int lower_block_start = noffset;
+      int lower_block_end   = min(noffset_old-1, (int)(noffset+ncells-1));
+      int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
+      int upper_block_end   = noffset+ncells-1;
+
+      int lower_block_size = max(lower_block_end-lower_block_start+1,0);
+      if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
+      int upper_block_size = max(upper_block_end-upper_block_start+1,0);
+      int indices_needed_count = lower_block_size + upper_block_size;
+
+      // Loop-invariant bounds shared by every array that is repacked below
+      const int ncells_new   = (int)ncells;
+      const int middle_start = MAX((noffset - noffset_old), 0);
+      const int upper_skip   = max(noffset-upper_block_start,0);
+      const int src_count    = ncells_old+indices_needed_count;
+
+      int in = 0;
+
+      vector<int> indices_needed(indices_needed_count);
+      for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+      for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+
+      // This rank may need nothing from other ranks even though a rebalance is
+      // happening; indexing element 0 of an empty vector is undefined, so hand
+      // L7 a null pointer together with the zero count in that case.
+      int *indices_needed_ptr = indices_needed.empty() ? NULL : &indices_needed[0];
+
+      int load_balance_handle = 0;
+      L7_Setup(0, noffset_old, ncells_old, indices_needed_ptr, indices_needed_count, &load_balance_handle);
+
+      //printf("\n%d: DEBUG load balance report\n",mype);
+
+      // Grow every state array so L7 can append the fetched entries after the
+      // old local data
+      state_memory.memory_realloc_all(ncells_old+indices_needed_count);
+
+      // Iterate over a copy of the registry; the live one is modified
+      // (memory_malloc/memory_replace) inside the loop
+      MallocPlus state_memory_old = state_memory;
+
+
+      malloc_plus_memory_entry *memory_item;
+
+      for (memory_item = state_memory_old.memory_entry_by_name_begin();
+           memory_item != state_memory_old.memory_entry_by_name_end();
+           memory_item = state_memory_old.memory_entry_by_name_next() ) {
+
+         //if (mype == 0) printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+
+         // State entries with an element size other than 8 or 4 are left untouched
+         if (memory_item->mem_elsize == 8) {
+            double *mem_ptr_double = (double *)memory_item->mem_ptr;
+
+            int flags = state_memory.get_memory_flags(mem_ptr_double);
+            double *state_temp_double = (double *) state_memory.memory_malloc(ncells, sizeof(double),
+                                                                              "state_temp_double", flags);
+
+            //printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
+            L7_Update(mem_ptr_double, L7_DOUBLE, load_balance_handle);
+            load_balance_repack(state_temp_double, mem_ptr_double, ncells_new, ncells_old,
+                                lower_block_size, middle_start, upper_block_size, upper_skip, src_count);
+            state_memory.memory_replace(mem_ptr_double, state_temp_double);
+         } else if (memory_item->mem_elsize == 4) {
+            float *mem_ptr_float = (float *)memory_item->mem_ptr;
+
+            int flags = state_memory.get_memory_flags(mem_ptr_float);
+            float *state_temp_float = (float *) state_memory.memory_malloc(ncells, sizeof(float),
+                                                                          "state_temp_float", flags);
+
+            //printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
+            L7_Update(mem_ptr_float, L7_FLOAT, load_balance_handle);
+            load_balance_repack(state_temp_float, mem_ptr_float, ncells_new, ncells_old,
+                                lower_block_size, middle_start, upper_block_size, upper_skip, src_count);
+            state_memory.memory_replace(mem_ptr_float, state_temp_float);
+         }
+      }
+
+      mesh_memory.memory_realloc_all(ncells_old+indices_needed_count);
+
+      MallocPlus mesh_memory_old = mesh_memory;
+
+      for (memory_item = mesh_memory_old.memory_entry_by_name_begin();
+           memory_item != mesh_memory_old.memory_entry_by_name_end();
+           memory_item = mesh_memory_old.memory_entry_by_name_next() ) {
+
+         //if (mype == 0) printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+
+         // Mesh entries are treated as long long when 8 bytes wide and as int otherwise
+         if (memory_item->mem_elsize == 8) {
+            long long *mem_ptr_long = (long long *)memory_item->mem_ptr;
+
+            int flags = mesh_memory.get_memory_flags(mem_ptr_long);
+            long long *mesh_temp_long = (long long *)mesh_memory.memory_malloc(ncells, sizeof(long long), "mesh_temp_long", flags);
+
+            //printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
+            L7_Update(mem_ptr_long, L7_LONG_LONG_INT, load_balance_handle);
+            load_balance_repack(mesh_temp_long, mem_ptr_long, ncells_new, ncells_old,
+                                lower_block_size, middle_start, upper_block_size, upper_skip, src_count);
+            mesh_memory.memory_replace(mem_ptr_long, mesh_temp_long);
+
+         } else {
+            int *mem_ptr_int = (int *)memory_item->mem_ptr;
+
+            int flags = mesh_memory.get_memory_flags(mem_ptr_int);
+            int *mesh_temp_int = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "mesh_temp_int", flags);
+
+            //printf("%d: DEBUG L7_Update in do_load_balance_local mem_ptr %p\n",mype,mem_ptr);
+            L7_Update(mem_ptr_int, L7_INT, load_balance_handle);
+            load_balance_repack(mesh_temp_int, mem_ptr_int, ncells_new, ncells_old,
+                                lower_block_size, middle_start, upper_block_size, upper_skip, src_count);
+            mesh_memory.memory_replace(mem_ptr_int, mesh_temp_int);
+
+         }
+      }
+
+      L7_Free(&load_balance_handle);
+      load_balance_handle = 0;
+
+      memory_reset_ptrs();
+
+      //mesh_memory.memory_report();
+      //state_memory.memory_report();
+      //printf("%d: DEBUG end load balance report\n\n",mype);
+      calc_celltype(ncells);
+   }
+
+
+   cpu_timers[MESH_TIMER_LOAD_BALANCE] += cpu_timer_stop(tstart_cpu);
+}
+#endif
+
+#ifdef HAVE_OPENCL
+#ifdef HAVE_MPI
+// Device-side load balance: redistribute cells so that every rank owns an
+// even share of ncells_global, for data held in OpenCL buffers.
+//
+//    numcells         -- number of cells this rank holds on entry (old local count)
+//    weight           -- currently only echoed under DEBUG; not used for balancing
+//    gpu_state_memory -- registry of device state buffers to redistribute
+//
+// Returns 0 immediately when gpu_do_rezone is false.  Otherwise returns 1 if
+// any rank's size changed (and the redistribution was performed) and 0 if
+// the partition was already even.
+//
+// When a redistribution is performed: nsizes, ndispl, ncells and noffset are
+// updated; for each 8- or 4-byte entry of gpu_state_memory the cells that
+// leave this rank are read to the host, exchanged with L7, the received
+// lower/upper blocks are written to staging buffers, and a kernel assembles
+// a new buffer that replaces the old entry; dev_i, dev_j, dev_level and
+// dev_celltype are rebuilt the same way with the lower/middle/upper kernels.
+// Elapsed time is added to gpu_timers[MESH_TIMER_LOAD_BALANCE] only when a
+// redistribution was performed.
+int Mesh::gpu_do_load_balance_local(size_t numcells, float *weight, MallocPlus &gpu_state_memory)
+{
+   int do_load_balance_global = 0;
+
+   if (! gpu_do_rezone) return(do_load_balance_global);
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // To get rid of compiler warning
+   if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
+
+   int ncells_old = numcells;
+   int noffset_old = ndispl[mype];
+
+// Need to add weight array to load balance if it is not NULL
+// Need to add tolerance to when load balance is done
+
+   int nsizes_old = 0;
+   for (int ip=0; ip<numpe; ip++){
+      nsizes_old = nsizes[ip];
+      // New, even partition; the first ncells_global%numpe ranks take one extra cell
+      nsizes[ip] = ncells_global/numpe;
+      if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+      if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
+   }
+
+   if(do_load_balance_global) {
+
+      cl_command_queue command_queue = ezcl_get_command_queue();
+
+      gpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
+
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      ncells = nsizes[mype];
+      noffset=ndispl[mype];
+
+      // Indices of blocks to be added to load balance
+      int lower_block_start = noffset;
+      int lower_block_end   = min(noffset_old-1, (int)(noffset+ncells-1));
+      int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
+      int upper_block_end   = noffset+ncells-1;
+      //printf("%d: lbs %d lbe %d ubs %d ube %d\n",mype,lower_block_start-noffset_old,lower_block_end-noffset_old,upper_block_start-noffset_old,upper_block_end-noffset_old);
+
+      size_t lower_block_size = max(lower_block_end-lower_block_start+1,0);
+      if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
+      size_t upper_block_size = max(upper_block_end-upper_block_start+1,0);
+      int indices_needed_count = lower_block_size + upper_block_size;
+
+      size_t middle_block_size = ncells - lower_block_size - upper_block_size;
+      int middle_block_start = max(noffset - noffset_old, 0);
+
+      // The scalar kernel arguments below are passed with sizeof(cl_int).
+      // Taking the address of a size_t there only yields the low 32 bits on
+      // little-endian hosts, so pass real cl_int copies instead.
+      cl_int ncells_arg            = (cl_int)ncells;
+      cl_int lower_block_size_arg  = (cl_int)lower_block_size;
+      cl_int middle_block_size_arg = (cl_int)middle_block_size;
+      cl_int upper_block_size_arg  = (cl_int)upper_block_size;
+
+      // Segments of the old local data that other ranks will request and
+      // that therefore have to be read back from the device
+      int lower_segment_size = noffset-noffset_old;
+      int do_whole_segment = 0;
+      if (lower_segment_size > ncells_old) do_whole_segment = 1;
+
+      int upper_segment_size = ( (noffset_old+ncells_old) - (noffset+ncells) );
+      int upper_segment_start = (noffset_old+ncells_old) - upper_segment_size - noffset_old;
+      if (upper_segment_size > ncells_old) do_whole_segment=1;
+
+      int in = 0;
+      vector<int> indices_needed(indices_needed_count);
+      for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+      for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+
+      // This rank may need nothing from other ranks even though a rebalance is
+      // happening; indexing element 0 of an empty vector is undefined, so hand
+      // L7 a null pointer together with the zero count in that case.
+      int *indices_needed_ptr = indices_needed.empty() ? NULL : &indices_needed[0];
+
+      int load_balance_handle = 0;
+      L7_Setup(0, noffset_old, ncells_old, indices_needed_ptr, indices_needed_count, &load_balance_handle);
+
+      size_t local_work_size = 128;
+      size_t global_work_size = ((ncells + local_work_size - 1) / local_work_size) * local_work_size;
+
+      // printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
+
+      // NOTE(review): the two staging buffers are sized with sizeof(cl_real_t)
+      // but the 8-byte branch below fills them with cl_double-sized writes --
+      // confirm cl_real_t is 8 bytes wide whenever that branch is compiled.
+
+      // Allocate lower block on GPU (at least one element so the buffer always exists)
+      size_t low_block_size = MAX(1, lower_block_size);
+      cl_mem dev_state_var_lower = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_lower"), &low_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+      // Allocate upper block on GPU (at least one element so the buffer always exists)
+      size_t up_block_size = MAX(1, upper_block_size);
+      cl_mem dev_state_var_upper = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_upper"), &up_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+      // Iterate over a copy of the registry; the live one is modified
+      // (memory_add/memory_replace) inside the loop
+      MallocPlus gpu_state_memory_old = gpu_state_memory;
+      malloc_plus_memory_entry *memory_item;
+
+      for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
+           memory_item != gpu_state_memory_old.memory_entry_by_name_end();
+           memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
+         //printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+         cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
+
+         if (memory_item->mem_elsize == 8){
+#ifndef MINIMUM_PRECISION
+            vector<double> state_var_tmp(ncells_old+indices_needed_count,0.0);
+
+            // Read current state values from GPU and write to CPU arrays
+            if (do_whole_segment) {
+               ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_double), &state_var_tmp[0], NULL);
+            } else {
+               // Read lower block from GPU
+               if (lower_segment_size > 0) {
+                  ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_double), &state_var_tmp[0], NULL);
+               }
+               // Read upper block from GPU
+               if (upper_segment_size > 0) {
+                  ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_double), upper_segment_size*sizeof(cl_double), &state_var_tmp[upper_segment_start], NULL);
+               }
+            }
+
+            // Update arrays with L7
+            L7_Update(&state_var_tmp[0], L7_DOUBLE, load_balance_handle);
+
+            // Set lower block on GPU
+            if(lower_block_size > 0) {
+               ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_double), &state_var_tmp[ncells_old], NULL);
+            }
+            // Set upper block on GPU
+            if(upper_block_size > 0) {
+               ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_double), &state_var_tmp[ncells_old+lower_block_size], NULL);
+            }
+
+            // Allocate space on GPU for temp arrays (used in double buffering)
+            cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_double), CL_MEM_READ_WRITE, 0);
+            gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_double), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
+
+            //printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
+
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 0, sizeof(cl_int), &ncells_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 1, sizeof(cl_int), &lower_block_size_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 2, sizeof(cl_int), &middle_block_size_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 3, sizeof(cl_int), &middle_block_start);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 4, sizeof(cl_mem), &dev_state_mem_ptr);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 5, sizeof(cl_mem), &dev_state_var_lower);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 6, sizeof(cl_mem), &dev_state_var_upper);
+            ezcl_set_kernel_arg(kernel_do_load_balance_double, 7, sizeof(cl_mem), &dev_state_var_new);
+
+            ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_double,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+            gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+#else
+            printf("ERROR -- can't have double type for state variable\n");
+            exit(1);
+#endif
+         } else if (memory_item->mem_elsize == 4){
+            vector<float> state_var_tmp(ncells_old+indices_needed_count,0.0);
+
+            // Read current state values from GPU and write to CPU arrays
+            if (do_whole_segment) {
+               ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_float), &state_var_tmp[0], NULL);
+            } else {
+               // Read lower block from GPU
+               if (lower_segment_size > 0) {
+                  ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_float), &state_var_tmp[0], NULL);
+               }
+               // Read upper block from GPU
+               if (upper_segment_size > 0) {
+                  ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_float), upper_segment_size*sizeof(cl_float), &state_var_tmp[upper_segment_start], NULL);
+               }
+            }
+
+            // Update arrays with L7
+            L7_Update(&state_var_tmp[0], L7_FLOAT, load_balance_handle);
+
+            // Set lower block on GPU
+            if(lower_block_size > 0) {
+               ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_float), &state_var_tmp[ncells_old], NULL);
+            }
+            // Set upper block on GPU
+            if(upper_block_size > 0) {
+               ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_float), &state_var_tmp[ncells_old+lower_block_size], NULL);
+            }
+
+            // Allocate space on GPU for temp arrays (used in double buffering)
+            cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_float), CL_MEM_READ_WRITE, 0);
+            gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_float), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
+
+            //printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
+
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 0, sizeof(cl_int), &ncells_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 1, sizeof(cl_int), &lower_block_size_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 2, sizeof(cl_int), &middle_block_size_arg);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 3, sizeof(cl_int), &middle_block_start);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 4, sizeof(cl_mem), &dev_state_mem_ptr);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 5, sizeof(cl_mem), &dev_state_var_lower);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 6, sizeof(cl_mem), &dev_state_var_upper);
+            ezcl_set_kernel_arg(kernel_do_load_balance_float, 7, sizeof(cl_mem), &dev_state_var_new);
+
+            ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_float,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+            gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+         }
+      }
+
+      ezcl_device_memory_delete(dev_state_var_lower);
+      ezcl_device_memory_delete(dev_state_var_upper);
+
+      // Host staging arrays for the mesh data: old local entries followed by
+      // the entries received through L7
+      vector<int> i_tmp(ncells_old+indices_needed_count,0);
+      vector<int> j_tmp(ncells_old+indices_needed_count,0);
+      vector<int> level_tmp(ncells_old+indices_needed_count,0);
+      vector<int> celltype_tmp(ncells_old+indices_needed_count,0);
+
+      // In each group the last read is blocking (CL_TRUE); the others are enqueued non-blocking
+      if (do_whole_segment) {
+         ezcl_enqueue_read_buffer(command_queue, dev_i,        CL_FALSE, 0, ncells_old*sizeof(cl_int), &i_tmp[0],        NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_j,        CL_FALSE, 0, ncells_old*sizeof(cl_int), &j_tmp[0],        NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_level,    CL_FALSE, 0, ncells_old*sizeof(cl_int), &level_tmp[0],    NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE,  0, ncells_old*sizeof(cl_int), &celltype_tmp[0], NULL);
+      } else {
+         if (lower_segment_size > 0) {
+            ezcl_enqueue_read_buffer(command_queue, dev_i,        CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &i_tmp[0],        NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_j,        CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &j_tmp[0],        NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_level,    CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &level_tmp[0],    NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE,  0, lower_segment_size*sizeof(cl_int), &celltype_tmp[0], NULL);
+         }
+         if (upper_segment_size > 0) {
+            ezcl_enqueue_read_buffer(command_queue, dev_i,        CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &i_tmp[upper_segment_start],        NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_j,        CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &j_tmp[upper_segment_start],        NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_level,    CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &level_tmp[upper_segment_start],    NULL);
+            ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE,  upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &celltype_tmp[upper_segment_start], NULL);
+         }
+      }
+
+      L7_Update(&i_tmp[0],        L7_INT, load_balance_handle);
+      L7_Update(&j_tmp[0],        L7_INT, load_balance_handle);
+      L7_Update(&level_tmp[0],    L7_INT, load_balance_handle);
+      L7_Update(&celltype_tmp[0], L7_INT, load_balance_handle);
+
+      L7_Free(&load_balance_handle);
+      load_balance_handle = 0;
+
+      // Allocate and set lower block on GPU
+      cl_mem dev_i_lower, dev_j_lower, dev_level_lower, dev_celltype_lower;
+
+      if(lower_block_size > 0) {
+         dev_i_lower        = ezcl_malloc(NULL, const_cast<char *>("dev_i_lower"),        &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_j_lower        = ezcl_malloc(NULL, const_cast<char *>("dev_j_lower"),        &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_level_lower    = ezcl_malloc(NULL, const_cast<char *>("dev_level_lower"),    &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_celltype_lower = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+         ezcl_enqueue_write_buffer(command_queue, dev_i_lower,        CL_FALSE, 0, lower_block_size*sizeof(cl_int), &i_tmp[ncells_old],        NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_j_lower,        CL_FALSE, 0, lower_block_size*sizeof(cl_int), &j_tmp[ncells_old],        NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_level_lower,    CL_FALSE, 0, lower_block_size*sizeof(cl_int), &level_tmp[ncells_old],    NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_celltype_lower, CL_TRUE,  0, lower_block_size*sizeof(cl_int), &celltype_tmp[ncells_old], NULL);
+      }
+
+      // Allocate and set upper block on GPU
+      cl_mem dev_i_upper, dev_j_upper, dev_level_upper, dev_celltype_upper;
+      if(upper_block_size > 0) {
+         dev_i_upper        = ezcl_malloc(NULL, const_cast<char *>("dev_i_upper"),        &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_j_upper        = ezcl_malloc(NULL, const_cast<char *>("dev_j_upper"),        &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_level_upper    = ezcl_malloc(NULL, const_cast<char *>("dev_level_upper"),    &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+         dev_celltype_upper = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+         ezcl_enqueue_write_buffer(command_queue, dev_i_upper,        CL_FALSE, 0, upper_block_size*sizeof(cl_int), &i_tmp[ncells_old+lower_block_size],        NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_j_upper,        CL_FALSE, 0, upper_block_size*sizeof(cl_int), &j_tmp[ncells_old+lower_block_size],        NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_level_upper,    CL_FALSE, 0, upper_block_size*sizeof(cl_int), &level_tmp[ncells_old+lower_block_size],    NULL);
+         ezcl_enqueue_write_buffer(command_queue, dev_celltype_upper, CL_TRUE,  0, upper_block_size*sizeof(cl_int), &celltype_tmp[ncells_old+lower_block_size], NULL);
+      }
+
+      local_work_size = 128;
+
+      // printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
+      // Allocate space on GPU for temp arrays (used in double buffering)
+
+      // The new mesh buffers are sized as ncells scaled by mem_factor
+      size_t mem_request = (int)((float)ncells*mem_factor);
+      cl_mem dev_i_new        = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"),        &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      cl_mem dev_j_new        = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"),        &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      cl_mem dev_level_new    = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"),    &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+      cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+      // Set kernel arguments and call lower block kernel
+      if(lower_block_size > 0) {
+
+         size_t lower_global_work_size = ((lower_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 0, sizeof(cl_mem), &dev_i_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 1, sizeof(cl_mem), &dev_j_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 2, sizeof(cl_mem), &dev_level_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 3, sizeof(cl_mem), &dev_celltype_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 4, sizeof(cl_mem), &dev_i_lower);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 5, sizeof(cl_mem), &dev_j_lower);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 6, sizeof(cl_mem), &dev_level_lower);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 7, sizeof(cl_mem), &dev_celltype_lower);
+         ezcl_set_kernel_arg(kernel_do_load_balance_lower, 8, sizeof(cl_int), &lower_block_size_arg);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_lower,   1, NULL, &lower_global_work_size, &local_work_size, NULL);
+
+         ezcl_device_memory_delete(dev_i_lower);
+         ezcl_device_memory_delete(dev_j_lower);
+         ezcl_device_memory_delete(dev_level_lower);
+         ezcl_device_memory_delete(dev_celltype_lower);
+      }
+
+      // Set kernel arguments and call middle block kernel
+      if(middle_block_size > 0) {
+
+         size_t middle_global_work_size = ((middle_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  0, sizeof(cl_mem), &dev_i_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  1, sizeof(cl_mem), &dev_j_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  2, sizeof(cl_mem), &dev_level_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  3, sizeof(cl_mem), &dev_celltype_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  4, sizeof(cl_mem), &dev_i);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  5, sizeof(cl_mem), &dev_j);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  6, sizeof(cl_mem), &dev_level);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  7, sizeof(cl_mem), &dev_celltype);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  8, sizeof(cl_int), &lower_block_size_arg);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle,  9, sizeof(cl_int), &middle_block_size_arg);
+         ezcl_set_kernel_arg(kernel_do_load_balance_middle, 10, sizeof(cl_int), &middle_block_start);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_middle,   1, NULL, &middle_global_work_size, &local_work_size, NULL);
+      }
+
+      // Set kernel arguments and call upper block kernel
+      if(upper_block_size > 0) {
+
+         size_t upper_global_work_size = ((upper_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  0, sizeof(cl_mem), &dev_i_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  1, sizeof(cl_mem), &dev_j_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  2, sizeof(cl_mem), &dev_level_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  3, sizeof(cl_mem), &dev_celltype_new);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  4, sizeof(cl_mem), &dev_i_upper);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  5, sizeof(cl_mem), &dev_j_upper);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  6, sizeof(cl_mem), &dev_level_upper);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  7, sizeof(cl_mem), &dev_celltype_upper);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  8, sizeof(cl_int), &lower_block_size_arg);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper,  9, sizeof(cl_int), &middle_block_size_arg);
+         ezcl_set_kernel_arg(kernel_do_load_balance_upper, 10, sizeof(cl_int), &upper_block_size_arg);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_upper,   1, NULL, &upper_global_work_size, &local_work_size, NULL);
+
+         ezcl_device_memory_delete(dev_i_upper);
+         ezcl_device_memory_delete(dev_j_upper);
+         ezcl_device_memory_delete(dev_level_upper);
+         ezcl_device_memory_delete(dev_celltype_upper);
+      }
+
+      // Swap the freshly assembled buffers into dev_i/dev_j/dev_level/dev_celltype;
+      // after the swap the *_new handles name the old buffers, which are released
+      ezcl_device_memory_swap(&dev_i_new,        &dev_i);
+      ezcl_device_memory_swap(&dev_j_new,        &dev_j);
+      ezcl_device_memory_swap(&dev_level_new,    &dev_level);
+      ezcl_device_memory_swap(&dev_celltype_new, &dev_celltype);
+
+      ezcl_device_memory_delete(dev_i_new);
+      ezcl_device_memory_delete(dev_j_new);
+      ezcl_device_memory_delete(dev_level_new);
+      ezcl_device_memory_delete(dev_celltype_new);
+
+      gpu_timers[MESH_TIMER_LOAD_BALANCE] += (long int)(cpu_timer_stop(tstart_cpu)*1.0e9);
+   }
+
+   return(do_load_balance_global);
+}
+#endif
+#endif
+
+#ifdef HAVE_OPENCL
+/**
+ * Runs kernel_count_BCs over the mesh cells on the device and returns the
+ * summed integer result.
+ *
+ * The first kernel writes into a scratch buffer with one cl_int per work
+ * group; when more than one work group was launched a second kernel
+ * (kernel_reduce_sum_int_stage2of2) is run over that buffer.  The first
+ * element of the scratch buffer is then read back with a blocking read.
+ *
+ * Nothing is launched when have_boundary is set or when the mesh is empty.
+ *
+ * @return the value read back from the device, or 0 when no kernel was run.
+ */
+int Mesh::gpu_count_BCs(void)
+{
+   int bcount = 0;
+
+   // With no cells the local work size below would be zero and the global
+   // work size computation would divide by it, so bail out early.
+   if (ncells == 0) return(bcount);
+
+   cl_event count_BCs_stage1_event, count_BCs_stage2_event;
+
+   size_t local_work_size  = MIN(ncells, TILE_SIZE);
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   //size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; //  For on-device global reduction kernel.
+   size_t block_size     = global_work_size/local_work_size;
+
+   if (! have_boundary) {
+      cl_command_queue command_queue = ezcl_get_command_queue();
+      cl_mem dev_ioffset  = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      // The kernels take these as int.  Passing the address of a wider host
+      // variable together with sizeof(cl_int) only happens to work on
+      // little-endian hosts, so hand over real cl_int copies instead.
+      cl_int ncells_arg     = (cl_int)ncells;
+      cl_int block_size_arg = (cl_int)block_size;
+
+       /*
+       __kernel void count_BCs(
+                        const int    isize,      // 0   
+               __global const int   *i,         // 1
+               __global const int   *j,         // 2
+               __global const int   *level,     // 3
+               __global const int   *lev_ibeg,  // 4
+               __global const int   *lev_iend,  // 5
+               __global const int   *lev_jbeg,  // 6
+               __global const int   *lev_jend,  // 7
+               __global       int   *scratch,   // 8
+               __local        int   *tile)      // 9
+       */
+      size_t shared_spd_sum_int = local_work_size * sizeof(cl_int);
+      ezcl_set_kernel_arg(kernel_count_BCs, 0, sizeof(cl_int), (void *)&ncells_arg);
+      ezcl_set_kernel_arg(kernel_count_BCs, 1, sizeof(cl_mem), (void *)&dev_i);
+      ezcl_set_kernel_arg(kernel_count_BCs, 2, sizeof(cl_mem), (void *)&dev_j);
+      ezcl_set_kernel_arg(kernel_count_BCs, 3, sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_count_BCs, 4, sizeof(cl_mem), (void *)&dev_levibeg);
+      ezcl_set_kernel_arg(kernel_count_BCs, 5, sizeof(cl_mem), (void *)&dev_leviend);
+      ezcl_set_kernel_arg(kernel_count_BCs, 6, sizeof(cl_mem), (void *)&dev_levjbeg);
+      ezcl_set_kernel_arg(kernel_count_BCs, 7, sizeof(cl_mem), (void *)&dev_levjend);
+      ezcl_set_kernel_arg(kernel_count_BCs, 8, sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_count_BCs, 9, shared_spd_sum_int, 0);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_count_BCs, 1, NULL, &global_work_size, &local_work_size, &count_BCs_stage1_event);
+
+      // Only needed when the first stage left more than one partial result.
+      if (block_size > 1) {
+         ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 0, sizeof(cl_int), (void *)&block_size_arg);
+         ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+         ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 2, shared_spd_sum_int, 0);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_sum_int_stage2of2, 1, NULL, &local_work_size, &local_work_size, &count_BCs_stage2_event);
+      }
+
+      // Blocking read (CL_TRUE) of the first scratch element into bcount.
+      ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, 1*sizeof(cl_int), &bcount, NULL);
+
+      //printf("DEBUG -- bcount is %d\n",bcount);
+      //state->gpu_time_read += ezcl_timer_calc(&start_read_event, &start_read_event);
+
+      ezcl_device_memory_delete(dev_ioffset);
+
+      gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage1_event, &count_BCs_stage1_event);
+      if (block_size > 1) {
+         gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage2_event, &count_BCs_stage2_event);
+      }
+
+   }
+
+   return(bcount);
+}
+#endif
+
+// Request storage for the "i", "j" and "level" index arrays, ncells ints
+// each, from mesh_memory.  The arrays are tagged RESTART_DATA, or
+// LOAD_BALANCE_MEMORY when built with HAVE_J7 and running in parallel.
+void Mesh::allocate(size_t ncells)
+{
+   int flags = RESTART_DATA;
+#ifdef HAVE_J7
+   if (parallel) {
+      flags = LOAD_BALANCE_MEMORY;
+   }
+#endif
+
+   i = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "i", flags);
+
+   j = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "j", flags);
+
+   level = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "level", flags);
+}
+
+
+// Grow every array managed by mesh_memory to new_ncells entries, but only
+// when that is larger than the size currently recorded for the i array.
+void Mesh::resize(size_t new_ncells)
+{
+   const size_t allocated = mesh_memory.get_memory_size(i);
+
+   // Already big enough: leave the arrays untouched.
+   if (new_ncells <= allocated) {
+      return;
+   }
+
+   mesh_memory.memory_realloc_all(new_ncells);
+}
+
+// Re-fetch the member array pointers from mesh_memory by their registered
+// names.
+void Mesh::memory_reset_ptrs(void)
+{
+   // Cell index, level and type arrays
+   i = (int *)mesh_memory.get_memory_ptr("i");
+   j = (int *)mesh_memory.get_memory_ptr("j");
+   level = (int *)mesh_memory.get_memory_ptr("level");
+   celltype = (int *)mesh_memory.get_memory_ptr("celltype");
+
+   // Neighbor arrays
+   nlft = (int *)mesh_memory.get_memory_ptr("nlft");
+   nrht = (int *)mesh_memory.get_memory_ptr("nrht");
+   nbot = (int *)mesh_memory.get_memory_ptr("nbot");
+   ntop = (int *)mesh_memory.get_memory_ptr("ntop");
+}
+
+// Throw away the device copies of level, i, j and celltype and allocate
+// fresh buffers holding ncells*mem_factor cl_int entries each.  The old
+// contents are not copied.  Without HAVE_OPENCL this is a no-op.
+void Mesh::resize_old_device_memory(size_t ncells)
+{
+#ifdef HAVE_OPENCL
+   // Release the existing buffers first ...
+   ezcl_device_memory_delete(dev_level);
+   ezcl_device_memory_delete(dev_i);
+   ezcl_device_memory_delete(dev_j);
+   ezcl_device_memory_delete(dev_celltype);
+
+   // ... then allocate the replacements, padded by mem_factor.
+   const int padded_ncells = (int)((float)ncells*mem_factor);
+   size_t mem_request = padded_ncells;
+
+   dev_level = ezcl_malloc(NULL, const_cast<char *>("dev_level"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_i = ezcl_malloc(NULL, const_cast<char *>("dev_i"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_j = ezcl_malloc(NULL, const_cast<char *>("dev_j"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_celltype = ezcl_malloc(NULL, const_cast<char *>("dev_celltype"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+#else
+   // Never executed; only references ncells to silence the unused-parameter warning.
+   if (1 == 2) {
+      printf("DEBUG -- ncells is %lu\n",ncells);
+   }
+#endif
+}
+// Print a summary of the mesh object to stdout: dimensionality, parallel
+// layout, cell counts, and for every device buffer (OpenCL builds only) and
+// host array its address, element count and element size.
+//
+// The format strings use %ld and %p, so every matching argument is cast to
+// long or void * explicitly; passing size_t / sizeof results or typed
+// pointers straight through the varargs list is a type mismatch on
+// platforms where long and size_t differ in width.
+void Mesh::print_object_info(void)
+{
+   printf(" ---- Mesh object info -----\n");
+   printf("Dimensionality : %d\n",ndim);
+   printf("Parallel info  : mype %d numpe %d noffset %d parallel %d\n",mype,numpe,noffset,parallel);
+   printf("Sizes          : ncells %ld ncells_ghost %ld\n\n",(long)ncells,(long)ncells_ghost);
+#ifdef HAVE_OPENCL
+   int num_elements, elsize;
+
+   num_elements = ezcl_get_device_mem_nelements(dev_celltype);
+   elsize = ezcl_get_device_mem_elsize(dev_celltype);
+   printf("dev_celltype     ptr : %p nelements %d elsize %d\n",(void *)dev_celltype,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_level);
+   elsize = ezcl_get_device_mem_elsize(dev_level);
+   printf("dev_level        ptr : %p nelements %d elsize %d\n",(void *)dev_level,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_i);
+   elsize = ezcl_get_device_mem_elsize(dev_i);
+   printf("dev_i            ptr : %p nelements %d elsize %d\n",(void *)dev_i,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_j);
+   elsize = ezcl_get_device_mem_elsize(dev_j);
+   printf("dev_j            ptr : %p nelements %d elsize %d\n",(void *)dev_j,num_elements,elsize);
+
+   num_elements = ezcl_get_device_mem_nelements(dev_nlft);
+   elsize = ezcl_get_device_mem_elsize(dev_nlft);
+   printf("dev_nlft         ptr : %p nelements %d elsize %d\n",(void *)dev_nlft,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_nrht);
+   elsize = ezcl_get_device_mem_elsize(dev_nrht);
+   printf("dev_nrht         ptr : %p nelements %d elsize %d\n",(void *)dev_nrht,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_nbot);
+   elsize = ezcl_get_device_mem_elsize(dev_nbot);
+   printf("dev_nbot         ptr : %p nelements %d elsize %d\n",(void *)dev_nbot,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_ntop);
+   elsize = ezcl_get_device_mem_elsize(dev_ntop);
+   printf("dev_ntop         ptr : %p nelements %d elsize %d\n",(void *)dev_ntop,num_elements,elsize);
+#endif
+   // Host-side arrays managed by mesh_memory.
+   printf("vector celltype  ptr : %p nelements %ld elsize %ld\n",(void *)celltype,(long)mesh_memory.get_memory_size(celltype),(long)sizeof(celltype[0]));
+   printf("vector level     ptr : %p nelements %ld elsize %ld\n",(void *)level,   (long)mesh_memory.get_memory_size(level),   (long)sizeof(level[0]));
+   printf("vector i         ptr : %p nelements %ld elsize %ld\n",(void *)i,       (long)mesh_memory.get_memory_size(i),       (long)sizeof(i[0]));
+   printf("vector j         ptr : %p nelements %ld elsize %ld\n",(void *)j,       (long)mesh_memory.get_memory_size(j),       (long)sizeof(j[0]));
+
+   printf("vector nlft      ptr : %p nelements %ld elsize %ld\n",(void *)nlft,    (long)mesh_memory.get_memory_size(nlft),    (long)sizeof(nlft[0]));
+   printf("vector nrht      ptr : %p nelements %ld elsize %ld\n",(void *)nrht,    (long)mesh_memory.get_memory_size(nrht),    (long)sizeof(nrht[0]));
+   printf("vector nbot      ptr : %p nelements %ld elsize %ld\n",(void *)nbot,    (long)mesh_memory.get_memory_size(nbot),    (long)sizeof(nbot[0]));
+   printf("vector ntop      ptr : %p nelements %ld elsize %ld\n",(void *)ntop,    (long)mesh_memory.get_memory_size(ntop),    (long)sizeof(ntop[0]));
+}
+
+
+/**
+ * Choose the order in which the four children (SW, SE, NW, NE) of cell ic
+ * are emitted when that cell is refined.
+ *
+ * When localStencil is off the children are always emitted in Z-order
+ * (SW, SE, NW, NE).  When it is on, the order is picked from the position of
+ * the cells immediately before and after ic in the cell list relative to ic,
+ * using whichever of __OLD_STENCIL__ / __NEW_STENCIL__ was compiled in.
+ * NOTE(review): if localStencil is set and neither macro is defined, order[]
+ * is left untouched.
+ *
+ * @param order        output; receives the four child positions in order
+ * @param ic           index of the cell being refined
+ * @param ifirst,jfirst,level_first
+ *                     i, j and level used as "previous cell" when ic == 0
+ * @param ilast,jlast,level_last
+ *                     i, j and level used as "next cell" when ic == ncells-1
+ * @param i_old,j_old,level_old
+ *                     per-cell i, j and level arrays, read at ic-1, ic, ic+1
+ */
+void Mesh::set_refinement_order(int order[4], int ic, int ifirst, int ilast, int jfirst, int jlast,
+                                int level_first, int level_last, int *i_old, int *j_old, int *level_old)
+{
+            if (localStencil) {
+               //  Store the coordinates of the cells before and after this one on
+               //  the space-filling curve index.
+
+#ifdef __OLD_STENCIL__
+               // NOTE(review): this variant reads the member arrays i[] and j[]
+               // rather than i_old[] / j_old[] -- confirm that is intended.
+               spatial_t  nx[3],  //  x-coordinates of cells.
+                          ny[3];  //  y-coordinates of cells.
+               if (ic != 0) {
+                  nx[0] = lev_deltax[level_old[ic-1]] * (spatial_t)i[ic-1];
+                  ny[0] = lev_deltay[level_old[ic-1]] * (spatial_t)j[ic-1];
+               } else {
+                  nx[0] = lev_deltax[level_first] * (spatial_t)ifirst;
+                  ny[0] = lev_deltay[level_first] * (spatial_t)jfirst;
+               }
+               nx[1] = lev_deltax[level_old[ic  ]] * (spatial_t)i[ic  ];
+               ny[1] = lev_deltay[level_old[ic  ]] * (spatial_t)j[ic  ];
+               if (ic != ncells-1) {
+                  nx[2] = lev_deltax[level_old[ic+1]] * (spatial_t)i[ic+1];
+                  ny[2] = lev_deltay[level_old[ic+1]] * (spatial_t)j[ic+1];
+               } else {
+                  nx[2] = lev_deltax[level_last] * (spatial_t)ilast;
+                  ny[2] = lev_deltay[level_last] * (spatial_t)jlast;
+               }
+
+               //  Figure out relative orientation of the neighboring cells.  We are
+               //  are aided in this because the Hilbert curve only has six possible
+               //  ways across the cell:  four Ls and two straight lines.  Then
+               //  refine the cell according to the relative orientation and order
+               //  according to the four-point Hilbert stencil.
+               if      (nx[0] < nx[1] and ny[2] < ny[1])   //  southwest L, forward order
+               {  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+               else if (nx[2] < nx[1] and ny[0] < ny[1])   //  southwest L, reverse order
+               {  order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
+               else if (nx[0] > nx[1] and ny[2] < ny[1])   //  southeast L, forward order
+               {  order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
+               else if (nx[2] > nx[1] and ny[0] < ny[1])   //  southeast L, reverse order
+               {  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+               else if (nx[0] > nx[1] and ny[2] > ny[1])   //  northeast L, forward order
+               {  order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE; }
+               else if (nx[2] > nx[1] and ny[0] > ny[1])   //  northeast L, reverse order
+               {  order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
+               else if (nx[0] < nx[1] and ny[2] > ny[1])   //  northwest L, forward order
+               {  order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
+               else if (nx[2] < nx[1] and ny[0] > ny[1])   //  northwest L, reverse order
+               {  order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW; }
+               else if (nx[0] > nx[1] and nx[1] > nx[2])   //  straight horizontal, forward order
+               {  order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW; }
+               else if (nx[0] < nx[1] and nx[1] < nx[2])   //  straight horizontal, reverse order
+               {  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+               else if (ny[0] > ny[1] and ny[1] > ny[2])   //  straight vertical, forward order
+               {  order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
+               else if (ny[0] < ny[1] and ny[1] < ny[2])   //  straight vertical, reverse order
+               {  order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
+               else                                        //  other, default to z-order
+               {  order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
+#endif
+
+#ifdef __NEW_STENCIL__
+               int ir[3],   // First i index at finest level of the mesh
+                   jr[3];   // First j index at finest level of the mesh
+               // Cell's Radius at the Finest level of the mesh
+
+               int crf = IPOW2(levmx-level_old[ic]);
+
+               // Levels of the cells before and after this one.  At either end of
+               // the list they come from level_first / level_last, exactly like the
+               // coordinates below, so level_old[] is never read at ic-1 when
+               // ic == 0 or at ic+1 when ic is the last cell.
+               int level_in, level_out;
+
+               if (ic != 0) {
+                  level_in = level_old[ic - 1];
+                  ir[0] = i_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
+                  jr[0] = j_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
+               } else {
+                  //printf("%d cell %d is a first\n",mype,ic);
+                  level_in = level_first;
+                  ir[0] = ifirst * IPOW2(levmx-level_first);
+                  jr[0] = jfirst * IPOW2(levmx-level_first);
+               }
+               ir[1] = i_old[ic    ] * IPOW2(levmx-level_old[ic    ]);
+               jr[1] = j_old[ic    ] * IPOW2(levmx-level_old[ic    ]);
+               if (ic != (int)ncells-1) {
+                  level_out = level_old[ic + 1];
+                  ir[2] = i_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
+                  jr[2] = j_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
+               } else {
+                  //printf("%d cell %d is a last\n",mype,ic);
+                  level_out = level_last;
+                  ir[2] = ilast * IPOW2(levmx-level_last);
+                  jr[2] = jlast * IPOW2(levmx-level_last);
+               }
+               //if (parallel) fprintf(fp,"%d: DEBUG rezone top boundary -- ic %d global %d noffset %d nc %d i %d j %d level %d\n",mype,ic,ic+noffset,noffset,nc,i[nc],j[nc],level[nc]);
+
+               // Offsets from the previous (in) and next (out) cell to this cell,
+               // measured in finest-level index units.
+               int dir_in  = ir[1] - ir[0];
+               int dir_out = ir[1] - ir[2];
+               int djr_in  = jr[1] - jr[0];
+               int djr_out = jr[1] - jr[2];
+
+               // 'X' means "not classified"; it falls through to the default
+               // stencil at the end.
+               char  in_direction = 'X';
+               char out_direction = 'X';
+
+               // Left In
+               if( (djr_in == 0 && (dir_in == crf*HALF || dir_in == crf || dir_in == crf*TWO)) || (djr_in == -crf*HALF && dir_in == crf*HALF) || (djr_in == crf && dir_in == crf*TWO) ) {
+                  in_direction = 'L';
+               }
+               // Bottom In
+               else if( (dir_in == 0 && (djr_in == crf*HALF || djr_in == crf || djr_in == crf*TWO)) || (dir_in == -crf*HALF && djr_in == crf*HALF) || (dir_in == crf && djr_in == crf*TWO) ) {
+                  in_direction = 'B';
+               }
+               // Right In
+               else if( (dir_in == -crf && (djr_in == -crf*HALF || djr_in == 0 || (djr_in == crf && level_in < level_old[ic]))) ) {
+                  in_direction = 'R';
+               }
+               // Top In
+               else if( (djr_in == -crf && (dir_in == -crf*HALF || dir_in == 0 || (dir_in == crf && level_in < level_old[ic]))) ) {
+                  in_direction = 'T';
+               }
+               // Further from the left
+               else if( dir_in > 0 && djr_in == 0 ) {
+                  in_direction = 'L';
+               }
+               // Further from the right
+               else if( dir_in < 0 && djr_in == 0 ) {
+                  in_direction = 'R';
+               }
+               // Further from the bottom
+               else if( djr_in > 0 && dir_in == 0 ) {
+                  in_direction = 'B';
+               }
+               // Further from the top
+               else if( djr_in < 0 && dir_in == 0 ) {
+                  in_direction = 'T';
+               }
+               // SW in; 'M'
+               else if( dir_in > 0 && djr_in > 0) {
+                  in_direction = 'M';
+               }
+               // NW in; 'W'
+               else if( dir_in > 0 && djr_in < 0) {
+                  in_direction = 'W';
+               }
+               // SE in; 'F'
+               else if( dir_in < 0 && djr_in > 0) {
+                  in_direction = 'F';
+               }
+               // NE in; 'E'
+               else if( dir_in < 0 && djr_in < 0) {
+                  in_direction = 'E';
+               }
+
+
+               // Left Out
+               if( (djr_out == 0 && (dir_out == crf*HALF || dir_out == crf || dir_out == crf*TWO)) || (djr_out == -crf*HALF && dir_out == crf*HALF) || (djr_out == crf && dir_out == crf*TWO) ) {
+                  out_direction = 'L';
+               }
+               // Bottom Out
+               else if( (dir_out == 0 && (djr_out == crf*HALF || djr_out == crf || djr_out == crf*TWO)) || (dir_out == -crf*HALF && djr_out == crf*HALF) || (dir_out == crf && djr_out == crf*TWO) ) {
+                  out_direction = 'B';
+               }
+               // Right Out
+               else if( (dir_out == -crf && (djr_out == -crf*HALF || djr_out == 0 || (djr_out == crf && level_out < level_old[ic]))) ) {
+                  out_direction = 'R';
+               }
+               // Top Out
+               else if( (djr_out == -crf && (dir_out == -crf*HALF || dir_out == 0 || (dir_out == crf && level_out < level_old[ic]))) ) {
+                  out_direction = 'T';
+               }
+               // Further from the left
+               else if( dir_out > 0 && djr_out == 0 ) {
+                  out_direction = 'L';
+               }
+               // Further from the right
+               else if( dir_out < 0 && djr_out == 0 ) {
+                  out_direction = 'R';
+               }
+               // Further from the bottom
+               else if( djr_out > 0 && dir_out == 0 ) {
+                  out_direction = 'B';
+               }
+               // Further from the top
+               else if( djr_out < 0 && dir_out == 0 ) {
+                  out_direction = 'T';
+               }
+               // SW out; 'M'
+               else if( dir_out > 0 && djr_out > 0) {
+                  out_direction = 'M';
+               }
+               // NW out; 'W'
+               else if( dir_out > 0 && djr_out < 0) {
+                  out_direction = 'W';
+               }
+               // SE out; 'F'
+               else if( dir_out < 0 && djr_out > 0) {
+                  out_direction = 'F';
+               }
+               // NE out; 'E'
+               else if( dir_out < 0 && djr_out < 0) {
+                  out_direction = 'E';
+               }
+
+               // Set the Stencil
+               if(in_direction == 'L' && (out_direction == 'B' || out_direction == 'R' || out_direction == 'F')) {
+                  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+               }
+               else if(in_direction == 'L' && (out_direction == 'T' || out_direction == 'W' )) {
+                  order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+               }
+               else if(in_direction == 'L' && out_direction == 'M') {
+                  order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
+               }
+               else if(in_direction == 'L' && out_direction == 'E') {
+                  order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
+               }
+
+               else if(in_direction == 'B' && (out_direction == 'R' || out_direction == 'F' )) {
+                  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+               }
+               else if(in_direction == 'B' && (out_direction == 'L' || out_direction == 'T' || out_direction == 'W' )) {
+                  order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+               }
+               else if(in_direction == 'B' && out_direction == 'M') {
+                  order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
+               }
+               else if(in_direction == 'B' && out_direction == 'E') {
+                  order[0] = SW; order[1] = NW; order[2] = SE; order[3] = NE;
+               }
+
+               else if(in_direction == 'R' && (out_direction == 'T' || out_direction == 'L' || out_direction == 'W' )) {
+                  order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+               }
+               else if(in_direction == 'R' && (out_direction == 'B' || out_direction == 'F' )) {
+                  order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+               }
+               else if(in_direction == 'R' && out_direction == 'M') {
+                  order[0] = NE; order[1] = NW; order[2] = SE; order[3] = SW;
+               }
+               else if(in_direction == 'R' && out_direction == 'E') {
+                  order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
+               }
+
+               else if(in_direction == 'T' && (out_direction == 'L' || out_direction == 'W' )) {
+                  order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+               }
+               else if(in_direction == 'T' && (out_direction == 'R' || out_direction == 'B' || out_direction == 'F' )) {
+                  order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+               }
+               else if(in_direction == 'T' && out_direction == 'M') {
+                  order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
+               }
+               else if(in_direction == 'T' && out_direction == 'E') {
+                  order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
+               }
+
+               else if(in_direction == 'M' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
+                  order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+               }
+               else if(in_direction == 'M' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
+                  order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+               }
+               else if(in_direction == 'M' && out_direction == 'E') {
+                  order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
+               }
+
+               else if(in_direction == 'W' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
+                  order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
+               }
+               else if(in_direction == 'W' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
+                  order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
+               }
+               else if(in_direction == 'W' && out_direction == 'F') {
+                  order[0] = NW; order[1] = NE; order[2] = SW; order[3] = SE;
+               }
+
+               else if(in_direction == 'F' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
+                  order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
+               }
+               else if(in_direction == 'F' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
+                  order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
+               }
+               else if(in_direction == 'F' && out_direction == 'W') {
+                  order[0] = SE; order[1] = NE; order[2] = SW; order[3] = NW;
+               }
+
+               else if(in_direction == 'E' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
+                  order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+               }
+               else if(in_direction == 'E' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
+                  order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+               }
+               else if(in_direction == 'E' && out_direction == 'M') {
+                  order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
+               }
+
+               else { // Default to a knot 
+                  order[0] = NW; order[1] = SE; order[2] = SW; order[3] = NE;
+                  if (do_stencil_warning) {
+                     printf("Nonlocal case for the stencil.\n");
+                  }
+               }
+               //  Determine the relative orientation of the neighboring cells.
+               //  There are 12 possible ways across the cell: 4 Ls and 2 straight
+               //  lines, each with 2 directions of traversal.
+               //  Then the cell is refined and ordered according to the relative
+               //  orientation and four-point Hilbert stencil.
+
+               // XXX NOTE that the four-point stencil varies depending upon
+               // the starting and ending point of the global Hilbert curve.
+               // The stencil applied here assumes the start at (0,0) and the end
+               // at (0,y_max). XXX WRONG
+#endif
+
+            }  //  End local stencil version
+            else //  Use Z-ordering for the curve.
+            {  order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
+
+}
+
+// Rebuild the x-face and y-face lists from the neighbor arrays.
+//
+// For every cell with a distinct right (top) neighbor one x-face (y-face) is
+// recorded with its level and i/j index; a second face is added when the
+// neighbor is finer and has its own distinct top (right) neighbor.  The face
+// indices are then shifted so that the smallest index on each level is zero.
+// The amount removed is saved in the [ij][xy]adjust arrays, and the per-level
+// min/max arrays are left holding the shifted range.
+void Mesh::calc_face_list(void)
+{
+   // ---------------- x-faces: reset storage ----------------
+   xface_i.clear();
+   xface_j.clear();
+   xface_level.clear();
+
+   ixmin_level.clear();
+   ixmin_level.resize(levmx+1,  9999999);
+   ixmax_level.clear();
+   ixmax_level.resize(levmx+1, -9999999);
+   jxmin_level.clear();
+   jxmin_level.resize(levmx+1,  9999999);
+   jxmax_level.clear();
+   jxmax_level.resize(levmx+1, -9999999);
+
+   ixadjust.clear();
+   ixadjust.resize(levmx+1);
+   jxadjust.clear();
+   jxadjust.resize(levmx+1);
+
+   // ---------------- x-faces: collect ----------------
+   int face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++) {
+      const int right = nrht[cell];
+      if (right == cell) continue;   // no right face for this cell
+
+      // A coarser neighbor has its indices doubled to match this cell's level.
+      const bool right_is_coarser = (level[right] < level[cell]);
+      const int ifactor = right_is_coarser ? 2 : 1;
+
+      // Have right face
+      xface_level.push_back(MAX(level[cell],level[right]));
+      xface_i.push_back(i[right]*ifactor);
+      if (right_is_coarser && is_upper(j[cell]) ) {
+         xface_j.push_back(j[right]*ifactor+1);
+      }
+      else {
+         xface_j.push_back(j[right]*ifactor);
+      }
+      face_count++;
+
+      // Finer neighbor: its top neighbor shares this cell's right side too.
+      if (level[right] > level[cell] && is_lower(j[right]) ) {
+         const int right_top = ntop[right];
+         if (right_top != right) {
+            xface_level.push_back(MAX(level[cell],level[right_top]));
+            xface_i.push_back(i[right_top]*ifactor);
+            xface_j.push_back(j[right_top]*ifactor);
+            face_count++;
+         }
+      }
+   }
+   nxface = face_count;
+
+   // ---------------- y-faces: reset storage ----------------
+   yface_i.clear();
+   yface_j.clear();
+   yface_level.clear();
+
+   iymin_level.clear();
+   iymin_level.resize(levmx+1,  9999999);
+   iymax_level.clear();
+   iymax_level.resize(levmx+1, -9999999);
+   jymin_level.clear();
+   jymin_level.resize(levmx+1,  9999999);
+   jymax_level.clear();
+   jymax_level.resize(levmx+1, -9999999);
+
+   iyadjust.clear();
+   iyadjust.resize(levmx+1);
+   jyadjust.clear();
+   jyadjust.resize(levmx+1);
+
+   // ---------------- y-faces: collect ----------------
+   face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++) {
+      const int top = ntop[cell];
+      if (top == cell) continue;   // no top face for this cell
+
+      const bool top_is_coarser = (level[top] < level[cell]);
+      const int ifactor = top_is_coarser ? 2 : 1;
+
+      // Have top face
+      yface_level.push_back(MAX(level[cell],level[top]));
+      yface_j.push_back(j[top]*ifactor);
+      if (top_is_coarser && is_upper(i[cell]) ) {
+         yface_i.push_back(i[top]*ifactor+1);
+      }
+      else {
+         yface_i.push_back(i[top]*ifactor);
+      }
+      face_count++;
+
+      // Finer neighbor: its right neighbor shares this cell's top side too.
+      if (level[top] > level[cell] && is_lower(i[top]) ) {
+         const int top_right = nrht[top];
+         if (top_right != top) {
+            yface_level.push_back(MAX(level[cell],level[top_right]));
+            yface_j.push_back(j[top_right]*ifactor);
+            yface_i.push_back(i[top_right]*ifactor);
+            face_count++;
+         }
+      }
+   }
+   nyface = face_count;
+
+   // ---------------- x-faces: per-level index range ----------------
+   for (int face = 0; face < nxface; face++) {
+      const int lev = xface_level[face];
+      const int fi  = xface_i[face];
+      const int fj  = xface_j[face];
+
+      if (fi < ixmin_level[lev]) ixmin_level[lev] = fi;
+      if (fi > ixmax_level[lev]) ixmax_level[lev] = fi;
+
+      if (fj < jxmin_level[lev]) jxmin_level[lev] = fj;
+      if (fj > jxmax_level[lev]) jxmax_level[lev] = fj;
+   }
+
+   // Shift every x-face so the smallest index on its level becomes zero.
+   for (int face = 0; face < nxface; face++) {
+      const int lev = xface_level[face];
+      if (ixmin_level[lev] <= ixmax_level[lev]) {
+         xface_i[face] -= ixmin_level[lev];
+         xface_j[face] -= jxmin_level[lev];
+      }
+   }
+
+   // Remember the shift and rebase the recorded range to start at zero.
+   for (int lev = 0; lev <= levmx; lev++) {
+      ixadjust[lev]     = ixmin_level[lev];
+      ixmax_level[lev] -= ixmin_level[lev];
+      ixmin_level[lev]  = 0;
+
+      jxadjust[lev]     = jxmin_level[lev];
+      jxmax_level[lev] -= jxmin_level[lev];
+      jxmin_level[lev]  = 0;
+   }
+
+   // ---------------- y-faces: per-level index range ----------------
+   for (int face = 0; face < nyface; face++) {
+      const int lev = yface_level[face];
+      const int fi  = yface_i[face];
+      const int fj  = yface_j[face];
+
+      if (fi < iymin_level[lev]) iymin_level[lev] = fi;
+      if (fi > iymax_level[lev]) iymax_level[lev] = fi;
+
+      if (fj < jymin_level[lev]) jymin_level[lev] = fj;
+      if (fj > jymax_level[lev]) jymax_level[lev] = fj;
+   }
+
+   // Shift every y-face so the smallest index on its level becomes zero.
+   for (int face = 0; face < nyface; face++) {
+      const int lev = yface_level[face];
+      if (iymin_level[lev] <= iymax_level[lev]) {
+         yface_i[face] -= iymin_level[lev];
+         yface_j[face] -= jymin_level[lev];
+      }
+   }
+
+   // Remember the shift and rebase the recorded range to start at zero.
+   for (int lev = 0; lev <= levmx; lev++) {
+      iyadjust[lev]     = iymin_level[lev];
+      iymax_level[lev] -= iymin_level[lev];
+      iymin_level[lev]  = 0;
+
+      jyadjust[lev]     = jymin_level[lev];
+      jymax_level[lev] -= jymin_level[lev];
+      jymin_level[lev]  = 0;
+   }
+
+}
+
+/*
+ * Rebuilds the x-face and y-face lists together with the face-to-cell maps
+ * (map_xface2cell_lower/upper and map_yface2cell_lower/upper).
+ *
+ * A face is recorded for every cell whose right (nrht) or top (ntop) neighbor
+ * is a different cell. Each face stores its level (the larger of the two cell
+ * levels) and an (i,j) position derived from the neighbor's indices; the
+ * indices are doubled when the neighbor has the smaller level number
+ * (presumably to express them in the finer cell's index space). When the
+ * neighbor has the larger level number and passes is_lower(), a second face
+ * is added toward that neighbor's own top (x faces) or right (y faces)
+ * neighbor.
+ *
+ * Afterwards the face indices are shifted level by level so the smallest i
+ * and j on each level become zero. The offsets removed are kept in
+ * ixadjust/jxadjust and iyadjust/jyadjust, the *min_level arrays are reset to
+ * zero and the *max_level arrays hold the shifted maxima.
+ */
+void Mesh::calc_face_list_wmap(void)
+{
+   // ---------------- x faces ----------------
+   map_xface2cell_lower.clear();
+   map_xface2cell_upper.clear();
+   xface_level.clear();
+   xface_i.clear();
+   xface_j.clear();
+
+   // Bounds start at sentinels so that the first face on a level replaces them
+   ixmin_level.clear();
+   jxmin_level.clear();
+   ixmax_level.clear();
+   jxmax_level.clear();
+   ixmin_level.resize(levmx+1,  9999999);
+   jxmin_level.resize(levmx+1,  9999999);
+   ixmax_level.resize(levmx+1, -9999999);
+   jxmax_level.resize(levmx+1, -9999999);
+
+   ixadjust.clear();
+   jxadjust.clear();
+   ixadjust.resize(levmx+1);
+   jxadjust.resize(levmx+1);
+
+   int face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int rht = nrht[cell];
+      // A cell that is its own right neighbor contributes no right face
+      if (rht == cell) continue;
+
+      bool rht_lower_level = (level[rht] < level[cell]);
+      int scale = rht_lower_level ? 2 : 1;
+
+      // The j position gets an extra +1 when the neighbor has the smaller
+      // level number and this cell passes is_upper()
+      int jpos = j[rht]*scale;
+      if (rht_lower_level && is_upper(j[cell])) jpos += 1;
+
+      map_xface2cell_lower.push_back(cell);
+      map_xface2cell_upper.push_back(rht);
+      xface_level.push_back(MAX(level[cell],level[rht]));
+      xface_i.push_back(i[rht]*scale);
+      xface_j.push_back(jpos);
+      face_count++;
+
+      // Possible second face toward the cell on top of the right neighbor
+      if (level[rht] <= level[cell] || !is_lower(j[rht])) continue;
+
+      int top_of_rht = ntop[rht];
+      if (top_of_rht == rht) continue;
+
+      map_xface2cell_lower.push_back(cell);
+      map_xface2cell_upper.push_back(top_of_rht);
+      xface_level.push_back(MAX(level[cell],level[top_of_rht]));
+      xface_i.push_back(i[top_of_rht]*scale);
+      xface_j.push_back(j[top_of_rht]*scale);
+      face_count++;
+   }
+   nxface = face_count;
+
+   // Per-level extents of the x-face indices
+   for (int ifc = 0; ifc < nxface; ifc++){
+      int lev  = xface_level[ifc];
+      int ival = xface_i[ifc];
+      int jval = xface_j[ifc];
+
+      if (ival < ixmin_level[lev]) ixmin_level[lev] = ival;
+      if (ival > ixmax_level[lev]) ixmax_level[lev] = ival;
+      if (jval < jxmin_level[lev]) jxmin_level[lev] = jval;
+      if (jval > jxmax_level[lev]) jxmax_level[lev] = jval;
+   }
+
+   // Shift every x face so each level starts at (0,0)
+   for (int ifc = 0; ifc < nxface; ifc++){
+      int lev = xface_level[ifc];
+      if (ixmin_level[lev] <= ixmax_level[lev]) {
+         xface_i[ifc] -= ixmin_level[lev];
+         xface_j[ifc] -= jxmin_level[lev];
+      }
+   }
+
+   // Remember the offsets and express the bounds relative to them
+   for (int lev = 0; lev <= levmx; lev++){
+      int ilo = ixmin_level[lev];
+      int jlo = jxmin_level[lev];
+      ixadjust[lev] = ilo;
+      jxadjust[lev] = jlo;
+      ixmax_level[lev] -= ilo;
+      jxmax_level[lev] -= jlo;
+      ixmin_level[lev] = 0;
+      jxmin_level[lev] = 0;
+   }
+
+   // ---------------- y faces ----------------
+   map_yface2cell_lower.clear();
+   map_yface2cell_upper.clear();
+   yface_level.clear();
+   yface_i.clear();
+   yface_j.clear();
+
+   iymin_level.clear();
+   jymin_level.clear();
+   iymax_level.clear();
+   jymax_level.clear();
+   iymin_level.resize(levmx+1,  9999999);
+   jymin_level.resize(levmx+1,  9999999);
+   iymax_level.resize(levmx+1, -9999999);
+   jymax_level.resize(levmx+1, -9999999);
+
+   iyadjust.clear();
+   jyadjust.clear();
+   iyadjust.resize(levmx+1);
+   jyadjust.resize(levmx+1);
+
+   face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int top = ntop[cell];
+      // A cell that is its own top neighbor contributes no top face
+      if (top == cell) continue;
+
+      bool top_lower_level = (level[top] < level[cell]);
+      int scale = top_lower_level ? 2 : 1;
+
+      int ipos = i[top]*scale;
+      if (top_lower_level && is_upper(i[cell])) ipos += 1;
+
+      map_yface2cell_lower.push_back(cell);
+      map_yface2cell_upper.push_back(top);
+      yface_level.push_back(MAX(level[cell],level[top]));
+      yface_j.push_back(j[top]*scale);
+      yface_i.push_back(ipos);
+      face_count++;
+
+      // Possible second face toward the cell to the right of the top neighbor
+      if (level[top] <= level[cell] || !is_lower(i[top])) continue;
+
+      int rht_of_top = nrht[top];
+      if (rht_of_top == top) continue;
+
+      map_yface2cell_lower.push_back(cell);
+      map_yface2cell_upper.push_back(rht_of_top);
+      yface_level.push_back(MAX(level[cell],level[rht_of_top]));
+      yface_j.push_back(j[rht_of_top]*scale);
+      yface_i.push_back(i[rht_of_top]*scale);
+      face_count++;
+   }
+   nyface = face_count;
+
+   // Per-level extents of the y-face indices
+   for (int ifc = 0; ifc < nyface; ifc++){
+      int lev  = yface_level[ifc];
+      int ival = yface_i[ifc];
+      int jval = yface_j[ifc];
+
+      if (ival < iymin_level[lev]) iymin_level[lev] = ival;
+      if (ival > iymax_level[lev]) iymax_level[lev] = ival;
+      if (jval < jymin_level[lev]) jymin_level[lev] = jval;
+      if (jval > jymax_level[lev]) jymax_level[lev] = jval;
+   }
+
+   // Shift every y face so each level starts at (0,0)
+   for (int ifc = 0; ifc < nyface; ifc++){
+      int lev = yface_level[ifc];
+      if (iymin_level[lev] <= iymax_level[lev]) {
+         yface_i[ifc] -= iymin_level[lev];
+         yface_j[ifc] -= jymin_level[lev];
+      }
+   }
+
+   for (int lev = 0; lev <= levmx; lev++){
+      int ilo = iymin_level[lev];
+      int jlo = jymin_level[lev];
+      iyadjust[lev] = ilo;
+      jyadjust[lev] = jlo;
+      iymax_level[lev] -= ilo;
+      jymax_level[lev] -= jlo;
+      iymin_level[lev] = 0;
+      jymin_level[lev] = 0;
+   }
+
+}
+
+/*
+ * Rebuilds the x-face and y-face lists with maps in both directions:
+ * face-to-cell (map_xface2cell_lower/upper, map_yface2cell_lower/upper) and
+ * cell-to-face (map_xcell2face_left1/left2/right1/right2 and
+ * map_ycell2face_bot1/bot2/top1/top2).
+ *
+ * Faces are generated exactly as in calc_face_list_wmap. While a cell's right
+ * (or top) faces are created their indices are stored in right1/right2 (or
+ * top1/top2). The left (or bottom) entries are then looked up from the
+ * neighbor's right (or top) entries. Cell-to-face entries that are never
+ * assigned remain -1.
+ *
+ * Finally the face indices are shifted level by level so the smallest i and j
+ * on each level become zero; the offsets go to ixadjust/jxadjust and
+ * iyadjust/jyadjust.
+ */
+void Mesh::calc_face_list_wbidirmap(void)
+{
+   // ---------------- x faces ----------------
+   map_xface2cell_lower.clear();
+   map_xface2cell_upper.clear();
+
+   map_xcell2face_left1.clear();
+   map_xcell2face_left2.clear();
+   map_xcell2face_right1.clear();
+   map_xcell2face_right2.clear();
+   map_xcell2face_left1.resize(ncells, -1);
+   map_xcell2face_left2.resize(ncells, -1);
+   map_xcell2face_right1.resize(ncells, -1);
+   map_xcell2face_right2.resize(ncells, -1);
+
+   xface_level.clear();
+   xface_i.clear();
+   xface_j.clear();
+
+   // Bounds start at sentinels so that the first face on a level replaces them
+   ixmin_level.clear();
+   jxmin_level.clear();
+   ixmax_level.clear();
+   jxmax_level.clear();
+   ixmin_level.resize(levmx+1,  9999999);
+   jxmin_level.resize(levmx+1,  9999999);
+   ixmax_level.resize(levmx+1, -9999999);
+   jxmax_level.resize(levmx+1, -9999999);
+
+   ixadjust.clear();
+   jxadjust.clear();
+   ixadjust.resize(levmx+1);
+   jxadjust.resize(levmx+1);
+
+   int face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int rht = nrht[cell];
+      // A cell that is its own right neighbor contributes no right face
+      if (rht == cell) continue;
+
+      bool rht_lower_level = (level[rht] < level[cell]);
+      int scale = rht_lower_level ? 2 : 1;
+
+      int jpos = j[rht]*scale;
+      if (rht_lower_level && is_upper(j[cell])) jpos += 1;
+
+      map_xface2cell_lower.push_back(cell);
+      map_xface2cell_upper.push_back(rht);
+      xface_level.push_back(MAX(level[cell],level[rht]));
+      xface_i.push_back(i[rht]*scale);
+      xface_j.push_back(jpos);
+      map_xcell2face_right1[cell] = face_count;
+      face_count++;
+
+      // Possible second face toward the cell on top of the right neighbor
+      if (level[rht] <= level[cell] || !is_lower(j[rht])) continue;
+
+      int top_of_rht = ntop[rht];
+      if (top_of_rht == rht) continue;
+
+      map_xface2cell_lower.push_back(cell);
+      map_xface2cell_upper.push_back(top_of_rht);
+      xface_level.push_back(MAX(level[cell],level[top_of_rht]));
+      xface_i.push_back(i[top_of_rht]*scale);
+      xface_j.push_back(j[top_of_rht]*scale);
+      map_xcell2face_right2[cell] = face_count;
+      face_count++;
+   }
+   nxface = face_count;
+
+   // Left faces of a cell are the right faces of its left neighbor(s)
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int lft = nlft[cell];
+      if (lft == cell) continue;
+
+      if (level[lft] < level[cell] && is_upper(j[cell])){
+         // Neighbor has the smaller level number and this cell passes
+         // is_upper(): use the neighbor's second right face
+         map_xcell2face_left1[cell] = map_xcell2face_right2[lft];
+         continue;
+      }
+
+      map_xcell2face_left1[cell] = map_xcell2face_right1[lft];
+      if (level[lft] > level[cell]){
+         // Neighbor has the larger level number: a second left face comes
+         // from the cell on top of that neighbor
+         map_xcell2face_left2[cell] = map_xcell2face_right1[ntop[lft]];
+      }
+   }
+
+   // Per-level extents of the x-face indices
+   for (int ifc = 0; ifc < nxface; ifc++){
+      int lev  = xface_level[ifc];
+      int ival = xface_i[ifc];
+      int jval = xface_j[ifc];
+
+      if (ival < ixmin_level[lev]) ixmin_level[lev] = ival;
+      if (ival > ixmax_level[lev]) ixmax_level[lev] = ival;
+      if (jval < jxmin_level[lev]) jxmin_level[lev] = jval;
+      if (jval > jxmax_level[lev]) jxmax_level[lev] = jval;
+   }
+
+   // Shift every x face so each level starts at (0,0)
+   for (int ifc = 0; ifc < nxface; ifc++){
+      int lev = xface_level[ifc];
+      if (ixmin_level[lev] <= ixmax_level[lev]) {
+         xface_i[ifc] -= ixmin_level[lev];
+         xface_j[ifc] -= jxmin_level[lev];
+      }
+   }
+
+   // Remember the offsets and express the bounds relative to them
+   for (int lev = 0; lev <= levmx; lev++){
+      int ilo = ixmin_level[lev];
+      int jlo = jxmin_level[lev];
+      ixadjust[lev] = ilo;
+      jxadjust[lev] = jlo;
+      ixmax_level[lev] -= ilo;
+      jxmax_level[lev] -= jlo;
+      ixmin_level[lev] = 0;
+      jxmin_level[lev] = 0;
+   }
+
+   // ---------------- y faces ----------------
+   map_yface2cell_lower.clear();
+   map_yface2cell_upper.clear();
+
+   map_ycell2face_bot1.clear();
+   map_ycell2face_bot2.clear();
+   map_ycell2face_top1.clear();
+   map_ycell2face_top2.clear();
+   map_ycell2face_bot1.resize(ncells, -1);
+   map_ycell2face_bot2.resize(ncells, -1);
+   map_ycell2face_top1.resize(ncells, -1);
+   map_ycell2face_top2.resize(ncells, -1);
+
+   yface_level.clear();
+   yface_i.clear();
+   yface_j.clear();
+
+   iymin_level.clear();
+   jymin_level.clear();
+   iymax_level.clear();
+   jymax_level.clear();
+   iymin_level.resize(levmx+1,  9999999);
+   jymin_level.resize(levmx+1,  9999999);
+   iymax_level.resize(levmx+1, -9999999);
+   jymax_level.resize(levmx+1, -9999999);
+
+   iyadjust.clear();
+   jyadjust.clear();
+   iyadjust.resize(levmx+1);
+   jyadjust.resize(levmx+1);
+
+   face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int top = ntop[cell];
+      // A cell that is its own top neighbor contributes no top face
+      if (top == cell) continue;
+
+      bool top_lower_level = (level[top] < level[cell]);
+      int scale = top_lower_level ? 2 : 1;
+
+      int ipos = i[top]*scale;
+      if (top_lower_level && is_upper(i[cell])) ipos += 1;
+
+      map_yface2cell_lower.push_back(cell);
+      map_yface2cell_upper.push_back(top);
+      yface_level.push_back(MAX(level[cell],level[top]));
+      yface_j.push_back(j[top]*scale);
+      yface_i.push_back(ipos);
+      map_ycell2face_top1[cell] = face_count;
+      face_count++;
+
+      // Possible second face toward the cell to the right of the top neighbor
+      if (level[top] <= level[cell] || !is_lower(i[top])) continue;
+
+      int rht_of_top = nrht[top];
+      if (rht_of_top == top) continue;
+
+      map_yface2cell_lower.push_back(cell);
+      map_yface2cell_upper.push_back(rht_of_top);
+      yface_level.push_back(MAX(level[cell],level[rht_of_top]));
+      yface_j.push_back(j[rht_of_top]*scale);
+      yface_i.push_back(i[rht_of_top]*scale);
+      map_ycell2face_top2[cell] = face_count;
+      face_count++;
+   }
+   nyface = face_count;
+
+   // Bottom faces of a cell are the top faces of its bottom neighbor(s)
+   for (int cell = 0; cell < (int)ncells; cell++){
+      int bot = nbot[cell];
+      if (bot == cell) continue;
+
+      if (level[bot] < level[cell] && is_upper(i[cell])){
+         map_ycell2face_bot1[cell] = map_ycell2face_top2[bot];
+         continue;
+      }
+
+      map_ycell2face_bot1[cell] = map_ycell2face_top1[bot];
+      if (level[bot] > level[cell]){
+         map_ycell2face_bot2[cell] = map_ycell2face_top1[nrht[bot]];
+      }
+   }
+
+   // Per-level extents of the y-face indices
+   for (int ifc = 0; ifc < nyface; ifc++){
+      int lev  = yface_level[ifc];
+      int ival = yface_i[ifc];
+      int jval = yface_j[ifc];
+
+      if (ival < iymin_level[lev]) iymin_level[lev] = ival;
+      if (ival > iymax_level[lev]) iymax_level[lev] = ival;
+      if (jval < jymin_level[lev]) jymin_level[lev] = jval;
+      if (jval > jymax_level[lev]) jymax_level[lev] = jval;
+   }
+
+   // Shift every y face so each level starts at (0,0)
+   for (int ifc = 0; ifc < nyface; ifc++){
+      int lev = yface_level[ifc];
+      if (iymin_level[lev] <= iymax_level[lev]) {
+         yface_i[ifc] -= iymin_level[lev];
+         yface_j[ifc] -= jymin_level[lev];
+      }
+   }
+
+   for (int lev = 0; lev <= levmx; lev++){
+      int ilo = iymin_level[lev];
+      int jlo = jymin_level[lev];
+      iyadjust[lev] = ilo;
+      jyadjust[lev] = jlo;
+      iymax_level[lev] -= ilo;
+      jymax_level[lev] -= jlo;
+      iymin_level[lev] = 0;
+      jymin_level[lev] = 0;
+   }
+
+}
+
+/*
+ * Returns a newly generated 2D int matrix, indexed [j][i], that marks the
+ * x faces of level lev: 1 where a face exists, -1 everywhere else.
+ *
+ * lev           level whose faces are flagged
+ * print_output  when true (or when DEBUG is set) the matrix is printed
+ *
+ * The matrix has jxmax_level[lev]+1 rows and ixmax_level[lev]+1 columns and
+ * comes from genmatrix(); the function does not release it.
+ */
+int **Mesh::get_xface_flag(int lev, bool print_output)
+{
+   int isize = ixmax_level[lev]+1;
+   int jsize = jxmax_level[lev]+1;
+
+   int **xface_flag = (int **)genmatrix(jsize, isize, sizeof(int));
+
+   // Start with no faces marked
+   for (int jj = 0; jj < jsize; jj++){
+      for (int ii = 0; ii < isize; ii++){
+         xface_flag[jj][ii] = -1;
+      }
+   }
+
+   // Mark each face that belongs to the requested level
+   for (int ifc = 0; ifc < nxface; ifc++){
+      if (xface_level[ifc] != lev) continue;
+      xface_flag[xface_j[ifc]][xface_i[ifc]] = 1;
+   }
+
+   if (DEBUG || print_output) {
+      printf("DEBUG -- x face_flag for level %d\n",lev);
+      printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",isize,jsize);
+
+      // Column header
+      printf("                           ");
+      for (int ii = 0; ii < isize; ii++){
+         printf(" %4d ",ii);
+      }
+      printf("\n");
+
+      // Rows are printed from the highest j down to zero
+      for (int jj = jsize-1; jj >= 0; jj--){
+         printf("DEBUG -- j  %4d:          ",jj);
+         for (int ii = 0; ii < isize; ii++){
+            if (xface_flag[jj][ii] < 0){
+               printf("      ");
+            } else {
+               printf(" %4d ", xface_flag[jj][ii]);
+            }
+         }
+         printf("\n");
+      }
+   }
+
+   return(xface_flag);
+}
+
+/*
+ * Returns a newly generated 2D int matrix, indexed [j][i], that marks the
+ * y faces of level lev: 1 where a face exists, -1 everywhere else.
+ *
+ * lev           level whose faces are flagged
+ * print_output  when true (or when DEBUG is set) the matrix is printed
+ *
+ * The matrix has jymax_level[lev]+1 rows and iymax_level[lev]+1 columns and
+ * comes from genmatrix(); the function does not release it.
+ */
+int **Mesh::get_yface_flag(int lev, bool print_output)
+{
+   int isize = iymax_level[lev]+1;
+   int jsize = jymax_level[lev]+1;
+
+   int **yface_flag = (int **)genmatrix(jsize, isize, sizeof(int));
+
+   // Start with no faces marked
+   for (int jj = 0; jj < jsize; jj++){
+      for (int ii = 0; ii < isize; ii++){
+         yface_flag[jj][ii] = -1;
+      }
+   }
+
+   // Mark each face that belongs to the requested level
+   for (int ifc = 0; ifc < nyface; ifc++){
+      if (yface_level[ifc] != lev) continue;
+      yface_flag[yface_j[ifc]][yface_i[ifc]] = 1;
+   }
+
+   if (DEBUG || print_output) {
+      printf("DEBUG -- y face_flag for level %d\n",lev);
+      printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",isize,jsize);
+
+      // Column header
+      printf("                           ");
+      for (int ii = 0; ii < isize; ii++){
+         printf(" %4d ",ii);
+      }
+      printf("\n");
+
+      // Rows are printed from the highest j down to zero
+      for (int jj = jsize-1; jj >= 0; jj--){
+         printf("DEBUG -- j  %4d:          ",jj);
+         for (int ii = 0; ii < isize; ii++){
+            if (yface_flag[jj][ii] < 0){
+               printf("      ");
+            } else {
+               printf(" %4d ", yface_flag[jj][ii]);
+            }
+         }
+         printf("\n");
+      }
+   }
+
+   return(yface_flag);
+}
+
+/*
+ * Builds two flat 2D grids, indexed [j][i], for level lev from the face lists.
+ *
+ * lev             level to flatten
+ * zone_flag_base  receives a matrix holding 1 for every zone touched by a
+ *                 face of this level and -1 elsewhere
+ * zone_cell_base  receives a matrix holding the cell number occupying each
+ *                 zone (including some neighbors of flagged zones) and -1
+ *                 elsewhere
+ *
+ * Both matrices come from genmatrix() and are jymax_level[lev]+4 rows by
+ * ixmax_level[lev]+4 columns; zone positions are offset by ixadjust[lev]-2
+ * and jyadjust[lev]-2 (presumably leaving a two-zone border -- TODO confirm).
+ * Cells whose level differs from lev have their indices doubled before being
+ * placed. Requires the face-to-cell maps to be populated.
+ *
+ * NOTE(review): the tests of the form nlft[cell] != REAL_CELL compare a
+ * neighbor index with REAL_CELL; a celltype lookup may have been intended --
+ * confirm. The behavior is kept as is.
+ */
+void Mesh::get_flat_grid(int lev, int ***zone_flag_base, int ***zone_cell_base)
+{
+   int ncols = ixmax_level[lev]+4;
+   int nrows = jymax_level[lev]+4;
+   int ioff  = ixadjust[lev]-2;
+   int joff  = jyadjust[lev]-2;
+
+   (*zone_flag_base) = (int **)genmatrix(nrows, ncols, sizeof(int));
+   int **zone_flag = *zone_flag_base;
+
+   (*zone_cell_base) = (int **)genmatrix(nrows, ncols, sizeof(int));
+   int **zone_cell = *zone_cell_base;
+
+   // Both grids start out empty
+   for (int jj = 0; jj < nrows; jj++){
+      for (int ii = 0; ii < ncols; ii++){
+         zone_flag[jj][ii] = -1;
+         zone_cell[jj][ii] = -1;
+      }
+   }
+
+   // Zones on either side of each x face of this level
+   for (int ifc = 0; ifc < nxface; ifc++){
+      if (xface_level[ifc] != lev) continue;
+
+      int lft = map_xface2cell_lower[ifc];
+      int rht = map_xface2cell_upper[ifc];
+
+      // Left-hand cell
+      if (level[lft] == lev) {
+         int col = i[lft]-ioff;
+         int row = j[lft]-joff;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = lft;
+         if (nlft[lft] != REAL_CELL) {
+            zone_cell[row][col-1] = nlft[lft];
+         }
+      } else {
+         // Cell on another level: doubled indices, right-hand column of the
+         // pair, row chosen by is_upper() of the cell across the face
+         int col = i[lft]*2-ioff+1;
+         int row = j[lft]*2-joff;
+         if (is_upper(j[rht])) row += 1;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = lft;
+         zone_cell[row][col-1] = lft;
+      }
+
+      // Right-hand cell
+      if (level[rht] == lev) {
+         int col = i[rht]-ioff;
+         int row = j[rht]-joff;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = rht;
+         if (nrht[rht] != REAL_CELL) {
+            zone_cell[row][col+1] = nrht[rht];
+         }
+      } else {
+         int col = i[rht]*2-ioff;
+         int row = j[rht]*2-joff;
+         if (is_upper(j[lft])) row += 1;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = rht;
+         zone_cell[row][col+1] = rht;
+      }
+   }
+
+   // Zones on either side of each y face of this level
+   for (int ifc = 0; ifc < nyface; ifc++){
+      if (yface_level[ifc] != lev) continue;
+
+      int bot = map_yface2cell_lower[ifc];
+      int top = map_yface2cell_upper[ifc];
+
+      // Bottom cell
+      if (level[bot] == lev) {
+         int col = i[bot]-ioff;
+         int row = j[bot]-joff;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = bot;
+         if (nbot[bot] != REAL_CELL) {
+            zone_cell[row-1][col] = nbot[bot];
+         }
+      } else {
+         int col = i[bot]*2-ioff;
+         int row = j[bot]*2-joff+1;
+         if (is_upper(i[top])) col += 1;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = bot;
+         zone_cell[row-1][col] = bot;
+      }
+
+      // Top cell
+      if (level[top] == lev) {
+         int col = i[top]-ioff;
+         int row = j[top]-joff;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = top;
+         if (ntop[top] != REAL_CELL) {
+            zone_cell[row+1][col] = ntop[top];
+         }
+      } else {
+         int col = i[top]*2-ioff;
+         int row = j[top]*2-joff;
+         if (is_upper(i[bot])) col += 1;
+         zone_flag[row][col] = 1;
+         zone_cell[row][col] = top;
+         zone_cell[row+1][col] = top;
+      }
+   }
+
+   if (DEBUG) {
+      printf("DEBUG -- zone_flag for level %d\n",lev);
+      printf("DEBUG -- sizes isize %d jsize %d\n",ncols,nrows);
+
+      // Assignment-style dumps of the populated entries
+      for (int jj = nrows-1; jj >= 0; jj--){
+         for (int ii = 0; ii < ncols; ii++){
+            if (zone_flag[jj][ii] < 0) continue;
+            printf("      zone_flag_check[%d][%d] = 1;\n",jj,ii);
+         }
+      }
+      for (int jj = nrows-1; jj >= 0; jj--){
+         for (int ii = 0; ii < ncols; ii++){
+            if (zone_cell[jj][ii] < 0) continue;
+            printf("      zone_cell_check[%d][%d] = %d;\n",jj,ii,zone_cell[jj][ii]);
+         }
+      }
+
+      // zone_flag table, highest j first
+      printf("                  ");
+      for (int ii = 0; ii < ncols; ii++){
+         printf(" %4d ",ii);
+      }
+      printf("\n");
+
+      for (int jj = nrows-1; jj >= 0; jj--){
+         printf("DEBUG -- j  %4d: ",jj);
+         for (int ii = 0; ii < ncols; ii++){
+            if (zone_flag[jj][ii] < 0){
+               printf("      ");
+            } else {
+               printf(" %4d ", zone_flag[jj][ii]);
+            }
+         }
+         printf("\n");
+      }
+
+      // zone_cell table, highest j first
+      printf("DEBUG -- zone_cell for level %d\n",lev);
+
+      printf("                  ");
+      for (int ii = 0; ii < ncols; ii++){
+         printf(" %4d ",ii);
+      }
+      printf("\n");
+
+      for (int jj = nrows-1; jj >= 0; jj--){
+         printf("DEBUG -- j  %4d: ",jj);
+         for (int ii = 0; ii < ncols; ii++){
+            if (zone_cell[jj][ii] < 0){
+               printf("      ");
+            } else {
+               printf(" %4d ", zone_cell[jj][ii]);
+            }
+         }
+         printf("\n");
+      }
+   }
+}
+
+/*
+ * Empties every face-to-cell and cell-to-face map filled by the
+ * calc_face_list_* routines. The face coordinate, level and bounds arrays
+ * are left untouched.
+ */
+void Mesh::calc_face_list_clearmaps()
+{
+   // Face-to-cell maps
+   map_xface2cell_lower.clear();
+   map_xface2cell_upper.clear();
+   map_yface2cell_lower.clear();
+   map_yface2cell_upper.clear();
+
+   // Cell-to-face maps in x
+   map_xcell2face_left1.clear();
+   map_xcell2face_left2.clear();
+   map_xcell2face_right1.clear();
+   map_xcell2face_right2.clear();
+
+   // Cell-to-face maps in y
+   map_ycell2face_bot1.clear();
+   map_ycell2face_bot2.clear();
+   map_ycell2face_top1.clear();
+   map_ycell2face_top2.clear();
+}
+
+/*
+ * Reports one timer across all processors.
+ *
+ * category     timer to report; also selects the text from
+ *              mesh_timer_descriptor
+ * device_type  MESH_DEVICE_CPU reads the CPU timer, anything else the GPU
+ *              timer
+ * timer_level  indentation level; 2*timer_level characters of padding are
+ *              inserted after the device tag
+ *
+ * The label is built on rank 0 only and the value is handed to
+ * parallel_output() with units "s".
+ */
+void Mesh::timer_output(mesh_timer_category category, mesh_device_types device_type, int timer_level)
+{
+   double local_time = 0.0;
+   if (device_type == MESH_DEVICE_CPU){
+      local_time = get_cpu_timer(category);
+   } else {
+      local_time = get_gpu_timer(category);
+   }
+
+   // Empty label by default. The earlier initializer "/0" was a mistyped NUL
+   // escape and left the two characters '/' and '0' in the buffer on every
+   // rank other than 0.
+   char string[80] = "";
+
+   if (mype == 0) {
+      const char *blank="          ";
+
+      // snprintf keeps the write inside the 80-byte buffer
+      if (device_type == MESH_DEVICE_CPU){
+         snprintf(string, sizeof(string), "CPU: %.*s%-30.30s\t", 2*timer_level, blank, mesh_timer_descriptor[category]);
+      } else {
+         snprintf(string, sizeof(string), "GPU: %.*s%-30.30s\t", 2*timer_level, blank, mesh_timer_descriptor[category]);
+      }
+   }
+
+   parallel_output(string, local_time, timer_level, "s");
+}
+
+/*
+ * Prints a labelled double value collected from every processor.
+ *
+ * string        label printed at the start of the line
+ * local_value   this processor's value
+ * output_level  indentation level; 2*output_level characters of padding
+ *               precede each number
+ * units         text printed after the numbers
+ *
+ * When built with HAVE_MPI and numpe > 1 the values are gathered on rank 0.
+ * Only rank 0 prints: every value when numpe <= 4, otherwise the minimum,
+ * median and maximum of the sorted values.
+ */
+void Mesh::parallel_output(const char *string, double local_value, int output_level, const char *units)
+{
+   vector<double> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) { 
+      MPI_Gather(&local_value, 1, MPI_DOUBLE, &global_values[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype == 0) {
+      const char *blank="          ";
+
+      printf("%s\t",string);
+      if (numpe <= 4) {
+         for(int ip = 0; ip < numpe; ip++){
+            printf("%.*s%8.4f\t", 2*output_level, blank, global_values[ip]);
+         }
+         printf("%s\n",units);
+      } else {
+         sort(global_values.begin(),global_values.end());
+         double median_value;
+         int half_value = numpe/2;
+         if (numpe%2 == 0) {
+            // Even count: average the two middle values
+            median_value = (global_values[half_value-1]+global_values[half_value])/2.0;
+         } else {
+            // Odd count: the middle value sits at index numpe/2
+            // (half_value+1 pointed one element past the median)
+            median_value = global_values[half_value];
+         }
+         printf("%.*s%8.4f\t%.*s%8.4f\t%.*s%8.4f   %s min/median/max\n",
+            2*output_level, blank, global_values[0],
+            2*output_level, blank, median_value,
+            2*output_level, blank, global_values[numpe-1],
+            units);
+      }
+   }
+}
+
+/*
+ * Prints a labelled long long value collected from every processor.
+ *
+ * string        label printed at the start of the line
+ * local_value   this processor's value
+ * output_level  indentation level; 2*output_level characters of padding
+ *               precede each number
+ * units         text printed after the numbers
+ *
+ * When built with HAVE_MPI and numpe > 1 the values are gathered on rank 0.
+ * Only rank 0 prints: every value when numpe <= 4, otherwise the minimum,
+ * median and maximum of the sorted values. For an even count the median is
+ * the integer average of the two middle values.
+ */
+void Mesh::parallel_output(const char *string, long long local_value, int output_level, const char *units)
+{
+   vector<long long> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) { 
+      MPI_Gather(&local_value, 1, MPI_LONG_LONG, &global_values[0], 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype == 0) {
+      const char *blank="          ";
+
+      printf("%s\t",string);
+      if (numpe <= 4) {
+         for(int ip = 0; ip < numpe; ip++){
+            printf("%.*s%10lld\t", 2*output_level, blank, global_values[ip]);
+         }
+         printf("%s\n",units);
+      } else {
+         sort(global_values.begin(),global_values.end());
+         long long median_value;
+         int half_value = numpe/2;
+         if (numpe%2 == 0) {
+            // Even count: integer average of the two middle values
+            median_value = (global_values[half_value-1]+global_values[half_value])/2;
+         } else {
+            // Odd count: the middle value sits at index numpe/2
+            // (half_value+1 pointed one element past the median)
+            median_value = global_values[half_value];
+         }
+         printf("%.*s%10lld\t%.*s%10lld\t%.*s%10lld   %s min/median/max\n",
+            2*output_level, blank, global_values[0],
+            2*output_level, blank, median_value,
+            2*output_level, blank, global_values[numpe-1],
+            units);
+      }
+   }
+}
+
+/*
+ * Prints a labelled int value collected from every processor.
+ *
+ * string        label printed at the start of the line
+ * local_value   this processor's value
+ * output_level  indentation level; 2*output_level characters of padding
+ *               precede each number
+ * units         text printed after the numbers
+ *
+ * When built with HAVE_MPI and numpe > 1 the values are gathered on rank 0.
+ * Only rank 0 prints: every value when numpe <= 4, otherwise the minimum,
+ * median and maximum of the sorted values. For an even count the median is
+ * the integer average of the two middle values.
+ */
+void Mesh::parallel_output(const char *string, int local_value, int output_level, const char *units)
+{
+   vector<int> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) { 
+      MPI_Gather(&local_value, 1, MPI_INT, &global_values[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype == 0) {
+      const char *blank="          ";
+
+      printf("%s\t",string);
+      if (numpe <= 4) {
+         for(int ip = 0; ip < numpe; ip++){
+            printf("%.*s%10d\t", 2*output_level, blank, global_values[ip]);
+         }
+         printf("%s\n",units);
+      } else {
+         sort(global_values.begin(),global_values.end());
+         int median_value;
+         int half_value = numpe/2;
+         if (numpe%2 == 0) {
+            // Even count: integer average of the two middle values
+            median_value = (global_values[half_value-1]+global_values[half_value])/2;
+         } else {
+            // Odd count: the middle value sits at index numpe/2
+            // (half_value+1 pointed one element past the median)
+            median_value = global_values[half_value];
+         }
+         printf("%.*s%10d\t%.*s%10d\t%.*s%10d   %s min/median/max\n",
+            2*output_level, blank, global_values[0],
+            2*output_level, blank, median_value,
+            2*output_level, blank, global_values[numpe-1],
+            units);
+      }
+   }
+}
+
+const int CRUX_MESH_VERSION = 103; // version stored in int_vals[0] of a checkpoint and checked on restore
+const int num_int_dist_vals = 3; // length of the int_dist_vals checkpoint array (values that differ per processor)
+const int num_int_vals      = 3; // length of the int_vals checkpoint array (version, ndim, levmx)
+const int num_double_vals   = 1; // length of the double_vals checkpoint array (offtile_ratio_local)
+
+/*
+ * Returns the number of bytes counted for a mesh checkpoint: the scalar
+ * arrays, two counter arrays, the two timer arrays and three ints per cell.
+ *
+ * NOTE(review): the second timer array is counted with sizeof(long) while
+ * store_checkpoint registers gpu_timers with an element size of 8; these
+ * differ where long is 4 bytes -- confirm which is intended.
+ */
+size_t Mesh::get_checkpoint_size(void)
+{
+   size_t scalar_bytes  = num_int_dist_vals*sizeof(int);
+   scalar_bytes        += num_int_vals*sizeof(int);
+   scalar_bytes        += num_double_vals*sizeof(double);
+
+   size_t counter_bytes = 2*MESH_COUNTER_SIZE*sizeof(int);
+
+   size_t timer_bytes   = MESH_TIMER_SIZE*sizeof(double);
+   timer_bytes         += MESH_TIMER_SIZE*sizeof(long);
+
+   // Three ints per cell (presumably i, j and level -- TODO confirm)
+   size_t cell_bytes    = ncells*3*sizeof(int);
+
+   return(scalar_bytes + counter_bytes + timer_bytes + cell_bytes);
+}
+
+/*
+ * Writes the mesh state to a checkpoint through crux.
+ *
+ * The memory capacity of the level array is stored first under the name
+ * "storage". The scalar values are then packed into temporary arrays, and
+ * those arrays plus the counter and timer arrays are registered in
+ * mesh_memory just long enough for crux->store_MallocPlus() to write the
+ * database. The temporary registrations are removed again before returning.
+ */
+void Mesh::store_checkpoint(Crux *crux)
+{
+   // The capacity is needed on restore to allocate memory
+   int storage = mesh_memory.get_memory_capacity(level);
+   crux->store_named_ints("storage", 7, &storage, 1);
+
+   // Values that differ on every processor
+   int int_dist_vals[num_int_dist_vals];
+   int_dist_vals[0] = (int)ncells;
+   int_dist_vals[1] = (int)ncells_ghost;
+   int_dist_vals[2] = offtile_local_count;
+
+   // Values registered as replicated data
+   int int_vals[num_int_vals];
+   int_vals[0] = CRUX_MESH_VERSION;
+   int_vals[1] = ndim;
+   int_vals[2] = levmx;
+
+   double double_vals[num_double_vals];
+   double_vals[0] = offtile_ratio_local;
+
+   int dist_flags = RESTART_DATA;
+   int repl_flags = RESTART_DATA | REPLICATED_DATA;
+
+   // Register everything that belongs in the checkpoint
+   mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", dist_flags);
+   mesh_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "mesh_int_vals", repl_flags);
+   mesh_memory.memory_add(double_vals, (size_t)num_double_vals, 8, "mesh_double_vals", dist_flags);
+
+   mesh_memory.memory_add(cpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters", dist_flags);
+   mesh_memory.memory_add(gpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters", dist_flags);
+   mesh_memory.memory_add(cpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_cpu_timers", dist_flags);
+   mesh_memory.memory_add(gpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_gpu_timers", dist_flags);
+
+   // Write out the MallocPlus memory database
+   crux->store_MallocPlus(mesh_memory);
+
+   // The temporary registrations are no longer needed once the data is stored
+   mesh_memory.memory_remove(int_dist_vals);
+   mesh_memory.memory_remove(int_vals);
+   mesh_memory.memory_remove(double_vals);
+   mesh_memory.memory_remove(cpu_counters);
+   mesh_memory.memory_remove(gpu_counters);
+   mesh_memory.memory_remove(cpu_timers);
+   mesh_memory.memory_remove(gpu_timers);
+}
+
+/*
+ * Reads the mesh state back from a checkpoint through crux.
+ *
+ * The stored "storage" value is read first and passed to allocate() after
+ * the neighbor and celltype arrays have been deleted. Temporary arrays for
+ * the scalar values, plus the counter and timer arrays, are registered in
+ * mesh_memory so that crux->restore_MallocPlus() can fill them; the
+ * registrations are removed afterwards and the scalars are copied into the
+ * mesh members.
+ *
+ * The entries are registered with the same flags that store_checkpoint uses.
+ * The program exits with a failure status when the version on file differs
+ * from CRUX_MESH_VERSION.
+ */
+void Mesh::restore_checkpoint(Crux *crux)
+{
+   int storage;
+   crux->restore_named_ints("storage", 7, &storage, 1);
+
+   // Create memory for reading data into
+   int int_dist_vals[num_int_dist_vals];
+   int int_vals[num_int_vals];
+   double double_vals[num_double_vals];
+
+   // Discard the current neighbor and celltype arrays before reallocating
+   mesh_memory.memory_delete(nlft);
+   mesh_memory.memory_delete(nrht);
+   mesh_memory.memory_delete(nbot);
+   mesh_memory.memory_delete(ntop);
+   mesh_memory.memory_delete(celltype);
+
+   nlft = NULL;
+   nrht = NULL;
+   ntop = NULL;
+   nbot = NULL;
+   celltype = NULL;
+
+   allocate (storage);
+   
+   int flags = RESTART_DATA;
+   // Now add memory entries to database for restoring checkpoint
+   mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", flags);
+   flags = RESTART_DATA | REPLICATED_DATA;
+   mesh_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "mesh_int_vals", flags);
+
+   // Reset the flags before double_vals so the entry is registered the same
+   // way store_checkpoint wrote it (RESTART_DATA without REPLICATED_DATA)
+   flags = RESTART_DATA;
+   mesh_memory.memory_add(double_vals, (size_t)num_double_vals, 8, "mesh_double_vals", flags);
+   mesh_memory.memory_add(cpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters", flags);
+   mesh_memory.memory_add(gpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters", flags);
+
+   mesh_memory.memory_add(cpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_cpu_timers", flags);
+   mesh_memory.memory_add(gpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_gpu_timers", flags);
+
+   // Restore MallocPlus memory database
+   crux->restore_MallocPlus(mesh_memory);
+
+   // Remove memory entries from database now that data is restored
+   mesh_memory.memory_remove(int_dist_vals);
+   mesh_memory.memory_remove(int_vals);
+   mesh_memory.memory_remove(double_vals);
+   mesh_memory.memory_remove(cpu_counters);
+   mesh_memory.memory_remove(gpu_counters);
+   mesh_memory.memory_remove(cpu_timers);
+   mesh_memory.memory_remove(gpu_timers);
+
+   // Check version number; a mismatch is an error, so report failure to the
+   // caller of the program instead of exiting with status 0
+   if (int_vals[ 0] != CRUX_MESH_VERSION) {
+      printf("CRUX version mismatch for mesh data, version on file is %d, version in code is %d\n",
+         int_vals[0], CRUX_MESH_VERSION);
+      exit(EXIT_FAILURE);
+   }
+
+   // Copy out the per-processor scalar values
+   ncells                    = int_dist_vals[ 0];
+   ncells_ghost              = int_dist_vals[ 1];
+   offtile_local_count       = int_dist_vals[ 2];
+
+   // Copy out the replicated scalar values
+   ndim                      = int_vals[ 1];
+   levmx                     = int_vals[ 2];
+
+   offtile_ratio_local = double_vals[0];
+
+#ifdef DEBUG_RESTORE_VALS
+   // Optional dump of everything that was restored (rank 0 only)
+   if (DEBUG_RESTORE_VALS && mype == 0) {
+      const char *int_dist_vals_descriptor[num_int_dist_vals] = {
+         "ncells",
+         "ncells_ghost",
+         "offtile_local_count"
+      };
+      const char *int_vals_descriptor[num_int_vals] = {
+         "CRUX_MESH_VERSION",
+         "ndim",
+         "levmx",
+      };
+      const char *double_vals_descriptor[num_double_vals] = {
+         "offtile_ratio_local"
+      };
+
+      printf("\n");
+      printf("       === Restored mesh int_dist_vals ===\n");
+      for (int i = 0; i < num_int_dist_vals; i++){
+         printf("       %-30s %d\n",int_dist_vals_descriptor[i], int_dist_vals[i]);
+      }
+      printf("       === Restored mesh int_vals ===\n");
+      for (int i = 0; i < num_int_vals; i++){
+         printf("       %-30s %d\n",int_vals_descriptor[i], int_vals[i]);
+      }
+      printf("       === Restored mesh int_vals ===\n");
+      printf("\n");
+
+      printf("\n");
+      printf("       === Restored mesh double_vals ===\n");
+      for (int i = 0; i < num_double_vals; i++){
+         printf("       %-30s %lf\n",double_vals_descriptor[i], double_vals[i]);
+      }
+      printf("       === Restored mesh double_vals ===\n");
+      printf("\n");
+
+      printf("       === Restored mesh cpu counters ===\n");
+      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
+         printf("       %-30s %d\n",mesh_counter_descriptor[i], cpu_counters[i]);
+      }
+      printf("       === Restored mesh cpu counters ===\n");
+      printf("       === Restored mesh gpu counters ===\n");
+      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
+         printf("       %-30s %d\n",mesh_counter_descriptor[i], gpu_counters[i]);
+      }
+      printf("       === Restored mesh gpu counters ===\n");
+      printf("\n");
+
+      printf("       === Restored mesh cpu timers ===\n");
+      for (int i = 0; i < MESH_TIMER_SIZE; i++){
+         printf("       %-30s %lf\n",mesh_timer_descriptor[i], cpu_timers[i]);
+      }
+      printf("       === Restored mesh cpu timers ===\n");
+      printf("\n");
+
+      printf("\n");
+      printf("       === Restored mesh gpu timers ===\n");
+      for (int i = 0; i < MESH_TIMER_SIZE; i++){
+         printf("       %-30s %lld\n",mesh_timer_descriptor[i], gpu_timers[i]);
+      }
+      printf("       === Restored mesh gpu timers ===\n");
+      printf("\n");
+   }
+#endif
+   //calc_celltype(ncells);
+}
+
+
+/****************************************************************//**
+ * \brief
+ * Exclusive prefix sum (scan). This code due to Matt Calef.
+ *
+ * **Parameters**
+ *
+ *  Input
+ *    input  -- values to be summed
+ *    length -- number of entries in input
+ *  Output
+ *    output -- output[i] receives the sum of input[0] .. input[i-1],
+ *       so output[0] is zero
+ *
+ * With OpenMP this routine contains a barrier and a single construct,
+ * so it has to be called by all threads of the enclosing parallel
+ * region. Each thread scans its own contiguous slice, one thread then
+ * computes the starting offset of every slice, and finally each thread
+ * adds its offset to the rest of its slice.
+ *
+ * NOTE(review): the serial branch also stores the grand total in
+ * output[length], i.e. it writes length+1 entries, while the OpenMP
+ * branch only writes output[0] .. output[length-1]. Confirm that the
+ * callers size output for the serial case.
+ *******************************************************************/
+void scan ( scanInt *input , scanInt *output , scanInt length) 
+{
+#ifdef _OPENMP
+   // This already assumes it is in a parallel region
+
+   // Get the total number of threads
+
+   scanInt numThreads = omp_get_num_threads ( );
+
+   // Compute the range for which this thread is responsible.
+
+   scanInt threadID   = omp_get_thread_num ( );
+   scanInt start = length * ( threadID     ) / numThreads;
+   scanInt end   = length * ( threadID + 1 ) / numThreads;
+
+   // In the case that there are fewer entries than threads, some
+   // threads will have no entries.  Only perform this operation if
+   // there is a positive number of entries.
+
+   if ( start < end ) {
+
+       // Do a scan over the region for this thread, with an initial
+       // value of zero.
+
+       output[start] = 0;
+       for ( scanInt i = start + 1 ; i < end ; i++ ) 
+          output[i] = output[i-1] + input[i-1];
+   }
+    
+   // Wait until all threads get here. 
+
+#pragma omp barrier
+    
+   // At this point each thread has done an independent scan of its
+   // region.  All scans, except the first, are off by an
+   // offset. Here we have a single thread compute that offset with a
+   // serial scan that strides over the regions assigned to each
+   // thread.
+
+#pragma omp single
+   for ( scanInt i = 1 ; i < numThreads ; i ++ ) {
+      // s0 and s1 are the first entries of the slices of threads i-1 and i.
+      scanInt s0 = length * ( i - 1 ) / numThreads;
+      scanInt s1 = length * ( i     ) / numThreads;
+
+      // Offset of slice i = offset of slice i-1 + last input of slice i-1 ...
+      if ( s0 < s1 ) 
+         output[s1] = output[s0] + input[s1-1];
+
+      // ... + the local scan total of slice i-1.  Skipped when slice i-1
+      // holds a single entry, because output[s1-1] is then output[s0]
+      // itself and has already been counted.
+      if ( s0 < s1 - 1 )
+         output[s1] += output[s1-1];
+   }
+
+   // Barrier is implicit from omp single Wait until all threads get here. 
+
+   // Apply the offset to the range for this thread.
+    
+   for ( scanInt i = start + 1 ; i < end ; i++ ) 
+      output[i] += output[start];
+
+#else
+   // Serial scan.  The index uses scanInt, like the OpenMP branch, so that
+   // it is compared against length in the same type.
+   output[0] = 0;
+   for (scanInt ic = 0; ic < length; ic++){
+      output[ic+1] = output[ic] + input[ic];
+   }
+#endif
+}
+/****************************************************************//**
+ * \brief
+ * Get the index range assigned to the calling thread
+ *
+ * **Parameters**
+ *
+ *  Output
+ *    lowerBound -- first index of the range
+ *    upperBound -- end index of the range
+ *
+ * With OpenMP the values are read from the calling thread's slots in
+ * lowerBound_Global and upperBound_Global. Without OpenMP the range
+ * is always 0 to ncells.
+ *******************************************************************/
+void Mesh::get_bounds(int& lowerBound, int& upperBound){
+#ifdef _OPENMP
+   // Each thread reads only its own slot.
+   const int tid = omp_get_thread_num();
+   lowerBound = lowerBound_Global[tid];
+   upperBound = upperBound_Global[tid];
+#else
+   // Single-threaded build: the whole mesh is one range.
+   lowerBound = 0;
+   upperBound = ncells;
+#endif
+}
+
+/****************************************************************//**
+ * \brief
+ * Set the per-thread index ranges later returned by get_bounds
+ *
+ * **Parameters**
+ *
+ *  Input
+ *    n -- number of items to divide among the threads (only used in
+ *       the OpenMP branch)
+ *
+ * With OpenMP every thread of the team computes its own contiguous
+ * range of n and stores it in its slot of lowerBound_Global and
+ * upperBound_Global. The first n % nthreads threads get one item more
+ * than the others. Without OpenMP slot 0 is set to the range 0 to
+ * ncells.
+ *
+ * NOTE(review): the two arrays are allocated only while the pointers
+ * are NULL, sized by the thread count of that call, and the malloc
+ * results are not checked. Confirm the thread count cannot grow in a
+ * later call.
+ *******************************************************************/
+void Mesh::set_bounds(int n){
+
+#ifdef _OPENMP
+   const int team_size = omp_get_num_threads();   // private to each thread
+   const int tid       = omp_get_thread_num();    // private to each thread
+
+   // The master thread allocates the shared arrays on first use.
+   #pragma omp master
+   {
+      if (lowerBound_Global == NULL) {
+         lowerBound_Global = (int *)malloc(team_size*sizeof(int));
+      }
+      if (upperBound_Global == NULL) {
+         upperBound_Global = (int *)malloc(team_size*sizeof(int));
+      }
+   }
+   // omp master has no implied barrier, so wait here before any thread
+   // writes into the arrays.
+   #pragma omp barrier
+
+   const int base_share = n / team_size;
+   const int leftover   = n % team_size;
+
+   // Threads below the leftover count take one extra item each, so the
+   // start of this thread's range shifts by min(leftover, tid).
+   const int first = (base_share*tid) + min(leftover, tid);
+   const int count = (tid < leftover) ? base_share + 1 : base_share;
+
+   lowerBound_Global[tid] = first;
+   upperBound_Global[tid] = first + count;
+#else
+   // Single-threaded build: one slot covering the whole mesh.
+   if (lowerBound_Global == NULL) {
+      lowerBound_Global = (int *)malloc(sizeof(int));
+   }
+   if (upperBound_Global == NULL) {
+      upperBound_Global = (int *)malloc(sizeof(int));
+   }
+   lowerBound_Global[0] = 0;
+   upperBound_Global[0] = (int)ncells;
+#endif
+
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/mesh.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,711 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef MESH_H_
+#define MESH_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "MallocPlus.h"
+#include <string>
+#include <stdio.h>
+#include <vector>
+#include <math.h>
+#include "KDTree.h"
+#include "crux.h"
+#include "partition.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+
+// Precision selection.  FULL_PRECISION is the default when none of the three
+// precision macros is predefined, and NO_CL_DOUBLE forces MINIMUM_PRECISION.
+// The typedef chain below tests MINIMUM_PRECISION first, then
+// MIXED_PRECISION, then FULL_PRECISION.  Each branch defines real_t and
+// spatial_t, plus the matching OpenCL typedefs when HAVE_OPENCL is defined
+// and the matching MPI datatype macros when HAVE_MPI is defined.
+#if !defined(FULL_PRECISION) && !defined(MIXED_PRECISION) && !defined(MINIMUM_PRECISION)
+#define FULL_PRECISION
+#endif
+#ifdef NO_CL_DOUBLE
+#undef  FULL_PRECISION
+#undef  MIXED_PRECISION
+#define MINIMUM_PRECISION
+#endif
+
+#if defined(MINIMUM_PRECISION)
+   typedef float real_t; // this is used for intermediate calculations
+   typedef float spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+   typedef cl_float cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_float cl_spatial_t;
+#endif
+#ifdef HAVE_MPI
+   #define MPI_REAL_T MPI_FLOAT // for MPI communication for physics state variables
+   #define MPI_SPATIAL_T MPI_FLOAT
+#endif
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+   typedef double real_t;
+   typedef float spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+   typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_float cl_spatial_t;
+#endif
+#ifdef HAVE_MPI
+   #define MPI_REAL_T MPI_DOUBLE
+   #define MPI_SPATIAL_T MPI_FLOAT
+#endif
+
+#elif defined(FULL_PRECISION)
+   typedef double real_t;
+   typedef double spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+   typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_double cl_spatial_t;
+#endif
+#ifdef HAVE_MPI
+   #define MPI_REAL_T MPI_DOUBLE
+   #define MPI_SPATIAL_T MPI_DOUBLE
+#endif
+#endif
+
+// NOTE(review): the meaning of TILE_SIZE is not visible in this header -- confirm at its use sites.
+#define TILE_SIZE 128
+
+// Exchanges xnew and xold using xtmp as scratch; xtmp ends up holding the
+// original value of xnew.
+#define SWAP_PTR(xnew,xold,xtmp) (xtmp=xnew, xnew=xold, xold=xtmp)
+// Function-like minimum and maximum.  Each argument appears twice in the
+// expansion, so an argument with side effects (e.g. i++) may be evaluated twice.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// Shorthand for unsigned int.
+typedef unsigned int uint;
+
+//float mem_opt_factor = 1.0;
+
+// Cell type codes: the only positive value marks a real cell, and each
+// negative value marks a ghost cell on the named boundary.
+enum boundary
+{  REAL_CELL      =  1,         //  Denotes cell type of real cell.
+   LEFT_BOUNDARY  = -1,         //  Denotes left boundary ghost cell.
+   RIGHT_BOUNDARY = -2,         //  Denotes right boundary ghost cell.
+   BOTTOM_BOUNDARY= -3,         //  Denotes bottom boundary ghost cell.
+   TOP_BOUNDARY   = -4,         //  Denotes top boundary ghost cell.
+   FRONT_BOUNDARY = -5,         //  Denotes front boundary ghost cell.
+   BACK_BOUNDARY  = -6 };       //  Denotes back boundary ghost cell.
+
+// Number of spatial dimensions; the enumerators take the values 1, 2 and 3.
+enum dimensionality
+{  ONE_DIMENSIONAL   = 1,       // Dimensionality based at 1 for clarity.
+   TWO_DIMENSIONAL,
+   THREE_DIMENSIONAL};
+
+// Quadrant labels; the enumerators take the values 0 to 3 in the order listed.
+enum orientation
+{  SW,                          //  SW quadrant.
+   NW,                          //  NW quadrant.
+   NE,                          //  NE quadrant.
+   SE };                        //  SE quadrant.
+
+// Neighbor calculation methods.
+enum neighbor_calc
+{  HASH_TABLE,                  //  Hash Table.
+   KDTREE };                    //  kD-tree.
+
+// Timer categories.  The values index the cpu_timers and gpu_timers arrays
+// of the Mesh class.  MESH_TIMER_SIZE has to stay last because it is used as
+// the length of those arrays.
+enum mesh_timers
+{
+   MESH_TIMER_COUNT_BCS,
+   MESH_TIMER_CALC_NEIGHBORS,
+   MESH_TIMER_HASH_SETUP,
+   MESH_TIMER_HASH_QUERY,
+   MESH_TIMER_FIND_BOUNDARY,
+   MESH_TIMER_PUSH_SETUP,
+   MESH_TIMER_PUSH_BOUNDARY,
+   MESH_TIMER_LOCAL_LIST,
+   MESH_TIMER_LAYER1,
+   MESH_TIMER_LAYER2,
+   MESH_TIMER_LAYER_LIST,
+   MESH_TIMER_COPY_MESH_DATA,
+   MESH_TIMER_FILL_MESH_GHOST,
+   MESH_TIMER_FILL_NEIGH_GHOST,
+   MESH_TIMER_SET_CORNER_NEIGH,
+   MESH_TIMER_NEIGH_ADJUST,
+   MESH_TIMER_SETUP_COMM,
+   MESH_TIMER_KDTREE_SETUP,
+   MESH_TIMER_KDTREE_QUERY,
+   MESH_TIMER_REFINE_SMOOTH,
+   MESH_TIMER_REZONE_ALL,
+   MESH_TIMER_PARTITION,
+   MESH_TIMER_CALC_SPATIAL_COORDINATES,
+   MESH_TIMER_LOAD_BALANCE,
+   MESH_TIMER_SIZE
+};
+
+// Counter categories.  The values index the cpu_counters and gpu_counters
+// arrays of the Mesh class.  MESH_COUNTER_SIZE has to stay last because it
+// is used as the length of those arrays and of mesh_counter_descriptor.
+enum mesh_counters
+{
+   MESH_COUNTER_REZONE,
+   MESH_COUNTER_REFINE_SMOOTH,
+   MESH_COUNTER_CALC_NEIGH,
+   MESH_COUNTER_LOAD_BALANCE,
+   MESH_COUNTER_SIZE
+};
+
+// Printable name for each entry of enum mesh_counters, in the same order as
+// the enumerators.  The DEBUG_RESTORE_VALS guard is commented out, so the
+// table is always defined; being static, every file that includes this
+// header gets its own copy.
+//#ifdef DEBUG_RESTORE_VALS
+static const char *mesh_counter_descriptor[MESH_COUNTER_SIZE] = {
+   "mesh_counter_rezone",
+   "mesh_counter_refine_smooth",
+   "mesh_counter_calc_neigh",
+   "mesh_counter_load_balance"
+};
+//#endif
+
+// Type names used for the category arguments of the timer and counter accessors.
+typedef enum mesh_timers   mesh_timer_category;
+typedef enum mesh_counters mesh_counter_category;
+
+// Device selector (CPU or GPU); taken by Mesh::timer_output as its device_type argument.
+enum mesh_device_types
+{
+   MESH_DEVICE_CPU,
+   MESH_DEVICE_GPU
+};
+
+// Alias for the enum above.
+typedef mesh_device_types mesh_device_type;
+
+using namespace std;
+
+/****************************************************************//**
+ * Mesh class
+ *    Contains the cell-based adaptive mesh refinement
+ *    (AMR) object with its data and methods.
+ *******************************************************************/
+class Mesh
+{
+
+public:
+   int ndim;                    //!<  Dimensionality of mesh (2 or 3).
+
+   MallocPlus mesh_memory;
+   MallocPlus gpu_mesh_memory;
+
+#ifdef HAVE_OPENCL
+   string defines;
+#endif
+
+   //  Timers are indexed by enum mesh_timers, counters by enum mesh_counters.
+   double    cpu_timers[MESH_TIMER_SIZE];
+   long long gpu_timers[MESH_TIMER_SIZE];
+
+   int    cpu_counters[MESH_COUNTER_SIZE];
+   int    gpu_counters[MESH_COUNTER_SIZE];
+
+   bool           do_rezone,
+                  gpu_do_rezone;
+
+   int            mype,
+                  numpe,
+                  parallel,
+                  cell_handle,
+                  noffset;
+
+   //  Per-thread index ranges, written by set_bounds() and read by get_bounds().
+   int            *lowerBound_Global,
+                  *upperBound_Global;
+
+   float          mem_factor;
+
+   double         offtile_ratio_local;
+   int            offtile_local_count;
+
+   vector<int>    corners_i,
+                  corners_j;
+
+   vector<int>    nsizes,
+                  ndispl;
+
+   FILE          *fp;
+
+   TKDTree        tree;         //!<  k-D tree for neighbor search.
+   vector<int>    proc;
+   vector<int>    lev_ibegin,   //!<  Lowest x-index in use at specified level of refinement.
+                  lev_iend,     //!<  Highest x-index in use at specified level of refinement.
+                  lev_jbegin,   //!<  Lowest y-index in use at specified level of refinement.
+                  lev_jend,     //!<  Highest y-index in use at specified level of refinement.
+                  lev_kbegin,   //!<  Lowest z-index in use at specified level of refinement.
+                  lev_kend,     //!<  Highest z-index in use at specified level of refinement.
+                  levtable;     //!<  Powers of two to simplify i,j calculations
+   vector<real_t> lev_deltax,   //!<  Grid spacing along x-axis at specified level of refinement.
+                  lev_deltay,   //!<  Grid spacing along y-axis at specified level of refinement.
+                  lev_deltaz;   //!<  Grid spacing along z-axis at specified level of refinement.
+   int            levmx,        //!<  Maximum level of refinement allowed.
+                  have_boundary,//!<  Mesh includes boundary cells, else creates on the fly
+                  ibase,        //!<  Index basis for arrays (0 for C, 1 for Fortran).
+                  imin,         //!<  Lowest x-index in use.
+                  imax,         //!<  Highest x-index in use.
+                  jmin,         //!<  Lowest y-index in use.
+                  jmax,         //!<  Highest y-index in use.
+                  kmin,         //!<  Lowest z-index in use.
+                  kmax;         //!<  Highest z-index in use.
+   size_t         ncells,       //!<  Number of cells in mesh.
+                  ncells_global, //!<  Global number of cells for parallel runs
+                  ncells_ghost; //!<  Number of cells in mesh with ghost cells.
+   real_t         xmin,         //!<  Lowest x-coordinate in use.
+                  xmax,         //!<  Highest x-coordinate in use.
+                  ymin,         //!<  Lowest y-coordinate in use.
+                  ymax,         //!<  Highest y-coordinate in use.
+                  zmin,         //!<  Lowest z-coordinate in use.
+                  zmax,         //!<  Highest z-coordinate in use.
+                  xcentermin,   //!<  Center of minimum x cell
+                  xcentermax,   //!<  Center of maximum x cell
+                  ycentermin,   //!<  Center of minimum y cell
+                  ycentermax,   //!<  Center of maximum y cell
+                  zcentermin,   //!<  Center of minimum z cell
+                  zcentermax,   //!<  Center of maximum z cell
+                  deltax,       //!<  Grid spacing along x-axis.
+                  deltay,       //!<  Grid spacing along y-axis.
+                  deltaz;       //!<  Grid spacing along z-axis.
+
+   vector<int>    index;        //!<  1D ordered index of mesh elements.
+
+                                 //  mesh state data
+   int            *i,            //!<  1D array of mesh element x-indices.
+                  *j,            //!<  1D array of mesh element y-indices.
+                  *k,            //!<  1D array of mesh element z-indices.
+                  *level,        //!<  1D array of mesh element refinement levels.
+                                 //!<  derived data from mesh state data
+                  *celltype,     //!<  1D ordered index of mesh element cell types (ghost or real).
+                  *nlft,         //!<  1D ordered index of mesh element left neighbors.
+                  *nrht,         //!<  1D ordered index of mesh element right neighbors.
+                  *nbot,         //!<  1D ordered index of mesh element bottom neighbors.
+                  *ntop,         //!<  1D ordered index of mesh element top neighbors.
+                  *nfrt,         //!<  1D ordered index of mesh element front neighbors.
+                  *nbak;         //!<  1D ordered index of mesh element back neighbors.
+
+   vector<spatial_t> x,          //!<  1D ordered index of mesh element x-coordinates.
+                     dx,         //!<  1D ordered index of mesh element x-coordinate spacings.
+                     y,          //!<  1D ordered index of mesh element y-coordinates.
+                     dy,         //!<  1D ordered index of mesh element y-coordinate spacings.
+                     z,          //!<  1D ordered index of mesh element z-coordinates.
+                     dz;         //!<  1D ordered index of mesh element z-coordinate spacings.
+
+#ifdef HAVE_OPENCL
+   cl_mem         dev_ioffset;
+
+   cl_mem         dev_celltype,       
+                  dev_i,       
+                  dev_j,       
+                  dev_level,       
+                  dev_nlft,       
+                  dev_nrht,       
+                  dev_nbot,       
+                  dev_ntop;       
+
+   cl_mem         dev_levdx,    // corresponds to lev_deltax
+                  dev_levdy,    // corresponds to lev_deltay
+                  dev_levibeg,
+                  dev_leviend,
+                  dev_levjbeg,
+                  dev_levjend,
+                  dev_levtable; //
+
+   cl_mem         dev_corners_i,
+                  dev_corners_j;
+#endif
+
+   int nxface;
+   int nyface;
+
+   vector<int> xface_i;
+   vector<int> xface_j;
+   vector<int> xface_level;
+   vector<int> map_xface2cell_lower;
+   vector<int> map_xface2cell_upper;
+
+   vector<int> map_xcell2face_left1;
+   vector<int> map_xcell2face_left2;
+   vector<int> map_xcell2face_right1;
+   vector<int> map_xcell2face_right2;
+
+   vector<int> ixmin_level;
+   vector<int> ixmax_level;
+   vector<int> jxmin_level;
+   vector<int> jxmax_level;
+   vector<int> ixadjust;
+   vector<int> jxadjust;
+
+   vector<int> yface_i;
+   vector<int> yface_j;
+   vector<int> yface_level;
+   vector<int> map_yface2cell_lower;
+   vector<int> map_yface2cell_upper;
+
+   vector<int> map_ycell2face_bot1;
+   vector<int> map_ycell2face_bot2;
+   vector<int> map_ycell2face_top1;
+   vector<int> map_ycell2face_top2;
+
+   vector<int> iymin_level;
+   vector<int> iymax_level;
+   vector<int> jymin_level;
+   vector<int> jymax_level;
+   vector<int> iyadjust;
+   vector<int> jyadjust;
+
+   //   Public constructors.
+   Mesh(FILE *fin, int *numpe);
+   Mesh(int nx, int ny, int levmx_in, int ndim_in, double deltax_in, double deltay_in, int boundary, int parallel_in, int do_gpu_calc);
+
+   //   Member functions.
+   void init(int nx, int ny, real_t circ_radius, partition_method initial_order, int do_gpu_calc);
+   void terminate(void);
+
+   //   Store and fetch the per-thread index ranges kept in
+   //   lowerBound_Global and upperBound_Global.
+   void set_bounds(int n);
+   void get_bounds(int& lowerBound, int& upperBound);
+
+/****************************************************************//**
+ * @name Memory routines
+ *******************************************************************/
+///@{
+
+/****************************************************************//**
+ * \brief
+ * Allocates the basic mesh memory, i, j, and level, using the MallocPlus
+ * memory database.
+ *
+ * **Parameters**
+ * * size_t ncells -- number of cells in the mesh
+ *
+ * Typical Usage
+ *
+ *     mesh.allocate(ncells);
+ *******************************************************************/
+   void allocate(size_t ncells);
+
+   void resize(size_t new_ncells);
+   void memory_reset_ptrs(void);
+   void resize_old_device_memory(size_t ncells);
+///@}
+
+/* inline "macros" */
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Boundary cell tests
+ *******************************************************************/
+   int  is_lower_boundary(int *iv, int *lev_begin, int ic)    { return (iv[ic] < lev_begin[level[ic]]); }
+   int  is_upper_boundary(int *iv, int *lev_end,   int ic)    { return (iv[ic] > lev_end[level[ic]]); }
+
+   int  is_left_boundary(int ic)    { return (i[ic] < lev_ibegin[level[ic]]); }
+   int  is_right_boundary(int ic)   { return (i[ic] > lev_iend[  level[ic]]); }
+   int  is_bottom_boundary(int ic)  { return (j[ic] < lev_jbegin[level[ic]]); }
+   int  is_top_boundary(int ic)     { return (j[ic] > lev_jend[  level[ic]]); }
+   int  is_front_boundary(int ic)   { return (k[ic] < lev_kbegin[level[ic]]); }
+   int  is_back_boundary(int ic)    { return (k[ic] > lev_kend[  level[ic]]); }
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Tests for positioning in set of 4 cells
+ *
+ * The arguments here are plain integers that shadow the i and j
+ * member arrays. The "== 1" tests are false for negative odd values,
+ * because % then yields -1.
+ * NOTE(review): presumably only non-negative indices are passed --
+ * confirm against the callers.
+ *******************************************************************/
+   int is_lower(int i)  { return(i % 2 == 0); }
+   int is_upper(int i)  { return(i % 2 == 1); }
+
+   int is_lower_left(int i, int j)  { return(i % 2 == 0 && j % 2 == 0); }
+   int is_lower_right(int i, int j) { return(i % 2 == 1 && j % 2 == 0); }
+   int is_upper_left(int i, int j)  { return(i % 2 == 0 && j % 2 == 1); }
+   int is_upper_right(int i, int j) { return(i % 2 == 1 && j % 2 == 1); }
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Level tests
+ *******************************************************************/
+   int is_same_level_or_coarser(int nn, int nz) { return(level[nn] <= level[nz]); }
+   int is_coarser(int nn, int nz)               { return(level[nn] <  level[nz]); }
+   int is_finer(int nn, int nz)                 { return(level[nn] >  level[nz]); }
+   int is_same_level(int nn, int nz)            { return(level[nn] == level[nz]); }
+///@}
+
+/* accessor routines */
+   double get_cpu_timer(mesh_timer_category category)       {return(cpu_timers[category]); };
+   /* Scale the stored value by 1.0e-9, i.e. nanoseconds to seconds */
+   /* NOTE(review): assumes gpu_timers holds nanoseconds -- confirm where it is filled */
+   double get_gpu_timer(mesh_timer_category category)       {return((double)(gpu_timers[category])*1.0e-9); };
+
+   void parallel_output(const char *string, double    local_value, int output_level, const char *units);
+   void parallel_output(const char *string, long long local_value, int output_level, const char *units);
+   void parallel_output(const char *string, int       local_value, int output_level, const char *units);
+   void timer_output(mesh_timer_category category, mesh_device_types device_type, int timer_level);
+
+   int get_cpu_counter(mesh_counter_category category)      {return(cpu_counters[category]); };
+   int get_gpu_counter(mesh_counter_category category)      {return(gpu_counters[category]); };
+
+   int get_calc_neighbor_type(void);
+
+   void print_partition_measure(void);
+   void print_calc_neighbor_type(void);
+   void print_partition_type(void);
+/* end accessor routines */
+
+/* Debugging, internal, or not used yet */
+#ifdef HAVE_OPENCL
+   int gpu_count_BCs();
+#endif
+   void kdtree_setup(void);
+   void partition_measure(void);
+   void partition_cells(int numpe,
+                   vector<int> &order,
+                   enum partition_method method);
+   void calc_distribution(int numpe);
+   void calc_symmetry(vector<int> &dsym,
+                  vector<int> &xsym,
+                  vector<int> &ysym);
+
+/* End of debugging, internal, or not used yet */
+
+   //void calc_face_list_test(double *H);
+   void calc_face_list(void);
+   void calc_face_list_wmap(void);
+   void calc_face_list_wbidirmap(void);
+   void calc_face_list_clearmaps(void);
+
+   int **get_xface_flag(int lev, bool print_output=0);
+   int **get_yface_flag(int lev, bool print_output=0);
+   void get_flat_grid(int lev, int ***zone_flag, int ***zone_cell);
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate neighbors
+ *
+ * **Parameters**
+ *
+ *  Input -- from within the object
+ *    i, j, level
+ *  Output -- in the object
+ *    nlft, nrht, nbot, ntop arrays
+ *******************************************************************/
+   void calc_neighbors(int ncells);
+   void calc_neighbors_local(void);
+#ifdef HAVE_OPENCL
+   void gpu_calc_neighbors(void);
+   void gpu_calc_neighbors_local(void);
+#endif
+   //   TODO:  Not created yet; overloading for 3D mesh support. (davis68)
+   void calc_neighbors(vector<int> &nlft,
+                  vector<int> &nrht,
+                  vector<int> &nbot,
+                  vector<int> &ntop,
+                  vector<int> &nfrt,
+                  vector<int> &nbak,
+                  vector<int> index);
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate rezone count
+ *
+ * **Parameters**
+ *
+ *  Input
+ *    mpot -- potential mesh refinement
+ *    ioffset -- write offset for each cell
+ *  Output
+ *    result -- cell count
+ *******************************************************************/
+   int  rezone_count(vector<int> mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+   void gpu_rezone_count2(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result);
+   void gpu_rezone_count(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result);
+   void gpu_rezone_scan(size_t block_size, size_t local_work_size, cl_mem dev_ioffset, cl_mem &dev_result);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Refine Smooth -- smooths jump in refinement level so that only a 1 to 2 jump occurs
+ *
+ *  **Parameters**
+ *
+ *  Input/Output
+ *    mpot -- potential mesh refinement array, 1 is refine and -1 coarsen
+ *    ioffset -- write offset for each cell to account for new cells
+ *    result -- refinement count
+ *******************************************************************/
+   size_t refine_smooth(vector<int> &mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+   int gpu_refine_smooth(cl_mem &dev_mpot, int &icount, int &jcount);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Rezone mesh
+ *
+ *  **Parameters**
+ *
+ *  Input
+ *     add_ncells -- for each processor. A global sum will be done and the main part of
+ *        the rezone will be skipped if no cells are added.
+ *     mpot -- mesh rezone potential
+ *     have_state flag -- 0 (false) for setup when physics state has not been allocated
+ *     ioffset -- partial prefix scan results for starting address to write new cells
+ *     state_memory -- linked list of arrays for state
+ *  Output
+ *     new mesh and state arrays with refinement/coarsening performed
+ *******************************************************************/
+   void rezone_all(int icount, int jcount, vector<int> mpot, int have_state, MallocPlus &state_memory);
+#ifdef HAVE_OPENCL
+   void gpu_rezone_all(int icount, int jcount, cl_mem &dev_mpot, MallocPlus &gpu_state_memory);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Load balance -- only needed for parallel (MPI) runs
+ *
+ *  **Parameters**
+ *
+ *  Input
+ *    numcells -- ncells from rezone all routine. This is a copy in so that a local
+ *       value can be used for load_balance and gpu_load_balance without it getting
+ *       reset for clamr_checkall routine
+ *    weight -- weighting array per cell for balancing. Currently not used. Null value
+ *       indicates even weighting of cells for load balance. 
+ *    state_memory or gpu_state_memory -- linked-list of arrays from physics routine
+ *       to be load balanced. 
+ * Output -- arrays will be returned load balanced with new sizes. Pointers to arrays
+ *       will need to be reset
+ *******************************************************************/
+#ifdef HAVE_MPI
+   void do_load_balance_local(size_t numcells, float *weight, MallocPlus &state_memory);
+#ifdef HAVE_OPENCL
+   int gpu_do_load_balance_local(size_t numcells, float *weight, MallocPlus &gpu_state_memory);
+#endif
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate spatial coordinates
+ *
+ *  **Parameters**
+ *
+ *  Input -- from within the object
+ *    i, j, level
+ *  Output
+ *    x, y -- coordinates for each cell
+ *    dx, dy -- size of each cell
+ *******************************************************************/
+   void calc_spatial_coordinates(int ibase);
+#ifdef HAVE_OPENCL
+   void gpu_calc_spatial_coordinates(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Testing routines
+ *******************************************************************/
+#ifdef HAVE_OPENCL
+   void compare_dev_local_to_local(void); // Not currently called
+   void compare_neighbors_gpu_global_to_cpu_global(void);
+#endif
+   void compare_neighbors_cpu_local_to_cpu_global(uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl);
+#ifdef HAVE_OPENCL
+   void compare_neighbors_all_to_gpu_local(Mesh *mesh_global, int *nsizes, int *ndispl);
+   void compare_mpot_gpu_global_to_cpu_global(int *mpot, cl_mem dev_mpot);
+#endif
+   void compare_mpot_cpu_local_to_cpu_global(uint ncells_global, int *nsizes, int *displ, int *mpot, int *mpot_global, int cycle);
+#ifdef HAVE_OPENCL
+   void compare_mpot_all_to_gpu_local(int *mpot, int *mpot_global, cl_mem dev_mpot, cl_mem dev_mpot_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle);
+   void compare_ioffset_gpu_global_to_cpu_global(uint old_ncells, int *mpot);
+   void compare_ioffset_all_to_gpu_local(uint old_ncells, uint old_ncells_global, int block_size, int block_size_global, int *mpot, int *mpot_global, cl_mem dev_ioffset, cl_mem dev_ioffset_global, int *ioffset, int *ioffset_global, int *celltype_global, int *i_global, int *j_global);
+   void compare_coordinates_gpu_global_to_cpu_global_double(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, double *H);
+   void compare_coordinates_gpu_global_to_cpu_global_float(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, float *H);
+#endif
+   void compare_coordinates_cpu_local_to_cpu_global_double(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, double *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, double *H_global, int cycle);
+   void compare_coordinates_cpu_local_to_cpu_global_float(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, float *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, float *H_global, int cycle);
+#ifdef HAVE_OPENCL
+   void compare_indices_gpu_global_to_cpu_global(void);
+#endif
+   void compare_indices_cpu_local_to_cpu_global(uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl, int cycle);
+#ifdef HAVE_OPENCL
+   void compare_indices_all_to_gpu_local(Mesh *mesh_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle);
+#endif
+///@}
+
+   //   Checkpoint support; the Crux object is passed in by the caller.
+   size_t get_checkpoint_size(void);
+   void store_checkpoint(Crux *crux);
+   void restore_checkpoint(Crux *crux);
+
+   void calc_celltype_threaded(size_t ncells);
+   void calc_celltype(size_t ncells);
+
+private:
+   //   Private constructors.
+   Mesh(const Mesh&);   //   Blocks copy constructor so copies are not made inadvertently.
+
+   //   Member functions.
+   void print_object_info();
+
+   void set_refinement_order(int order[4], int ic, int ifirst, int ilast, int jfirst, int jlast,
+                                int level_first, int level_last, int *i, int *j, int *level);
+
+   void write_grid(int ncycle);
+   void calc_centerminmax(void);
+   void calc_minmax(void);
+
+   void print(void);
+   void print_local(void);
+#ifdef HAVE_OPENCL
+   void print_dev_local();
+#endif
+
+};
+
+#endif /* MESH_H */

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/partition.cpp?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp Sun Sep  3 20:10:18 2017
@@ -0,0 +1,764 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifdef HAVE_MPI
+#include "mpi.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <list>
+#include <algorithm>
+#include "partition.h"
+#include "KDTree.h"
+#include "mesh.h"
+#ifdef HAVE_MPI
+#include "s7.h"
+#endif
+#include "zorder.h"
+#include "timer.h"
+#include "hsfc.h"
+
+#if !defined(DEBUG)
+#define DEBUG 0
+#endif
+
+typedef unsigned int uint;
+
+//   Partition-quality bookkeeping shared by partition_measure() and print_partition_measure().
+int    measure_type;             //  selects which measure partition_measure() computes
+int    meas_count       = 0;     //  number of measurements accumulated so far
+double meas_sum_average = 0.0;   //  running sum of the per-measurement averages
+//   Options defined in another translation unit; reported by print_partition_type().
+extern enum partition_method initial_order, cycle_reorder;
+extern bool localStencil;
+
+void Mesh::partition_measure(void) 
+{  //  Score how tile-local the current cell ordering is and add the score to the running totals.
+  if (measure_type != NO_PARTITION_MEASURE){
+     //  Cells are taken in tiles of TILE_SIZE consecutive indices; a neighbor index outside the tile is "off tile".
+     int ntX     = TILE_SIZE; 
+     static double offtile_ratio = 0.0;   //  NOTE(review): static and never reset here, so calls build on earlier totals -- confirm intended
+
+     uint num_groups = (ncells + TILE_SIZE - 1)/TILE_SIZE;
+
+     if (measure_type == WITH_DUPLICATES) {
+        //  Counts every off-tile neighbor reference, repeats included.
+#ifdef _OPENMP
+#pragma omp for reduction(+:offtile_ratio)
+#endif
+        for (uint group_id=0; group_id < num_groups; group_id ++){ 
+ 
+           int start_idx = group_id * ntX;
+           int end_idx   = (group_id + 1) * ntX; 
+           //  i is derived from the tile bounds rather than carried across tiles, so it is right for any OpenMP schedule.
+           int offtile =0;
+           for (int i = start_idx; i < end_idx; i++){ 
+
+              if (i >= ncells) break;   //  the last tile may be only partly filled
+              //taken from wave_kern_calc.cl 'setup tile' kernel
+              if (nlft[i] < start_idx || nlft[i] >= end_idx) offtile++; 
+              if (level[nlft[i]] > level[i] &&
+                 (ntop[nlft[i]] < start_idx || ntop[nlft[i]] >= end_idx) ) offtile++;
+              if (nrht[i] < start_idx || nrht[i] >= end_idx) offtile++; 
+              if (level[nrht[i]] > level[i] &&
+                 (ntop[nrht[i]] < start_idx || ntop[nrht[i]] >= end_idx) ) offtile++;
+              if (nbot[i] < start_idx || nbot[i] >= end_idx) offtile++; 
+              if (level[nbot[i]] > level[i] &&
+                 (nrht[nbot[i]] < start_idx || nrht[nbot[i]] >= end_idx) ) offtile++;
+              if (ntop[i] < start_idx || ntop[i] >= end_idx) offtile++; 
+              if (level[ntop[i]] > level[i] &&
+                 (nrht[ntop[i]] < start_idx || nrht[ntop[i]] >= end_idx) ) offtile++;
+           }
+           offtile_ratio += (double)offtile/(double)(TILE_SIZE);
+           //printf("DEBUG Ratio of surface area to volume is equal to %d / %d ratio is %lf\n", offtile, TILE_SIZE, (double)offtile/(double)TILE_SIZE);
+        }
+     } else if (measure_type == WITHOUT_DUPLICATES) {
+        //  Collects the off-tile neighbor indices and counts each distinct index once per tile.
+#ifdef _OPENMP
+#pragma omp for reduction(+:offtile_ratio)
+#endif
+        for (uint group_id=0; group_id < num_groups; group_id ++){ 
+           list<int> offtile_list;
+ 
+           int start_idx = group_id * ntX;
+           int end_idx   = (group_id + 1) * ntX; 
+
+           for (int i = start_idx; i < end_idx; i++){ 
+
+              if (i >= ncells) break;   //  the last tile may be only partly filled
+
+              if (nlft[i] < start_idx || nlft[i] >= end_idx) offtile_list.push_back(nlft[i]);
+              if (level[nlft[i]] > level[i] &&
+                 (ntop[nlft[i]] < start_idx || ntop[nlft[i]] >= end_idx) ) offtile_list.push_back(ntop[nlft[i]]);
+              if (nrht[i] < start_idx || nrht[i] >= end_idx) offtile_list.push_back(nrht[i]);
+              if (level[nrht[i]] > level[i] &&
+                 (ntop[nrht[i]] < start_idx || ntop[nrht[i]] >= end_idx) ) offtile_list.push_back(ntop[nrht[i]]);
+              if (nbot[i] < start_idx || nbot[i] >= end_idx) offtile_list.push_back(nbot[i]);
+              if (level[nbot[i]] > level[i] &&
+                 (nrht[nbot[i]] < start_idx || nrht[nbot[i]] >= end_idx) ) offtile_list.push_back(nrht[nbot[i]]);
+              if (ntop[i] < start_idx || ntop[i] >= end_idx) offtile_list.push_back(ntop[i]);
+              if (level[ntop[i]] > level[i] &&
+                 (nrht[ntop[i]] < start_idx || nrht[ntop[i]] >= end_idx) ) offtile_list.push_back(nrht[ntop[i]]);
+           }
+           offtile_list.sort();
+           offtile_list.unique();
+        
+           offtile_ratio += (double)offtile_list.size()/(double)(TILE_SIZE);
+           //printf("DEBUG Ratio of surface area to volume is equal to %d / %d ratio is %lf\n", offtile, TILE_SIZE, (double)offtile/(double)TILE_SIZE);
+        }
+     } else if (measure_type == CVALUE) {
+        //  Same distinct off-tile count, normalized by 4*sqrt(TILE_SIZE) instead of TILE_SIZE.
+#ifdef _OPENMP
+#pragma omp for reduction(+:offtile_ratio)
+#endif
+        for (uint group_id=0; group_id < num_groups; group_id ++){ 
+           list<int> offtile_list;
+ 
+           int start_idx = group_id * ntX;
+           int end_idx   = (group_id + 1) * ntX; 
+
+           for (int i = start_idx; i < end_idx; i++){ 
+
+              if (i >= ncells) break;   //  the last tile may be only partly filled
+
+              if (nlft[i] < start_idx || nlft[i] >= end_idx) offtile_list.push_back(nlft[i]);
+              if (level[nlft[i]] > level[i] &&
+                 (ntop[nlft[i]] < start_idx || ntop[nlft[i]] >= end_idx) ) offtile_list.push_back(ntop[nlft[i]]);
+              if (nrht[i] < start_idx || nrht[i] >= end_idx) offtile_list.push_back(nrht[i]);
+              if (level[nrht[i]] > level[i] &&
+                 (ntop[nrht[i]] < start_idx || ntop[nrht[i]] >= end_idx) ) offtile_list.push_back(ntop[nrht[i]]);
+              if (nbot[i] < start_idx || nbot[i] >= end_idx) offtile_list.push_back(nbot[i]);
+              if (level[nbot[i]] > level[i] &&
+                 (nrht[nbot[i]] < start_idx || nrht[nbot[i]] >= end_idx) ) offtile_list.push_back(nrht[nbot[i]]);
+              if (ntop[i] < start_idx || ntop[i] >= end_idx) offtile_list.push_back(ntop[i]);
+              if (level[ntop[i]] > level[i] &&
+                 (nrht[ntop[i]] < start_idx || nrht[ntop[i]] >= end_idx) ) offtile_list.push_back(nrht[ntop[i]]);
+           }
+           offtile_list.sort();
+           offtile_list.unique();
+        
+           offtile_ratio += (double)offtile_list.size()/(4*sqrt((double)(TILE_SIZE)));
+           //printf("DEBUG Ratio of surface area to volume is equal to %d / %d ratio is %lf\n", offtile, TILE_SIZE, (double)offtile/(double)TILE_SIZE);
+        }
+     } else if (measure_type == CSTARVALUE) {
+        //  Also tracks the distinct (index / cache_line_size) buckets touched off tile and combines both counts.
+#ifdef _OPENMP
+#pragma omp for reduction(+:offtile_ratio)
+#endif
+        for (uint group_id=0; group_id < num_groups; group_id ++){ 
+           list<int> offtile_list;
+           list<int> offtile_cache_lines; // Assumes memory is aligned
+           int cache_line_size = 4; // Some could be 8, or more?
+ 
+           int start_idx = group_id * ntX;
+           int end_idx   = (group_id + 1) * ntX; 
+
+           for (int i = start_idx; i < end_idx; i++){ 
+
+              if (i >= ncells) break;   //  the last tile may be only partly filled
+
+              if (nlft[i] < start_idx || nlft[i] >= end_idx) {
+                  offtile_list.push_back(nlft[i]);
+                  offtile_cache_lines.push_back(nlft[i]/cache_line_size);
+              }
+               
+              if (level[nlft[i]] > level[i] && (ntop[nlft[i]] < start_idx || ntop[nlft[i]] >= end_idx) ) {
+                  offtile_list.push_back(ntop[nlft[i]]);
+                  offtile_cache_lines.push_back(ntop[nlft[i]]/cache_line_size);
+              }
+              if (nrht[i] < start_idx || nrht[i] >= end_idx) {
+                  offtile_list.push_back(nrht[i]);
+                  offtile_cache_lines.push_back(nrht[i]/cache_line_size);
+              }
+              if (level[nrht[i]] > level[i] && (ntop[nrht[i]] < start_idx || ntop[nrht[i]] >= end_idx) ) {
+                  offtile_list.push_back(ntop[nrht[i]]);
+                  offtile_cache_lines.push_back(ntop[nrht[i]]/cache_line_size);
+              }
+              if (nbot[i] < start_idx || nbot[i] >= end_idx) {
+                  offtile_list.push_back(nbot[i]);
+                  offtile_cache_lines.push_back(nbot[i]/cache_line_size);
+              }
+              if (level[nbot[i]] > level[i] && (nrht[nbot[i]] < start_idx || nrht[nbot[i]] >= end_idx) ) {
+                  offtile_list.push_back(nrht[nbot[i]]);
+                  offtile_cache_lines.push_back(nrht[nbot[i]]/cache_line_size);
+              }
+              if (ntop[i] < start_idx || ntop[i] >= end_idx) {
+                  offtile_list.push_back(ntop[i]);
+                  offtile_cache_lines.push_back(ntop[i]/cache_line_size);
+              }
+              if (level[ntop[i]] > level[i] && (nrht[ntop[i]] < start_idx || nrht[ntop[i]] >= end_idx) ) {
+                  offtile_list.push_back(nrht[ntop[i]]);
+                  offtile_cache_lines.push_back(nrht[ntop[i]]/cache_line_size);
+              }
+           }
+           offtile_list.sort();
+           offtile_list.unique();
+           offtile_cache_lines.sort();
+           offtile_cache_lines.unique();
+
+           double s_ngeom = (double)(offtile_list.size());
+           double q_ngeom = (double)(offtile_cache_lines.size());
+           double ngeom = (double)(TILE_SIZE);
+           double cover = (double)(cache_line_size);
+//            offtile_ratio += (s_ngeom * q_ngeom) / (4*sqrt(ngeom)*2*(1+(ngeom+cache_line_size-1)/cache_line_size));
+//            offtile_ratio += (q_ngeom) / (2*sqrt(ngeom)+2*((sqrt(ngeom)+cover-1)/cover));
+//            offtile_ratio += (q_ngeom) / ( (8*sqrt(ngeom)+cover-1)/cover );
+           ngeom = sqrt(ngeom);
+           offtile_ratio += (s_ngeom*q_ngeom*cover) / ( 4 * ngeom * (8*ngeom+cover-1) );
+        
+           //printf("DEBUG Ratio of surface area to volume is equal to %d / %d ratio is %lf\n", offtile, TILE_SIZE, (double)offtile/(double)TILE_SIZE);
+        }
+     } 
+
+     // printf("DEBUG Ratio of surface area to volume is equal to %d / %d \n", offtile, ontile);
+     //  Under OpenMP only the master thread updates the file-scope tallies.
+#ifdef _OPENMP
+#pragma omp master
+     {
+#endif
+         meas_count ++;
+         meas_sum_average  += offtile_ratio/(double)num_groups;
+     // printf("DEBUG %d icount %d sum_average %lf\n",__LINE__,icount, sum_average);
+#ifdef _OPENMP
+     }
+#endif
+  } // if PARTITION TYPE
+}
+
+void Mesh::print_partition_measure()
+{  //  Report the mean of the measures gathered by partition_measure(), then the MPI ratio.
+   if (meas_count != 0) {
+      //  Mean over the meas_count values summed into meas_sum_average.
+      const double mean_measure = meas_sum_average/(double)meas_count;
+      switch (measure_type) {
+         case NO_PARTITION_MEASURE:
+            if (mype == 0) printf("No Partition Measure\n");
+            break;
+         case WITH_DUPLICATES:    parallel_output("Average surface area to volume ratio  ", mean_measure, 0, "with duplicates");    break;
+         case WITHOUT_DUPLICATES: parallel_output("Average surface area to volume ratio  ", mean_measure, 0, "without duplicates"); break;
+         case CVALUE:             parallel_output("Partition Quality Avg C value     ", mean_measure, 0, "");  break;
+         case CSTARVALUE:         parallel_output("Partition Quality Avg C* value     ", mean_measure, 0, ""); break;
+         default:                 break;   //  unknown measure types print nothing
+      }
+   }
+   //  offtile_ratio_local is maintained outside this function; it is reported only when numpe > 1.
+   if (numpe > 1){
+      parallel_output("The MPI surface area to volume ratio ", offtile_ratio_local, 0, "without duplicates");
+   }
+}
+
+void Mesh::print_partition_type()
+{  //  Print one line naming the initial order, the cycle reorder and the stencil mode.
+   if (mype == 0) {   //  only the process with mype == 0 prints
+      if (initial_order == ORIGINAL_ORDER) {
+         printf("Initial order is naive.");  
+      } else if (initial_order == HILBERT_SORT) {
+         printf("Initial order is Hilbert sort.");  
+      } else if (initial_order == HILBERT_PARTITION) {
+         printf("Initial order is Hilbert partition.");  
+      } else if (initial_order == ZORDER) {
+         printf("Initial order is Z order.");  
+      }
+      //  Second field on the same output line: the per-cycle reordering method.
+      if (cycle_reorder == ORIGINAL_ORDER) {
+         printf("   No cycle reorder.");  
+      } else if (cycle_reorder == HILBERT_SORT) {
+         printf("   Cycle reorder is Hilbert sort.");  
+      } else if (cycle_reorder == HILBERT_PARTITION) {
+         printf("   Cycle reorder is Hilbert partition.");  
+      } else if (cycle_reorder == ZORDER) {
+         printf("   Cycle reorder is Z order.");  
+      }
+      //  Either branch ends the line with a newline.
+      if (localStencil) {
+         printf("   Local Stencil is on.\n");  
+      } else {
+         printf("\n");
+      }
+   }
+
+}
+void Mesh::partition_cells(
+                    int          numpe,             //  Number of partitions the cells are divided among.
+                    vector<int> &z_order,           //  Resulting index ordering.
+                    enum partition_method method)   //  Assigned partitioning method.
+{  //  Reorders the mesh cells by the requested method and records the ordering in z_order.
+   int           *info;      //  Work array filled by hsfc2sort; its leading entries are the new cell order.
+   double         iscale,    //  Scale factor for the x direction.
+                  jscale;    //  Scale factor for the y direction.
+   int            imax,      //  Maximum x-index.
+                  jmax;      //  Maximum y-index.
+   vector<int>    z_index;   //  Curve index per cell, filled by calc_zorder in the serial ZORDER path.
+   vector<int>    i_scaled;  //  x-indices rescaled by 16/imax for calc_zorder.
+   vector<int>    j_scaled;  //  y-indices rescaled by 16/jmax for calc_zorder.
+   vector<double> iunit;     //  Cell-center x positions scaled by iscale for hsfc2sort.
+   vector<double> junit;     //  Cell-center y positions scaled by jscale for hsfc2sort.
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   //  Initialize ordered curve index.
+   z_index.resize(ncells, 0);
+   //  z_order is not resized here; the caller must pass it with at least ncells entries.
+
+   if (parallel) {
+#ifdef HAVE_MPI
+      nsizes.resize(numpe);
+      ndispl.resize(numpe);
+      MPI_Allgather(&ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      noffset=0;
+      for (int ip=0; ip<mype; ip++){
+        noffset += nsizes[ip];
+      }
+#endif
+   } else {
+      //   Adjust the number of required work items to the number of cells.
+      proc.resize(ncells);
+      //   Decompose the domain equitably.
+      calc_distribution(numpe);
+      noffset = 0;
+   }
+
+   
+   //  Partition cells according to one of several possible orderings.
+   int have_spatial_variables=0;
+   switch (method)
+   {   case ORIGINAL_ORDER:
+         //  Set z_order to the current cell order.
+         for (uint ic = 0; ic < ncells; ++ic)
+         {   z_order[ic] = ic; }
+
+         cpu_timers[MESH_TIMER_PARTITION] += cpu_timer_stop(tstart_cpu);
+
+         return;
+         break;
+
+       case HILBERT_SORT:
+         //  Resort the curve by Hilbert order.
+         have_spatial_variables = 1;
+         if (x.size() < ncells) {
+            calc_spatial_coordinates(0);
+            have_spatial_variables = 0;
+         }
+         calc_centerminmax();
+         iunit.resize(ncells);
+         junit.resize(ncells);
+
+         //   Get the range of values in the x- and y-directions and make the scale square.
+         iscale = 1.0 / (xcentermax - xcentermin);
+         jscale = 1.0 / (ycentermax - ycentermin);
+
+         //   Scale the indices to a normalized [0, 1] range for hsfc.
+         for (uint ic = 0; ic < ncells; ++ic){
+             iunit[ic] = (x[ic] + 0.5 * dx[ic] - xcentermin) * iscale;
+             junit[ic] = (y[ic] + 0.5 * dy[ic] - ycentermin) * jscale;
+         }
+
+         if (have_spatial_variables == 0){
+            x.clear();
+            dx.clear();
+            y.clear();
+            dy.clear();
+         }
+
+         if (parallel){
+#ifdef HAVE_MPI
+            info = (int *)malloc(sizeof(int) * 3 * ncells_global);
+            vector<double>iunit_global(ncells_global);
+            vector<double>junit_global(ncells_global);
+            vector<int>z_order_global(ncells_global);
+
+            MPI_Allgatherv(&iunit[0], ncells, MPI_DOUBLE, &iunit_global[0], &nsizes[0], &ndispl[0], MPI_DOUBLE, MPI_COMM_WORLD);
+            MPI_Allgatherv(&junit[0], ncells, MPI_DOUBLE, &junit_global[0], &nsizes[0], &ndispl[0], MPI_DOUBLE, MPI_COMM_WORLD);
+            //   Sort the mesh into an ordered space-filling curve from hsfc.
+            hsfc2sort(ncells_global, &iunit_global[0], &junit_global[0], 0, info, 1);
+
+            //   Copy the cell order information from info into z_order.
+            for (uint ic = 0; ic < ncells_global; ++ic)
+            {   z_order_global[ic] = info[ic]; }
+            free(info);
+
+            //   Order the mesh according to the calculated order (note that z_order is for both curves).
+            vector<int> int_global(ncells_global);
+            vector<int> int_global_new(ncells_global);
+
+            // gather, reorder and scatter i
+            MPI_Allgatherv(&i[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &i[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // gather, reorder and scatter j
+            MPI_Allgatherv(&j[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &j[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // gather, reorder and scatter level
+            MPI_Allgatherv(&level[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &level[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // It is faster just to recalculate these variables instead of communicating them
+            if (mesh_memory.get_memory_size(celltype) >= ncells) {
+               calc_celltype(mesh_memory.get_memory_size(celltype));
+            }
+
+            if (have_spatial_variables) {
+               calc_spatial_coordinates(0);
+            }
+
+            if (mesh_memory.get_memory_size(nlft) >= ncells) {
+               vector<int> inv_z_order(ncells_global);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  inv_z_order[z_order_global[ic]] = ic;
+               }
+
+               MPI_Allgatherv(&nlft[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nlft[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&nrht[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nrht[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&nbot[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nbot[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&ntop[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &ntop[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+            }
+
+            MPI_Scatterv(&z_order_global[0], &nsizes[0], &ndispl[0], MPI_INT, &z_order[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+         } else {
+            info = (int *)malloc(sizeof(int) * 3 * ncells);
+
+            //   Sort the mesh into an ordered space-filling curve from hsfc.
+            hsfc2sort(ncells, &iunit[0], &junit[0], 0, info, 1);
+
+            //   Copy the cell order information from info into z_order.
+            for (uint ic = 0; ic < ncells; ++ic)
+            {   z_order[ic] = info[ic]; }
+            free(info);
+
+            //   Order the mesh according to the calculated order (note that z_order is for both curves).
+            //   The neighbor arrays get attribute 0x100 before mesh_memory reorders everything it manages.
+
+            mesh_memory.set_memory_attribute(nlft, 0x100);
+            mesh_memory.set_memory_attribute(nrht, 0x100);
+            mesh_memory.set_memory_attribute(nbot, 0x100);
+            mesh_memory.set_memory_attribute(ntop, 0x100);
+
+            mesh_memory.memory_reorder_all(&z_order[0]);
+            memory_reset_ptrs();
+
+            if (x.size() >= ncells) {
+               vector<spatial_t> real_local(ncells);
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = x[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  x[ic] = real_local[z_order[ic]];
+               }
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = dx[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  dx[ic] = real_local[z_order[ic]];
+               }
+           
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = y[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  y[ic] = real_local[z_order[ic]];
+               }
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = dy[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  dy[ic] = real_local[z_order[ic]];
+               }
+            }
+
+         }
+
+         break;
+
+      case ZORDER:
+         //  Resort the curve by z-order.
+         if (parallel) {
+#ifdef HAVE_MPI
+            vector<int>i_global(ncells_global);
+            vector<int>j_global(ncells_global);
+            vector<int>level_global(ncells_global);
+            vector<int>z_index_global(ncells_global);
+            vector<int>z_order_global(ncells_global);
+            MPI_Allgatherv(&i[0], ncells, MPI_INT, &i_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            MPI_Allgatherv(&j[0], ncells, MPI_INT, &j_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            MPI_Allgatherv(&level[0], ncells, MPI_INT, &level_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+            i_scaled.resize(ncells_global);
+            j_scaled.resize(ncells_global);
+
+            //   Find the largest i and j index over all cells.
+            imax = 0;
+            jmax = 0;
+            for (uint ic = 0; ic < ncells_global; ++ic)
+            {   if (i_global[ic] > imax) imax = i_global[ic];
+               if (j_global[ic] > jmax) jmax = j_global[ic]; }
+
+            //   Scale so the largest index maps to 16.  NOTE(review): divides by zero when imax or jmax is 0.
+            iscale = 16.0 / (double)imax;
+            jscale = 16.0 / (double)jmax;
+
+            //   Apply the scale, truncating to int.
+            for (uint ic = 0; ic < ncells_global; ++ic)
+            {   i_scaled[ic]=(int) ( (double)i_global[ic]*iscale);
+               j_scaled[ic]=(int) ( (double)j_global[ic]*jscale); }
+
+            //   Compute the z-order of the global cell set.
+            calc_zorder(ncells_global, &i_scaled[0], &j_scaled[0], &level_global[0], levmx, ibase, &z_index_global[0], &z_order_global[0]);
+
+            //   Order the mesh according to the calculated order (note that z_order is for both curves).
+            vector<int> int_global(ncells_global);
+            vector<int> int_global_new(ncells_global);
+
+            // gather, reorder and scatter i
+            MPI_Allgatherv(&i[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &i[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // gather, reorder and scatter j
+            MPI_Allgatherv(&j[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &j[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // gather, reorder and scatter level
+            MPI_Allgatherv(&level[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            for (int ic = 0; ic<(int)ncells_global; ic++){
+               int_global_new[ic] = int_global[z_order_global[ic]];
+            }
+            MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &level[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+            // It is faster just to recalculate these variables instead of communicating them
+            if (mesh_memory.get_memory_size(celltype) >= ncells) {
+               calc_celltype(mesh_memory.get_memory_size(celltype));
+            }
+
+            if (x.size() >= ncells) {
+               calc_spatial_coordinates(0);
+            }
+
+            if (mesh_memory.get_memory_size(nlft) >= ncells) {
+               vector<int> inv_z_order(ncells_global);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  inv_z_order[z_order_global[ic]] = ic;
+               }
+
+               MPI_Allgatherv(&nlft[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nlft[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&nrht[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nrht[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&nbot[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &nbot[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+
+               MPI_Allgatherv(&ntop[0], ncells, MPI_INT, &int_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+               for (int ic = 0; ic<(int)ncells_global; ic++){
+                  int_global_new[ic] = inv_z_order[int_global[z_order_global[ic]]];
+               }
+               MPI_Scatterv(&int_global_new[0], &nsizes[0], &ndispl[0], MPI_INT, &ntop[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+            }
+            MPI_Scatterv(&z_order_global[0], &nsizes[0], &ndispl[0], MPI_INT, &z_order[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+         } else {
+            i_scaled.resize(ncells);
+            j_scaled.resize(ncells);
+
+            //   Find the largest i and j index over the local cells.
+            imax = 0;
+            jmax = 0;
+            for (uint ic = 0; ic < ncells; ++ic)
+            {   if (i[ic] > imax) imax = i[ic];
+               if (j[ic] > jmax) jmax = j[ic]; }
+
+            //   Scale so the largest index maps to 16.  NOTE(review): divides by zero when imax or jmax is 0.
+            iscale = 16.0 / (double)imax;
+            jscale = 16.0 / (double)jmax;
+
+            //   Apply the scale, truncating to int.
+            for (uint ic = 0; ic < ncells; ++ic)
+            {   i_scaled[ic]=(int) ( (double)i[ic]*iscale);
+               j_scaled[ic]=(int) ( (double)j[ic]*jscale); }
+
+            //   Compute the z-order of the local cells.
+            calc_zorder(ncells, &i_scaled[0], &j_scaled[0], &level[0], levmx, ibase, &z_index[0], &z_order[0]);
+
+            //   Order the mesh according to the calculated order (note that z_order is for both curves).
+            //   The neighbor arrays get attribute 0x100 before mesh_memory reorders everything it manages.
+
+            mesh_memory.set_memory_attribute(nlft, 0x100);
+            mesh_memory.set_memory_attribute(nrht, 0x100);
+            mesh_memory.set_memory_attribute(nbot, 0x100);
+            mesh_memory.set_memory_attribute(ntop, 0x100);
+
+            mesh_memory.memory_reorder_all(&z_order[0]);
+            memory_reset_ptrs();
+
+
+            if (x.size() >= ncells) {
+               vector<spatial_t> real_local(ncells);
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = x[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  x[ic] = real_local[z_order[ic]];
+               }
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = dx[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  dx[ic] = real_local[z_order[ic]];
+               }
+           
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = y[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  y[ic] = real_local[z_order[ic]];
+               }
+
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  real_local[ic] = dy[ic];
+               }
+               for (int ic = 0; ic<(int)ncells; ic++){
+                  dy[ic] = real_local[z_order[ic]];
+               }
+            }
+
+         }
+
+         break;
+
+      default:
+         //  Note that HILBERT_PARTITION is not currently supported due to redundancy with HILBERT_SORT.
+         break;
+   }
+   //   NOTE(review): the dump below passes j[ic] before i[ic] although the header lists "i j", and it reads
+   //   x/dx/y/dy even when the HILBERT_SORT path cleared them above -- confirm before enabling DEBUG.
+   //   Output ordered mesh information.
+   if (DEBUG)
+   {   printf("orig index   i     j     lev    nlft nrht nbot ntop   xlow    xhigh     ylow    yhigh   z index  z order\n");
+      for (uint ic=0; ic<ncells; ic++){
+         printf(" %6d   %4d  %4d   %4d  %4d %4d %4d %4d ", index[ic], j[ic], i[ic], level[ic], nlft[ic], nrht[ic], nbot[ic], ntop[ic]);
+         printf(" %8.2lf %8.2lf %8.2lf %8.2lf", x[ic], x[ic]+dx[ic], y[ic], y[ic]+dy[ic]);
+         printf(" %6d    %5d\n", z_index[ic], z_order[ic]); } }
+
+   cpu_timers[MESH_TIMER_PARTITION] += cpu_timer_stop(tstart_cpu);
+}
+
+//   Assign the cells in proc to numpe work items as contiguous ranges, spreading the extra cells equitably.
+void Mesh::calc_distribution(int numpe)
+{  
+   uint lsize = 0;     //  One past the last cell owned by the current work item.
+   uint ic    = 0;     //  Next cell that has not been assigned yet.
+   for (int ip = 0; ip < numpe; ++ip) {
+      lsize += proc.size()/numpe;
+      if (ip < (int)proc.size()%numpe) lsize++;   //  the first size%numpe items take one extra cell
+      //  Continue from where the previous work item stopped, so earlier assignments are not overwritten.
+      for (; ic < lsize; ic++) { proc[ic] = ip; }
+   }
+}
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/partition.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef PARTITION_H
+#define PARTITION_H
+
+#include <vector>
+
+#include "input.h"
+
+using namespace std;
+
+enum partition_method {      // Selector for a cell ordering/partitioning method; enumerators take the implicit values 0..3.
+   ORIGINAL_ORDER,           // NOTE(review): name suggests the existing cell order is kept -- confirm against the caller
+   HILBERT_SORT,
+   HILBERT_PARTITION,        // mesh.cpp notes this is not currently supported (redundant with HILBERT_SORT)
+   ZORDER
+};
+
+enum partition_measure {     // Selector for a partition measure; enumerators take the implicit values 0..4.
+   NO_PARTITION_MEASURE,     // NOTE(review): presumably turns the measurement off -- confirm against the caller
+   WITH_DUPLICATES,
+   WITHOUT_DUPLICATES,
+   CVALUE,
+   CSTARVALUE
+};
+
+
+void calc_distribution(int numpe, vector<int> &proc);   // NOTE(review): mesh.cpp defines the member Mesh::calc_distribution(int); confirm this free-function form still has a definition
+//void partition_cells(int numpe, vector<int> &proc, Mesh &mesh, enum partition_method method);
+
+typedef void (*maptonorm)( double * , double * , void * );   // pointer to a function taking (double *, double *, void *) and returning nothing
+
+extern "C" void hsfc2sort(     /* C-linkage routine; only declared in this header */
+                const int      N ,     /* IN: Number of points */
+                const double * X ,     /* IN: array of X-Coordinates */
+                const double * Y ,     /* IN: array of Y-Coordinates */
+                const int      ibase ,    /* IN: Stride for Y array -- NOTE(review): hsfc2part documents ibase as "Base - 0 for C, 1 for Fortran"; confirm which is right */
+                int          * Info ,  /* OUT: (1 <= LDInfo) [ HSFC ordering ]
+                                  (2 <= LDInfo) [ HSFC index, #1 ]
+                                  (3 <= LDInfo) [ HSFC index, #2 ] */
+                int            LDInfo /* IN:  Leading dimension of Info */
+                );
+
+extern "C" void hsfc2part(     /* C-linkage routine; only declared in this header */
+               const int      Level , /* IN: Background grid level of partitioning */
+               const int      Limit , /* IN: Number of levels to consider for 'gaps' */
+               const int      NPart , /* IN: Target number of partitions */
+               const int      N ,     /* IN: Number of points */
+               const double * X ,     /* IN: array of X-Coordinates */
+               const double * Y ,     /* IN: array of Y-Coordinates */
+               const int      ibase ,    /* IN: Base - 0 for C, 1 for Fortran */
+                     int    * Info ,  /* IN:  Array of computational weights,
+                                 OUT: (1 <= LDInfo) [ Partitioning ]
+                                 (2 <= LDInfo) [ Adjusted HSFC ordering ]
+                                 (3 <= LDInfo) [ Original HSFC index, #1 ]
+                                 (4 <= LDInfo) [ Original HSFC index, #2 ] */
+                     int      LDInfo );/* IN:  Leading dimension of Info; Info is both read and overwritten (see above) */
+
+
+#endif /* PARTITION_H */

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/reduce.c?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c Sun Sep  3 20:10:18 2017
@@ -0,0 +1,245 @@
+/**
+ *  Copyright (c) 2011, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#include "reduce.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+
+#ifdef HAVE_OPENCL
+#include "reduce_kernel.inc"
+#endif
+
+void init_kernels_reduce(void)   /* Creates all twelve reduction kernels; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    const cl_context ctx = ezcl_get_context();   /* looked up once and shared by every kernel below */
+    kernel_reduce_sum               = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_cl");
+    kernel_reduce_sum_stage1of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage1of2_cl");
+    kernel_reduce_sum_stage2of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage2of2_cl");
+    kernel_reduce_sum_int_stage1of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage1of2_cl");
+    kernel_reduce_sum_int_stage2of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage2of2_cl");
+    kernel_reduce_product           = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_product_cl");
+    kernel_reduce_max               = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_cl");
+    kernel_reduce_max_stage1of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage1of2_cl");
+    kernel_reduce_max_stage2of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage2of2_cl");
+    kernel_reduce_min               = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_cl");
+    kernel_reduce_min_stage1of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage1of2_cl");
+    kernel_reduce_min_stage2of2     = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage2of2_cl");
+#endif
+}
+
+void init_kernel_sum(void)   /* Creates only the "reduce_sum_cl" kernel; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    /* The handle lands in kernel_reduce_sum, which release_kernel_sum() later releases. */
+    kernel_reduce_sum = ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_sum_cl");
+#endif
+}
+
+void init_kernel_2stage_sum(void)   /* Creates both stages of the two-stage sum reduction; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    const cl_context ctx = ezcl_get_context();   /* shared by both stage kernels */
+    kernel_reduce_sum_stage1of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage1of2_cl");
+    kernel_reduce_sum_stage2of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage2of2_cl");
+#endif
+}
+
+void terminate_kernel_2stage_sum(void)   /* Releases both two-stage sum kernels; same body as release_kernel_2stage_sum(). */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+#endif
+}
+
+void init_kernel_2stage_sum_int(void)   /* Creates both stages of the two-stage integer sum; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    const cl_context ctx = ezcl_get_context();   /* shared by both stage kernels */
+    kernel_reduce_sum_int_stage1of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage1of2_cl");
+    kernel_reduce_sum_int_stage2of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage2of2_cl");
+#endif
+}
+
+void terminate_kernel_2stage_sum_int(void)   /* Releases both two-stage integer-sum kernels; same body as release_kernel_2stage_sum_int(). */
+{   
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+#endif
+}
+
+void init_kernel_product(void)   /* Creates only the "reduce_product_cl" kernel; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    /* The handle lands in kernel_reduce_product, which release_kernel_product() later releases. */
+    kernel_reduce_product = ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_product_cl");
+#endif
+}
+
+void init_kernel_max(void)   /* Creates only the "reduce_max_cl" kernel; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    /* The handle lands in kernel_reduce_max, which release_kernel_max() later releases. */
+    kernel_reduce_max = ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_max_cl");
+#endif
+}
+
+void init_kernel_2stage_max(void)   /* Creates both stages of the two-stage max reduction; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    const cl_context ctx = ezcl_get_context();   /* shared by both stage kernels */
+    kernel_reduce_max_stage1of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage1of2_cl");
+    kernel_reduce_max_stage2of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage2of2_cl");
+#endif
+}
+
+void init_kernel_min(void)   /* Creates only the "reduce_min_cl" kernel; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    /* The handle lands in kernel_reduce_min, which release_kernel_min() later releases. */
+    kernel_reduce_min = ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_min_cl");
+#endif
+}
+
+void init_kernel_2stage_min(void)   /* Creates both stages of the two-stage min reduction; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    const cl_context ctx = ezcl_get_context();   /* shared by both stage kernels */
+    kernel_reduce_min_stage1of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage1of2_cl");
+    kernel_reduce_min_stage2of2 = ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage2of2_cl");
+#endif
+}
+
+void release_kernels_reduce(void)   /* Releases all twelve reduction kernel handles; does nothing without HAVE_OPENCL. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum);
+    ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+    ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+    ezcl_kernel_release(kernel_reduce_product);
+    ezcl_kernel_release(kernel_reduce_max);
+    ezcl_kernel_release(kernel_reduce_max_stage1of2);
+    ezcl_kernel_release(kernel_reduce_max_stage2of2);
+    ezcl_kernel_release(kernel_reduce_min);
+    ezcl_kernel_release(kernel_reduce_min_stage1of2);
+    ezcl_kernel_release(kernel_reduce_min_stage2of2);
+#endif
+}
+
+void release_kernel_sum(void)   /* Releases kernel_reduce_sum. (void) gives a real C prototype, matching the init_* functions. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum);
+#endif
+}
+
+void release_kernel_2stage_sum(void)   /* Releases both two-stage sum kernels. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+#endif
+}
+
+void release_kernel_2stage_sum_int(void)   /* Releases both two-stage integer-sum kernels. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+    ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+#endif
+}
+
+void release_kernel_product(void)   /* Releases kernel_reduce_product. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_product);
+#endif
+}
+
+void release_kernel_max(void)   /* Releases kernel_reduce_max. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_max);
+#endif
+}
+
+void release_kernel_2stage_max(void)   /* Releases both two-stage max kernels. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_max_stage1of2);
+    ezcl_kernel_release(kernel_reduce_max_stage2of2);
+#endif
+}
+
+void release_kernel_min(void)   /* Releases kernel_reduce_min. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_min);
+#endif
+}
+
+void release_kernel_2stage_min(void)   /* Releases both two-stage min kernels. */
+{
+#ifdef HAVE_OPENCL
+    ezcl_kernel_release(kernel_reduce_min_stage1of2);
+    ezcl_kernel_release(kernel_reduce_min_stage2of2);
+#endif
+}
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/reduce.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef _REDUCE_H_
+#define _REDUCE_H_
+
+#ifdef HAVE_OPENCL
+#ifdef __APPLE_CC__
+#include <OpenCL/OpenCL.h>
+#else
+#include "CL/cl.h"
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifdef HAVE_OPENCL
+cl_kernel   kernel_reduce_sum,                /* Kernel handles assigned by the init_kernel* functions in reduce.c  */
+            kernel_reduce_sum_stage1of2,      /* and handed to ezcl_kernel_release() by release_*/terminate_*.     */
+            kernel_reduce_sum_stage2of2,
+            kernel_reduce_sum_int_stage1of2,  /* NOTE(review): these are definitions, not extern declarations, so   */
+            kernel_reduce_sum_int_stage2of2,  /* every file that includes this header defines them. C++ includers   */
+            kernel_reduce_product,            /* and C builds using -fno-common would presumably hit duplicate      */
+            kernel_reduce_max,                /* symbols at link time -- confirm, and consider `extern` here with   */
+            kernel_reduce_max_stage1of2,      /* a single definition in reduce.c.                                   */
+            kernel_reduce_max_stage2of2,
+            kernel_reduce_min,
+            kernel_reduce_min_stage1of2,
+            kernel_reduce_min_stage2of2;
+#endif
+
+void init_kernels_reduce(void);          /* assigns all twelve kernel_reduce_* handles */
+void init_kernel_sum(void);              /* assigns kernel_reduce_sum */
+void init_kernel_2stage_sum(void);       /* assigns kernel_reduce_sum_stage1of2 and _stage2of2 */
+void init_kernel_2stage_sum_int(void);   /* assigns kernel_reduce_sum_int_stage1of2 and _stage2of2 */
+void init_kernel_product(void);          /* assigns kernel_reduce_product */
+void init_kernel_max(void);              /* assigns kernel_reduce_max */
+void init_kernel_2stage_max(void);       /* assigns kernel_reduce_max_stage1of2 and _stage2of2 */
+void init_kernel_min(void);              /* assigns kernel_reduce_min */
+void init_kernel_2stage_min(void);       /* assigns kernel_reduce_min_stage1of2 and _stage2of2 */
+/* All functions declared in this header have empty bodies in reduce.c unless HAVE_OPENCL is defined. */
+void terminate_kernel_2stage_sum(void);      /* releases the two-stage sum kernels */
+void terminate_kernel_2stage_sum_int(void);  /* releases the two-stage integer-sum kernels */
+
+void release_kernels_reduce(void);         /* releases all twelve kernel_reduce_* handles */
+void release_kernel_sum(void);             /* (void), not (): in C an empty list declares no prototype at all */
+void release_kernel_2stage_sum(void);      /* releases kernel_reduce_sum_stage1of2 and _stage2of2 */
+void release_kernel_2stage_sum_int(void);  /* releases kernel_reduce_sum_int_stage1of2 and _stage2of2 */
+void release_kernel_product(void);         /* releases kernel_reduce_product */
+void release_kernel_max(void);             /* releases kernel_reduce_max */
+void release_kernel_2stage_max(void);      /* releases kernel_reduce_max_stage1of2 and _stage2of2 */
+void release_kernel_min(void);             /* releases kernel_reduce_min */
+void release_kernel_2stage_min(void);      /* releases kernel_reduce_min_stage1of2 and _stage2of2 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _REDUCE_H_ */
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/s7.c?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c Sun Sep  3 20:10:18 2017
@@ -0,0 +1,977 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "s7.h"
+
+void S7_Sort(
+             void *                 array_in,      // array sorted in place; reinterpreted according to S7_datatype
+             const int              nsize,         // number of elements; for nsize < 2 the array is left untouched
+             const enum S7_Datatype S7_datatype    // element type selector; unsupported values print an error and exit(1)
+             )
+{
+   int n, child, parent, i;
+
+   int qint;
+   long qlong;
+   long long qlonglong;
+   float qfloat;
+   double qdouble;
+
+   int
+   *int_data_ptr;
+   long
+   *long_data_ptr;
+   long long
+   *longlong_data_ptr;
+   float
+   *float_data_ptr;
+   double
+   *double_data_ptr;
+
+   // Heapsort into ascending order: while i > 0 the max-heap is still being built; after that n shrinks
+   // by one per pass as the current maximum (element 0) is moved to the end of the unsorted range.
+   i=nsize/2;
+   n = nsize;
+
+   switch (S7_datatype){
+      case S7_INTEGER4:
+      case S7_INT:
+         int_data_ptr = (int *)array_in;
+
+         for (;;) {
+            if (i > 0) {
+               qint=int_data_ptr[--i];
+            } // if i > 0
+            else {
+               n--;
+               if (n <= 0) {  // <= rather than ==, so nsize <= 0 stops here instead of touching element -1
+                  break; // End the sort here!
+               } // if n
+               qint=int_data_ptr[n];
+               int_data_ptr[n]=int_data_ptr[0];
+            } // else
+
+            parent=i;
+            child = i*2 + 1;
+            while (child < n) {
+               if (child +1 < n && int_data_ptr[child+1] > int_data_ptr[child]) child++;
+               if (int_data_ptr[child] > qint) {
+                  int_data_ptr[parent] = int_data_ptr[child];
+                  parent=child;
+                  child = parent*2 + 1;
+               } // if q
+               else {
+                  break; // Break out of sift while loop
+               } // else
+            } // while
+            int_data_ptr[parent]=qint;
+         } // for
+
+         break;
+
+      case S7_LONG:
+         long_data_ptr = (long *)array_in;
+
+         for (;;) {
+            if (i > 0) {
+               qlong=long_data_ptr[--i];
+            } // if i > 0
+            else {
+               n--;
+               if (n <= 0) {  // also stops safely when nsize <= 0
+                  break; // End the sort here!
+               } // if n
+               qlong=long_data_ptr[n];
+               long_data_ptr[n]=long_data_ptr[0];
+            } // else
+
+            parent=i;
+            child = i*2 + 1;
+            while (child < n) {
+               if (child +1 < n && long_data_ptr[child+1] > long_data_ptr[child]) child++;
+               if (long_data_ptr[child] > qlong) {
+                  long_data_ptr[parent] = long_data_ptr[child];
+                  parent=child;
+                  child = parent*2 + 1;
+               } // if q
+               else {
+                  break; // Break out of sift while loop
+               } // else
+            } // while
+            long_data_ptr[parent]=qlong;
+         } // for
+
+         break;
+
+      case S7_LONG_LONG_INT:
+      case S7_INTEGER8:
+         longlong_data_ptr = (long long *)array_in;
+
+         for (;;) {
+            if (i > 0) {
+               qlonglong=longlong_data_ptr[--i];
+            } // if i > 0
+            else {
+               n--;
+               if (n <= 0) {  // also stops safely when nsize <= 0
+                  break; // End the sort here!
+               } // if n
+               qlonglong=longlong_data_ptr[n];
+               longlong_data_ptr[n]=longlong_data_ptr[0];
+            } // else
+
+            parent=i;
+            child = i*2 + 1;
+            while (child < n) {
+               if (child +1 < n && longlong_data_ptr[child+1] > longlong_data_ptr[child]) child++;
+               if (longlong_data_ptr[child] > qlonglong) {
+                  longlong_data_ptr[parent] = longlong_data_ptr[child];
+                  parent=child;
+                  child = parent*2 + 1;
+               } // if q
+               else {
+                  break; // Break out of sift while loop
+               } // else
+            } // while
+            longlong_data_ptr[parent]=qlonglong;
+         } // for
+
+         break;
+
+      case S7_FLOAT:
+      case S7_REAL4:
+         float_data_ptr = (float *)array_in;
+
+         for (;;) {
+            if (i > 0) {
+               qfloat=float_data_ptr[--i];
+            } // if i > 0
+            else {
+               n--;
+               if (n <= 0) {  // also stops safely when nsize <= 0
+                  break; // End the sort here!
+               } // if n
+               qfloat=float_data_ptr[n];
+               float_data_ptr[n]=float_data_ptr[0];
+            } // else
+
+            parent=i;
+            child = i*2 + 1;
+            while (child < n) {
+               if (child +1 < n && float_data_ptr[child+1] > float_data_ptr[child]) child++;
+               if (float_data_ptr[child] > qfloat) {
+                  float_data_ptr[parent] = float_data_ptr[child];
+                  parent=child;
+                  child = parent*2 + 1;
+               } // if q
+               else {
+                  break; // Break out of sift while loop
+               } // else
+            } // while
+            float_data_ptr[parent]=qfloat;
+         } // for
+
+         break;
+
+      case S7_DOUBLE:
+      case S7_REAL8:
+         double_data_ptr = (double *)array_in;
+
+         for (;;) {
+            if (i > 0) {
+               qdouble=double_data_ptr[--i];
+            } // if i > 0
+            else {
+               n--;
+               if (n <= 0) {  // also stops safely when nsize <= 0
+                  break; // End the sort here!
+               } // if n
+               qdouble=double_data_ptr[n];
+               double_data_ptr[n]=double_data_ptr[0];
+            } // else
+
+            parent=i;
+            child = i*2 + 1;
+            while (child < n) {
+               if (child +1 < n && double_data_ptr[child+1] > double_data_ptr[child]) child++;
+               if (double_data_ptr[child] > qdouble) {
+                  double_data_ptr[parent] = double_data_ptr[child];
+                  parent=child;
+                  child = parent*2 + 1;
+               } // if q
+               else {
+                  break; // Break out of sift while loop
+               } // else
+            } // while
+            double_data_ptr[parent]=qdouble;
+         } // for
+
+         break;
+
+      default:
+         printf("Error -- S7_Datatype not supported in S7_Sort\n");
+         exit(1);
+         break;
+
+   }
+}
+
+
+/*
+ * S7_HEAPSORT_2ARRAYS(TYPE, keys_in, vals_in, count)
+ *
+ * In-place heapsort of the `count` TYPE elements at keys_in into ascending
+ * order.  Every element move made in keys_in is mirrored in vals_in, so each
+ * value stays paired with the key it started beside.  Does nothing when
+ * count < 2, which also keeps count <= 0 from indexing element [-1].
+ * Private to S7_Sort_2Arrays; expands to a single statement.
+ */
+#define S7_HEAPSORT_2ARRAYS(TYPE, keys_in, vals_in, count) \
+   do { \
+      TYPE *s7_keys = (TYPE *)(keys_in); \
+      TYPE *s7_vals = (TYPE *)(vals_in); \
+      int s7_heap_end = (count);      /* elements still inside the heap */ \
+      int s7_build = s7_heap_end / 2; /* parents not yet sifted down */ \
+      while (s7_heap_end > 1) { \
+         TYPE s7_key, s7_val; \
+         int s7_parent, s7_child; \
+         if (s7_build > 0) { \
+            /* Heap construction: pick up the next parent. */ \
+            --s7_build; \
+            s7_key = s7_keys[s7_build]; \
+            s7_val = s7_vals[s7_build]; \
+         } else { \
+            /* Extraction: pick up the tail, park the current max there. */ \
+            --s7_heap_end; \
+            s7_key = s7_keys[s7_heap_end]; \
+            s7_val = s7_vals[s7_heap_end]; \
+            s7_keys[s7_heap_end] = s7_keys[0]; \
+            s7_vals[s7_heap_end] = s7_vals[0]; \
+         } \
+         /* Sift the held pair down from s7_build (0 while extracting). */ \
+         s7_parent = s7_build; \
+         s7_child = 2*s7_parent + 1; \
+         while (s7_child < s7_heap_end) { \
+            if (s7_child + 1 < s7_heap_end && \
+                s7_keys[s7_child+1] > s7_keys[s7_child]) s7_child++; \
+            if (!(s7_keys[s7_child] > s7_key)) break; \
+            s7_keys[s7_parent] = s7_keys[s7_child]; \
+            s7_vals[s7_parent] = s7_vals[s7_child]; \
+            s7_parent = s7_child; \
+            s7_child = 2*s7_parent + 1; \
+         } \
+         s7_keys[s7_parent] = s7_key; \
+         s7_vals[s7_parent] = s7_val; \
+      } \
+   } while (0)
+
+/*
+ * Sort array_in1 into ascending order (heapsort) and apply the same
+ * permutation to array_in2.
+ *
+ *   array_in1   - keys; nsize elements of the type named by S7_datatype
+ *   array_in2   - companion values; same element type and length as array_in1
+ *   nsize       - number of elements in each array; nothing is moved when
+ *                 nsize < 2
+ *   S7_datatype - S7_INT/S7_INTEGER4 (int), S7_LONG (long),
+ *                 S7_LONG_LONG_INT/S7_INTEGER8 (long long),
+ *                 S7_FLOAT/S7_REAL4 (float), S7_DOUBLE/S7_REAL8 (double)
+ *
+ * Any other S7_datatype prints an error and calls exit(1).
+ * Heapsort does not preserve the relative order of equal keys.
+ */
+void S7_Sort_2Arrays(
+                     void *                 array_in1,
+                     void *                 array_in2,
+                     const int              nsize,
+                     const enum S7_Datatype S7_datatype
+                     )
+{
+   switch (S7_datatype){
+      case S7_INTEGER4:
+      case S7_INT:
+         S7_HEAPSORT_2ARRAYS(int, array_in1, array_in2, nsize);
+         break;
+
+      case S7_LONG:
+         S7_HEAPSORT_2ARRAYS(long, array_in1, array_in2, nsize);
+         break;
+
+      case S7_LONG_LONG_INT:
+      case S7_INTEGER8:
+         S7_HEAPSORT_2ARRAYS(long long, array_in1, array_in2, nsize);
+         break;
+
+      case S7_FLOAT:
+      case S7_REAL4:
+         S7_HEAPSORT_2ARRAYS(float, array_in1, array_in2, nsize);
+         break;
+
+      case S7_DOUBLE:
+      case S7_REAL8:
+         S7_HEAPSORT_2ARRAYS(double, array_in1, array_in2, nsize);
+         break;
+
+      default:
+         // Report this routine's own name (the message used to say S7_Sort)
+         printf("Error -- S7_Datatype not supported in S7_Sort_2Arrays\n");
+         exit(1);
+         break;
+   }
+}
+
+#undef S7_HEAPSORT_2ARRAYS
+
+
+
+
+
+/*
+ * S7_HEAPSORT_INDEX(TYPE, data_in, idx, count)
+ *
+ * Heapsort the `count` entries of the int array idx so that the TYPE values
+ * data_in[idx[0]], data_in[idx[1]], ... end up in ascending order.  Only idx
+ * is written; data_in is read through idx and never modified.  Does nothing
+ * when count < 2, which also keeps count <= 0 from indexing idx[-1].
+ * Private to S7_Index_Sort; expands to a single statement.
+ */
+#define S7_HEAPSORT_INDEX(TYPE, data_in, idx, count) \
+   do { \
+      const TYPE *s7_data = (const TYPE *)(data_in); \
+      int s7_heap_end = (count);      /* entries still inside the heap */ \
+      int s7_build = s7_heap_end / 2; /* parents not yet sifted down */ \
+      while (s7_heap_end > 1) { \
+         TYPE s7_key; \
+         int s7_held; \
+         int s7_parent, s7_child; \
+         if (s7_build > 0) { \
+            /* Heap construction: pick up the next parent. */ \
+            s7_held = (idx)[--s7_build]; \
+         } else { \
+            /* Extraction: pick up the tail, park the current max there. */ \
+            --s7_heap_end; \
+            s7_held = (idx)[s7_heap_end]; \
+            (idx)[s7_heap_end] = (idx)[0]; \
+         } \
+         s7_key = s7_data[s7_held]; \
+         /* Sift the held entry down from s7_build (0 while extracting). */ \
+         s7_parent = s7_build; \
+         s7_child = 2*s7_parent + 1; \
+         while (s7_child < s7_heap_end) { \
+            if (s7_child + 1 < s7_heap_end && \
+                s7_data[(idx)[s7_child+1]] > s7_data[(idx)[s7_child]]) s7_child++; \
+            if (!(s7_data[(idx)[s7_child]] > s7_key)) break; \
+            (idx)[s7_parent] = (idx)[s7_child]; \
+            s7_parent = s7_child; \
+            s7_child = 2*s7_parent + 1; \
+         } \
+         (idx)[s7_parent] = s7_held; \
+      } \
+   } while (0)
+
+/*
+ * Build an ascending sort permutation for array_in without moving its data.
+ *
+ *   array_in    - nsize elements of the type named by S7_datatype; read only
+ *   nsize       - number of elements; nsize <= 0 leaves index untouched
+ *   S7_datatype - S7_INT/S7_INTEGER4 (int), S7_LONG (long),
+ *                 S7_LONG_LONG_INT/S7_INTEGER8 (long long),
+ *                 S7_FLOAT/S7_REAL4 (float), S7_DOUBLE/S7_REAL8 (double)
+ *   index       - output, room for nsize ints.  It is first filled with
+ *                 0..nsize-1 and then reordered so that array_in[index[k]]
+ *                 is non-decreasing in k.
+ *
+ * Any other S7_datatype prints an error and calls exit(1); index has already
+ * been filled with 0..nsize-1 at that point.
+ * Heapsort does not preserve the relative order of equal keys.
+ */
+void S7_Index_Sort(
+                   void *                 array_in,
+                   const int              nsize,
+                   const enum S7_Datatype S7_datatype,
+                   int *                  index
+                   )
+{
+   int j;
+
+   // Initialize array with consecutive integers
+   for (j=0; j<nsize; j++) index[j]=j;
+
+   switch (S7_datatype){
+      case S7_INTEGER4:
+      case S7_INT:
+         S7_HEAPSORT_INDEX(int, array_in, index, nsize);
+         break;
+
+      case S7_LONG:
+         S7_HEAPSORT_INDEX(long, array_in, index, nsize);
+         break;
+
+      case S7_LONG_LONG_INT:
+      case S7_INTEGER8:
+         S7_HEAPSORT_INDEX(long long, array_in, index, nsize);
+         break;
+
+      case S7_FLOAT:
+      case S7_REAL4:
+         S7_HEAPSORT_INDEX(float, array_in, index, nsize);
+         break;
+
+      case S7_DOUBLE:
+      case S7_REAL8:
+         S7_HEAPSORT_INDEX(double, array_in, index, nsize);
+         break;
+
+      default:
+         printf("Error -- S7_Datatype not supported in S7_Index_Sort\n");
+         exit(1);
+         break;
+
+   }
+}
+
+#undef S7_HEAPSORT_INDEX
+
+/*
+ * S7_HEAPSORT_INDEXI8(TYPE, data_in, idx, count)
+ *
+ * Heapsort the `count` entries of the long array idx so that the TYPE values
+ * data_in[idx[0]], data_in[idx[1]], ... end up in ascending order.  Only idx
+ * is written; data_in is read through idx and never modified.  Does nothing
+ * when count < 2, which also keeps count <= 0 from indexing idx[-1].
+ * Private to S7_Indexi8_Sort; expands to a single statement.
+ */
+#define S7_HEAPSORT_INDEXI8(TYPE, data_in, idx, count) \
+   do { \
+      const TYPE *s7_data = (const TYPE *)(data_in); \
+      int s7_heap_end = (count);      /* entries still inside the heap */ \
+      int s7_build = s7_heap_end / 2; /* parents not yet sifted down */ \
+      while (s7_heap_end > 1) { \
+         TYPE s7_key; \
+         long s7_held; \
+         int s7_parent, s7_child; \
+         if (s7_build > 0) { \
+            /* Heap construction: pick up the next parent. */ \
+            s7_held = (idx)[--s7_build]; \
+         } else { \
+            /* Extraction: pick up the tail, park the current max there. */ \
+            --s7_heap_end; \
+            s7_held = (idx)[s7_heap_end]; \
+            (idx)[s7_heap_end] = (idx)[0]; \
+         } \
+         s7_key = s7_data[s7_held]; \
+         /* Sift the held entry down from s7_build (0 while extracting). */ \
+         s7_parent = s7_build; \
+         s7_child = 2*s7_parent + 1; \
+         while (s7_child < s7_heap_end) { \
+            if (s7_child + 1 < s7_heap_end && \
+                s7_data[(idx)[s7_child+1]] > s7_data[(idx)[s7_child]]) s7_child++; \
+            if (!(s7_data[(idx)[s7_child]] > s7_key)) break; \
+            (idx)[s7_parent] = (idx)[s7_child]; \
+            s7_parent = s7_child; \
+            s7_child = 2*s7_parent + 1; \
+         } \
+         (idx)[s7_parent] = s7_held; \
+      } \
+   } while (0)
+
+/*
+ * Build an ascending sort permutation for array_in without moving its data.
+ * Same contract as S7_Index_Sort except that the permutation is stored in an
+ * array of long.
+ *
+ *   array_in    - nsize elements of the type named by S7_datatype; read only
+ *   nsize       - number of elements; nsize <= 0 leaves index untouched
+ *   S7_datatype - S7_INT/S7_INTEGER4 (int), S7_LONG (long),
+ *                 S7_LONG_LONG_INT/S7_INTEGER8 (long long),
+ *                 S7_FLOAT/S7_REAL4 (float), S7_DOUBLE/S7_REAL8 (double)
+ *   index       - output, room for nsize longs.  It is first filled with
+ *                 0..nsize-1 and then reordered so that array_in[index[k]]
+ *                 is non-decreasing in k.
+ *
+ * Any other S7_datatype prints an error and calls exit(1); index has already
+ * been filled with 0..nsize-1 at that point.
+ * Heapsort does not preserve the relative order of equal keys.
+ */
+void S7_Indexi8_Sort(
+                     void *                 array_in,
+                     const int              nsize,
+                     const enum S7_Datatype S7_datatype,
+                     long *                 index
+                     )
+{
+   int j;
+
+   // Initialize array with consecutive integers
+   for (j=0; j<nsize; j++) index[j]=j;
+
+   switch (S7_datatype){
+      case S7_INTEGER4:
+      case S7_INT:
+         S7_HEAPSORT_INDEXI8(int, array_in, index, nsize);
+         break;
+
+      case S7_LONG:
+         S7_HEAPSORT_INDEXI8(long, array_in, index, nsize);
+         break;
+
+      case S7_LONG_LONG_INT:
+      case S7_INTEGER8:
+         S7_HEAPSORT_INDEXI8(long long, array_in, index, nsize);
+         break;
+
+      case S7_FLOAT:
+      case S7_REAL4:
+         S7_HEAPSORT_INDEXI8(float, array_in, index, nsize);
+         break;
+
+      case S7_DOUBLE:
+      case S7_REAL8:
+         S7_HEAPSORT_INDEXI8(double, array_in, index, nsize);
+         break;
+
+      default:
+         printf("Error -- S7_Datatype not supported in S7_Indexi8_Sort\n");
+         exit(1);
+         break;
+
+   }
+}
+
+#undef S7_HEAPSORT_INDEXI8

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/s7.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef S7_H_
+#define S7_H_
+
+//#define _S7_DEBUG
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+   /*
+    * Some S7 parameters.
+    */
+
+#define S7_OK   0 /* Successful return. */
+
+   /*
+    * Element-type tags passed to the generic sort routines below.
+    * The sort routines treat these pairs as the same C type:
+    *    S7_INT / S7_INTEGER4            -> int
+    *    S7_LONG                         -> long
+    *    S7_LONG_LONG_INT / S7_INTEGER8  -> long long
+    *    S7_FLOAT / S7_REAL4             -> float
+    *    S7_DOUBLE / S7_REAL8            -> double
+    * The remaining tags are not accepted by S7_Sort_2Arrays, S7_Index_Sort
+    * or S7_Indexi8_Sort; those routines print an error and exit(1) for them.
+    */
+   enum  S7_Datatype
+   {
+      S7_GENERIC8  = 0,
+      S7_BYTE,
+      S7_PACKED,
+
+      S7_CHAR,
+      S7_INT,
+      S7_LONG,
+      S7_LONG_LONG_INT,
+      S7_FLOAT,
+      S7_DOUBLE,
+
+      S7_CHARACTER,
+      S7_LOGICAL,
+      S7_INTEGER4,
+      S7_INTEGER8,
+      S7_REAL4,
+      S7_REAL8,
+
+      /* Inclusive bounds of the valid tag range. */
+      S7_DATATYPE_MIN = S7_GENERIC8,
+      S7_DATATYPE_MAX = S7_REAL8
+   };
+
+
+   /*
+    * Sort the nsize elements of array_in in place into ascending order.
+    * S7_datatype names the element type of array_in.
+    */
+   void S7_Sort(
+                void                   *array_in,
+                const int              nsize,
+                const enum S7_Datatype S7_datatype
+                );
+
+   /*
+    * Sort array_in1 into ascending order and apply the same reordering to
+    * array_in2.  Both arrays hold nsize elements of the type named by
+    * S7_datatype.
+    */
+   void S7_Sort_2Arrays(
+                        void *                 array_in1,
+                        void *                 array_in2,
+                        const int              nsize,
+                        const enum S7_Datatype S7_datatype
+                        );
+
+   /*
+    * Fill index[0..nsize-1] with a permutation of 0..nsize-1 such that
+    * array_in[index[k]] is in ascending order.  array_in is not modified.
+    */
+   void S7_Index_Sort(
+                      void *                 array_in,
+                      const int              nsize,
+                      const enum S7_Datatype S7_datatype,
+                      int *                  index
+                      );
+
+   /*
+    * Same as S7_Index_Sort, but the permutation is stored as long.
+    */
+   void S7_Indexi8_Sort(
+                        void *                 array_in,
+                        const int              nsize,
+                        const enum S7_Datatype S7_datatype,
+                        long *                 index
+                        );
+
+
+   /*
+    * Fixed-type entry points.  Their definitions are not part of this header.
+    * NOTE(review): the names and parameter types suggest these are per-type
+    * variants of the generic routines above (real8 = double,
+    * int8 = long long, int4 = int) -- confirm against the implementation.
+    */
+   void S7_Index_sort_real8(const int n,double array_in[],int index[]);
+   void S7_Index_sort_int8(const int n,long long iarray_in[], int index[]);
+   void S7_Index_sort_int4(const int n, int iarray_in[], int index[]);
+   void S7_Index_sort_real8_int8(const int n,double array_in[],long long index[]);
+
+   void S7_Index_sort_int8_int8(const int n,long long iarray_in[], long long index[]);
+   void S7_Index_sort_int4_int8(const int n, int iarray_in[], long long index[]);
+   void S7_Sort_real8(const int n,double array_in[]);
+   void S7_Sort_int8(const int n,long long array_in[]);
+   void S7_Sort_int4(const int n,int array_in[]);
+   void S7_Sort_real8_real8(const int n,double array_in[],double array_in2[]);
+   void S7_Sort_int8_int8(const int n,long long array_in[],long long array_in2[]);
+   void S7_Sort_int4_int4(const int n,int array_in[],int array_in2[]);
+
+   /*
+    * End prototypes.
+    */
+
+   /*
+    * Close the extern "C" block opened above when compiling under C++.
+    */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* S7_H_ */

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/state.cpp?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp Sun Sep  3 20:10:18 2017
@@ -0,0 +1,3966 @@
+/*
+ *  Copyright (c) 2011-2013, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#include "mesh.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+#include <algorithm>
+#include <queue>
+#include "state.h"
+#include "timer.h"
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#undef DEBUG
+//#define DEBUG 0
+#define DEBUG_RESTORE_VALS 1
+#define TIMING_LEVEL 2
+
+// Floating-point constants for this file.  Exactly one of MINIMUM_PRECISION,
+// MIXED_PRECISION or FULL_PRECISION is expected to be defined by the build;
+// if none is, the constants below are left undefined.
+#if defined(MINIMUM_PRECISION)
+#define ZERO 0.0f
+#define ONE 1.0f
+#define HALF 0.5f
+// This used to read "1.0f-30", which is the expression (1.0f - 30), i.e. -29,
+// and not a small positive tolerance.  1.0e-30f is a normal float value.
+#define EPSILON 1.0e-30f
+#define STATE_EPS        15.0
+// calc refine is done in single precision
+#define REFINE_GRADIENT  0.10f
+#define COARSEN_GRADIENT 0.05f
+#define REFINE_HALF 0.5f
+#define REFINE_NEG_THOUSAND -1000.0f
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+#define ZERO 0.0
+#define ONE 1.0
+#define HALF 0.5
+#define EPSILON 1.0e-30
+#define STATE_EPS        .02
+// calc refine is done in single precision
+#define REFINE_GRADIENT  0.10f
+#define COARSEN_GRADIENT 0.05f
+#define REFINE_HALF 0.5f
+#define REFINE_NEG_THOUSAND -1000.0f
+
+#elif defined(FULL_PRECISION)
+#define ZERO 0.0
+#define ONE 1.0
+#define HALF 0.5
+#define EPSILON 1.0e-30
+#define STATE_EPS        .02
+// in full precision the refine constants are double as well
+#define REFINE_GRADIENT  0.10
+#define COARSEN_GRADIENT 0.05
+#define REFINE_HALF 0.5
+#define REFINE_NEG_THOUSAND -1000.0
+
+#endif
+
+#ifdef _OPENMP
+// Declared only for OpenMP builds; the code that uses it is outside this
+// part of the file.
+static bool iversion_flag = false;
+#endif
+
+// Shorthand for unsigned int.
+typedef unsigned int uint;
+
+// Printable names for the state timers, one string per slot.
+// STATE_TIMER_SIZE is defined elsewhere; nine names are listed here.
+// NOTE(review): presumably the order has to match the timer enumeration that
+// defines STATE_TIMER_SIZE -- confirm against state.h.
+static const char *state_timer_descriptor[STATE_TIMER_SIZE] = {
+   "state_timer_apply_BCs",
+   "state_timer_set_timestep",
+   "state_timer_finite_difference",
+   "state_timer_refine_potential",
+   "state_timer_calc_mpot",
+   "state_timer_rezone_all",
+   "state_timer_mass_sum",
+   "state_timer_read",
+   "state_timer_write"
+};
+
+#ifdef HAVE_OPENCL
+#include "state_kernel.inc"
+#endif
+
+// A running sum together with a correction term, both double.
+// NOTE(review): together with KNUTH_SUM below this looks like the state of a
+// compensated (error-corrected) summation -- confirm where it is used.
+struct esum_type{
+   double sum;
+   double correction;
+};
+#ifdef HAVE_MPI
+// MPI-only globals: a datatype handle, a reduction-operation handle, and the
+// prototype of the reduction callback.  None of them is initialized here.
+// NOTE(review): `commutative` is presumably the flag handed to MPI_Op_create
+// for KNUTH_SUM -- confirm at the call site.
+MPI_Datatype MPI_TWO_DOUBLES;
+MPI_Op KNUTH_SUM;
+int commutative = 1;
+void knuth_sum(struct esum_type *in, struct esum_type *inout, int *len, MPI_Datatype *MPI_TWO_DOUBLES);
+#endif
+
+// File-scope cell count; the code that sets and reads it is outside this
+// part of the file.
+int save_ncells;
+
+#define CONSERVED_EQNS
+
+// Square of x.  The argument is parenthesized so that SQR(a+b) expands to
+// (a+b)*(a+b) rather than a+b*a+b.  x is evaluated twice: do not pass an
+// expression with side effects.
+#define SQR(x) ( (x)*(x) )
+// Smallest of three values; relies on a two-argument min() being in scope.
+#define MIN3(x,y,z) ( min( min((x),(y)), (z)) )
+
+#ifdef HAVE_OPENCL
+// OpenCL kernel handles, compiled in only when HAVE_OPENCL is defined.
+// They are declared here without initializers; the code that creates and
+// releases them is outside this part of the file.
+cl_kernel kernel_set_timestep;
+cl_kernel kernel_reduction_min;
+cl_kernel kernel_copy_state_data;
+cl_kernel kernel_copy_state_ghost_data;
+cl_kernel kernel_apply_boundary_conditions;
+cl_kernel kernel_apply_boundary_conditions_local;
+cl_kernel kernel_apply_boundary_conditions_ghost;
+cl_kernel kernel_calc_finite_difference;
+cl_kernel kernel_refine_potential;
+cl_kernel kernel_reduce_sum_mass_stage1of2;
+cl_kernel kernel_reduce_sum_mass_stage2of2;
+cl_kernel kernel_reduce_epsum_mass_stage1of2;
+cl_kernel kernel_reduce_epsum_mass_stage2of2;
+#endif
+
+// Half-step value of a state variable between a cell and its neighbor.
+// The result is the average of U_i and U_n, each weighted by the other cell's
+// center-to-face distance, minus HALF*deltaT times the difference of the
+// area-scaled fluxes divided by a volume term built from V_i and V_n.
+// The min() factors cap the area ratios at ONE and the volume ratios at HALF.
+inline real_t U_halfstep(// XXX Fix the subindices to be more intuitive XXX
+        real_t    deltaT,     // Timestep
+        real_t    U_i,        // Initial cell's (downwind's) state variable
+        real_t    U_n,        // Next cell's    (upwind's)   state variable
+        real_t    F_i,        // Initial cell's (downwind's) state variable flux
+        real_t    F_n,        // Next cell's    (upwind's)   state variable flux
+        real_t    r_i,        // Initial cell's (downwind's) center to face distance
+        real_t    r_n,        // Next cell's    (upwind's)   center to face distance
+        real_t    A_i,        // Cell's            face surface area
+        real_t    A_n,        // Cell's neighbor's face surface area
+        real_t    V_i,        // Cell's            volume
+        real_t    V_n) {      // Cell's neighbor's volume
+
+   // distance-weighted average  -  half-timestep flux correction
+   return (( r_i*U_n + r_n*U_i ) / ( r_i + r_n )) 
+          - HALF*deltaT*(( F_n*A_n*min(ONE, A_i/A_n) - F_i*A_i*min(ONE, A_n/A_i) )
+                    / ( V_n*min(HALF, V_i/V_n) + V_i*min(HALF, V_n/V_i) ));
+
+}
+
+// Full-step update of a state variable: returns U reduced by (deltaT/dr)
+// times the combined flux difference F_plus - F_minus + G_plus - G_minus.
+inline real_t U_fullstep(
+        real_t    deltaT,
+        real_t    dr,
+        real_t    U,
+        real_t    F_plus,
+        real_t    F_minus,
+        real_t    G_plus,
+        real_t    G_minus) {
+
+   return (U - (deltaT / dr)*(F_plus - F_minus + G_plus - G_minus));
+
+}
+
+
+// Correction weight computed from the eigenvalue and three gradients.
+// Returns HALF*nu*(ONE - r), where nu is built from HALF*U_eigen*deltaT/dr and
+// r is the smaller of the two gradient ratios, clamped to the range [ZERO, ONE].
+inline real_t w_corrector(
+        real_t    deltaT,       // Timestep
+        real_t    dr,           // Cell's center to face distance
+        real_t    U_eigen,      // State variable's eigenvalue (speed)
+        real_t    grad_half,    // Centered gradient
+        real_t    grad_minus,   // Downwind gradient
+        real_t    grad_plus) {  // Upwind gradient
+
+   real_t nu     = HALF * U_eigen * deltaT / dr;
+   nu          = nu * (ONE - nu);
+
+   // Reciprocal of grad_half squared; EPSILON keeps the denominator away from zero
+   real_t rdenom = ONE / max(SQR(grad_half), EPSILON);
+   // Upwind and downwind gradients relative to the centered gradient
+   real_t rplus  = (grad_plus  * grad_half) * rdenom;
+   real_t rminus = (grad_minus * grad_half) * rdenom;
+
+   return HALF*nu*(ONE- max(MIN3(ONE, rplus, rminus), ZERO));
+}
+
+// Build a State bound to the given mesh: zero every CPU and GPU timer, store
+// the mesh pointer and, in MPI builds where MPI is already initialized,
+// register the two-double datatype and the knuth_sum reduction operator.
+State::State(Mesh *mesh_in)
+{
+   // Clear both timer arrays in one sweep
+   for (int it = 0; it < STATE_TIMER_SIZE; ++it){
+      cpu_timers[it] = 0.0;
+      gpu_timers[it] = 0L;
+   }
+
+   mesh = mesh_in;
+
+#ifdef HAVE_MPI
+   int mpi_is_up;
+   MPI_Initialized(&mpi_is_up);
+   if (! mpi_is_up) return;
+
+   MPI_Type_contiguous(2, MPI_DOUBLE, &MPI_TWO_DOUBLES);
+   MPI_Type_commit(&MPI_TWO_DOUBLES);
+   MPI_Op_create((MPI_User_function *)knuth_sum, commutative, &KNUTH_SUM);
+   // FIXME add fini and set size
+   if (mesh->parallel) {
+      state_memory.pinit(MPI_COMM_WORLD, 2L * 1024 * 1024 * 1024);
+   }
+#endif
+}
+
+// Prepare the state for a run. When do_gpu_calc is nonzero and OpenCL support
+// is compiled in, the state kernels are built from state_kern_source; in every
+// case the host arrays are then allocated for mesh->ncells cells.
+void State::init(int do_gpu_calc)
+{
+   if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+      cl_context cl_ctx = ezcl_get_context();
+
+      if (mesh->mype == 0) {
+         printf("Starting compile of kernels in state\n");
+      }
+      const char *defines = NULL;
+      cl_program state_program = ezcl_create_program_wsource(cl_ctx, defines, state_kern_source);
+
+      // One kernel handle per entry point in the state kernel program
+      kernel_set_timestep = ezcl_create_kernel_wprogram(state_program, "set_timestep_cl");
+      kernel_reduction_min = ezcl_create_kernel_wprogram(state_program, "finish_reduction_min_cl");
+      kernel_copy_state_data = ezcl_create_kernel_wprogram(state_program, "copy_state_data_cl");
+      kernel_copy_state_ghost_data = ezcl_create_kernel_wprogram(state_program, "copy_state_ghost_data_cl");
+      kernel_apply_boundary_conditions = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_cl");
+      kernel_apply_boundary_conditions_local = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_local_cl");
+      kernel_apply_boundary_conditions_ghost = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_ghost_cl");
+      kernel_calc_finite_difference = ezcl_create_kernel_wprogram(state_program, "calc_finite_difference_cl");
+      kernel_refine_potential = ezcl_create_kernel_wprogram(state_program, "refine_potential_cl");
+      kernel_reduce_sum_mass_stage1of2 = ezcl_create_kernel_wprogram(state_program, "reduce_sum_mass_stage1of2_cl");
+      kernel_reduce_sum_mass_stage2of2 = ezcl_create_kernel_wprogram(state_program, "reduce_sum_mass_stage2of2_cl");
+      kernel_reduce_epsum_mass_stage1of2 = ezcl_create_kernel_wprogram(state_program, "reduce_epsum_mass_stage1of2_cl");
+      kernel_reduce_epsum_mass_stage2of2 = ezcl_create_kernel_wprogram(state_program, "reduce_epsum_mass_stage2of2_cl");
+
+      // The kernels have been created, so the program object is released here
+      ezcl_program_release(state_program);
+      if (mesh->mype == 0) {
+         printf("Finishing compile of kernels in state\n");
+      }
+#endif
+   }
+
+   // Host-side H, U and V arrays
+   allocate(mesh->ncells);
+}
+
+// Allocate the three state arrays H, U and V with ncells entries each through
+// the state memory manager. The arrays are tagged RESTART_DATA, or
+// LOAD_BALANCE_MEMORY instead when built with HAVE_J7 and the mesh is parallel.
+void State::allocate(size_t ncells)
+{
+#ifdef HAVE_J7
+   const int mem_flags = mesh->parallel ? LOAD_BALANCE_MEMORY : RESTART_DATA;
+#else
+   const int mem_flags = RESTART_DATA;
+#endif
+
+   const size_t elem_size = sizeof(state_t);
+   H = (state_t *)state_memory.memory_malloc(ncells, elem_size, "H", mem_flags);
+   U = (state_t *)state_memory.memory_malloc(ncells, elem_size, "U", mem_flags);
+   V = (state_t *)state_memory.memory_malloc(ncells, elem_size, "V", mem_flags);
+}
+
+// Grow every array held by the state memory manager to new_ncells entries.
+// Nothing happens when the size reported for H is already large enough.
+void State::resize(size_t new_ncells){
+   const size_t allocated = state_memory.get_memory_size(H);
+   if (new_ncells <= allocated) return;
+
+   state_memory.memory_realloc_all(new_ncells);
+}
+
+// Refresh the cached H, U and V pointers by looking each array up by name in
+// the state memory manager.
+void State::memory_reset_ptrs()
+{
+   void *h_ptr = state_memory.get_memory_ptr("H");
+   H = (state_t *)h_ptr;
+
+   void *u_ptr = state_memory.get_memory_ptr("U");
+   U = (state_t *)u_ptr;
+
+   void *v_ptr = state_memory.get_memory_ptr("V");
+   V = (state_t *)v_ptr;
+}
+
+// Release everything the state owns: the host arrays, and in OpenCL builds the
+// device buffers and kernel handles; in MPI builds the parallel part of the
+// state memory manager is finalized when the mesh is parallel.
+void State::terminate(void)
+{
+   // Host state arrays
+   state_memory.memory_delete(H);
+   state_memory.memory_delete(U);
+   state_memory.memory_delete(V);
+
+#ifdef HAVE_OPENCL
+   // Device-side timestep buffer and state arrays
+   ezcl_device_memory_delete(dev_deltaT);
+
+   gpu_state_memory.memory_delete(dev_H);
+   gpu_state_memory.memory_delete(dev_U);
+   gpu_state_memory.memory_delete(dev_V);
+
+   // Kernel handles created in State::init
+   ezcl_kernel_release(kernel_set_timestep);
+   ezcl_kernel_release(kernel_reduction_min);
+   ezcl_kernel_release(kernel_copy_state_data);
+   ezcl_kernel_release(kernel_copy_state_ghost_data);
+   ezcl_kernel_release(kernel_apply_boundary_conditions);
+   ezcl_kernel_release(kernel_apply_boundary_conditions_local);
+   ezcl_kernel_release(kernel_apply_boundary_conditions_ghost);
+   ezcl_kernel_release(kernel_calc_finite_difference);
+   ezcl_kernel_release(kernel_refine_potential);
+   ezcl_kernel_release(kernel_reduce_sum_mass_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_mass_stage2of2);
+   ezcl_kernel_release(kernel_reduce_epsum_mass_stage1of2);
+   ezcl_kernel_release(kernel_reduce_epsum_mass_stage2of2);
+#endif
+#ifdef HAVE_MPI
+   // Counterpart of the pinit call made in the constructor
+   if (mesh->parallel) state_memory.pfini();
+#endif
+}
+
+#ifdef HAVE_MPI
+// User-defined MPI reduction (registered as KNUTH_SUM in the State constructor).
+// Adds the (sum, correction) pair in `in` into `inout` and stores the error of
+// that addition in inout->correction.
+// Only the first element of each buffer is combined; *len is not used to loop.
+void knuth_sum(struct esum_type *in, struct esum_type *inout, int *len, MPI_Datatype *MPI_TWO_DOUBLES)
+{
+   double u, v, upt, up, vpp;
+   u = inout->sum;
+   // Fold both correction terms into the incoming value before adding
+   v = in->sum + (in->correction+inout->correction);
+   upt = u + v;
+   // Recover the parts of u and v that actually made it into upt
+   up = upt - v;
+   vpp = upt - up;
+   inout->sum = upt;
+   // What was lost from u plus what was lost from v
+   inout->correction = (u - up) + (v - vpp);
+
+   // Just to block compiler warnings
+   if (1==2) printf("DEBUG len %d datatype %lld\n",*len,(long long)(*MPI_TWO_DOUBLES) );
+}
+#endif
+
+// Append one boundary cell for every side of a cell that lies on the edge of
+// the mesh. The mesh and state arrays are grown, the new cells receive copies
+// of their interior neighbor's data with the normal velocity component
+// negated, and the neighbor indices are linked in two passes. The previous
+// cell count is kept in save_ncells; the elapsed time is added to the
+// apply-BCs timer.
+// NOTE(review): celltype[] and index[] of the new cells are not assigned in
+// this function -- confirm they are filled in elsewhere.
+void State::add_boundary_cells(void)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // This is for a mesh with no boundary cells -- they are added and
+   // the mesh sizes increased
+   size_t &ncells        = mesh->ncells;
+   vector<int>  &index    = mesh->index;
+   vector<spatial_t> &x        = mesh->x;
+   vector<spatial_t> &dx       = mesh->dx;
+   vector<spatial_t> &y        = mesh->y;
+   vector<spatial_t> &dy       = mesh->dy;
+
+   int *i        = mesh->i;
+   int *j        = mesh->j;
+   int *level    = mesh->level;
+   int *celltype = mesh->celltype;
+   int *nlft     = mesh->nlft;
+   int *nrht     = mesh->nrht;
+   int *nbot     = mesh->nbot;
+   int *ntop     = mesh->ntop;
+
+   vector<int> &lev_ibegin = mesh->lev_ibegin;
+   vector<int> &lev_iend   = mesh->lev_iend;
+   vector<int> &lev_jbegin = mesh->lev_jbegin;
+   vector<int> &lev_jend   = mesh->lev_jend;
+
+   // Pre-count number of cells to add
+   // (a corner cell is counted once per edge it touches)
+   int icount = 0;
+   for (uint ic=0; ic<ncells; ic++) {
+      if (i[ic] == lev_ibegin[level[ic]]) icount++; // Left boundary
+      if (i[ic] == lev_iend[level[ic]])   icount++; // Right boundary
+      if (j[ic] == lev_jbegin[level[ic]]) icount++; // Bottom boundary
+      if (j[ic] == lev_jend[level[ic]])   icount++; // Top boundary
+   }
+      
+   int new_ncells = ncells + icount;
+   // Increase the arrays for the new boundary cells
+   H=(state_t *)state_memory.memory_realloc(new_ncells, H);
+   U=(state_t *)state_memory.memory_realloc(new_ncells, U);
+   V=(state_t *)state_memory.memory_realloc(new_ncells, V);
+   //printf("\nDEBUG add_boundary cells\n"); 
+   //state_memory.memory_report();
+   //printf("DEBUG end add_boundary cells\n\n"); 
+
+   mesh->i        =(int *)mesh->mesh_memory.memory_realloc(new_ncells, i);
+   mesh->j        =(int *)mesh->mesh_memory.memory_realloc(new_ncells, j);
+   mesh->level    =(int *)mesh->mesh_memory.memory_realloc(new_ncells, level);
+   mesh->celltype =(int *)mesh->mesh_memory.memory_realloc(new_ncells, celltype);
+   mesh->nlft     =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nlft);
+   mesh->nrht     =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nrht);
+   mesh->nbot     =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nbot);
+   mesh->ntop     =(int *)mesh->mesh_memory.memory_realloc(new_ncells, ntop);
+   //memory_reset_ptrs();
+   // Refresh the local aliases from the pointers the reallocation returned
+   i        = mesh->i;
+   j        = mesh->j;
+   level    = mesh->level;
+   celltype = mesh->celltype;
+   nlft     = mesh->nlft;
+   nrht     = mesh->nrht;
+   nbot     = mesh->nbot;
+   ntop     = mesh->ntop;
+
+   index.resize(new_ncells);
+   x.resize(new_ncells);
+   dx.resize(new_ncells);
+   y.resize(new_ncells);
+   dy.resize(new_ncells);
+
+   // Mark every neighbor of the new cells as unset
+   for (int nc=ncells; nc<new_ncells; nc++) {
+      nlft[nc] = -1;
+      nrht[nc] = -1;
+      nbot[nc] = -1;
+      ntop[nc] = -1;
+   }
+      
+   // In the first pass, set two of the neighbor indices and all
+   // the other data to be brought across. Set the inverse of the
+   // the velocity to enforce the reflective boundary condition
+   uint nc=ncells;   // index of the next boundary cell to fill
+   for (uint ic=0; ic<ncells; ic++) {
+      // Left edge: new cell sits one column to the left, U is negated
+      if (i[ic] == lev_ibegin[level[ic]]) {
+         nlft[ic] = nc;
+         nlft[nc] = nc;
+         nrht[nc] = ic;
+         i[nc] = lev_ibegin[level[ic]]-1;
+         j[nc] = j[ic];
+         level[nc] = level[ic];
+         dx[nc] = dx[ic];
+         dy[nc] = dy[ic];
+         x[nc] = x[ic]-dx[ic];
+         y[nc] = y[ic];
+         H[nc] =  H[ic];
+         U[nc] = -U[ic];
+         V[nc] =  V[ic];
+         nc++;
+      }
+      // Right edge: new cell sits one column to the right, U is negated
+      if (i[ic] == lev_iend[level[ic]]) {
+         nrht[ic] = nc;
+         nrht[nc] = nc;
+         nlft[nc] = ic;
+         i[nc] = lev_iend[level[ic]]+1;
+         j[nc] = j[ic];
+         level[nc] = level[ic];
+         dx[nc] = dx[ic];
+         dy[nc] = dy[ic];
+         x[nc] = x[ic]+dx[ic];
+         y[nc] = y[ic];
+         H[nc] =  H[ic];
+         U[nc] = -U[ic];
+         V[nc] =  V[ic];
+         nc++;
+      }
+      // Bottom edge: new cell sits one row below, V is negated
+      if (j[ic] == lev_jbegin[level[ic]]) {
+         nbot[ic] = nc;
+         nbot[nc] = nc;
+         ntop[nc] = ic;
+         i[nc] = i[ic];
+         j[nc] = lev_jbegin[level[ic]]-1;
+         level[nc] = level[ic];
+         dx[nc] = dx[ic];
+         dy[nc] = dy[ic];
+         x[nc] = x[ic];
+         y[nc] = y[ic]-dy[ic];
+         H[nc] =  H[ic];
+         U[nc] =  U[ic];
+         V[nc] = -V[ic];
+         nc++;
+      }
+      // Top edge: new cell sits one row above, V is negated
+      if (j[ic] == lev_jend[level[ic]]) {
+         ntop[ic] = nc;
+         ntop[nc] = nc;
+         nbot[nc] = ic;
+         i[nc] = i[ic];
+         j[nc] = lev_jend[level[ic]]+1;
+         level[nc] = level[ic];
+         dx[nc] = dx[ic];
+         dy[nc] = dy[ic];
+         x[nc] = x[ic];
+         y[nc] = y[ic]+dy[ic];
+         H[nc] =  H[ic];
+         U[nc] =  U[ic];
+         V[nc] = -V[ic];
+         nc++;
+      }
+   }
+
+   // Now set the other two neighbor indices
+   // (they are reached by walking through the interior neighbor's own links)
+   for (int nc=ncells; nc<new_ncells; nc++) {
+      // Boundary cell on the left side: fill in bottom and top
+      if (i[nc] == lev_ibegin[level[nc]]-1) {
+         // Need to check if also a bottom boundary cell
+         if (j[nc] == lev_jbegin[level[nc]]){
+           nbot[nc] = nc;
+         } else {
+           nbot[nc] = nlft[nbot[nrht[nc]]];
+         }
+         if (j[nc] == lev_jend[level[nc]]){
+           ntop[nc] = nc;
+         } else {
+           ntop[nc] = nlft[ntop[nrht[nc]]];
+         }
+      }
+      // Boundary cell on the right side: fill in bottom and top
+      if (i[nc] == lev_iend[level[nc]]+1)   {
+         if (level[nc] <= level[nbot[nlft[nc]]]){
+            if (j[nc] == lev_jbegin[level[nc]]){
+               nbot[nc] = nc;
+            } else {
+               nbot[nc] = nrht[nbot[nlft[nc]]];
+            }
+            if (j[nc] == lev_jend[level[nc]]){
+               ntop[nc] = nc;
+            } else {
+               ntop[nc] = nrht[ntop[nlft[nc]]];
+            }
+         // calculation is a little different if going through a
+         // finer zoned region
+         } else {
+            nbot[nc] = nrht[nrht[nbot[nlft[nc]]]];
+            ntop[nc] = nrht[nrht[ntop[nlft[nc]]]];
+         }
+      }
+      // Boundary cell on the bottom side: fill in left and right
+      if (j[nc] == lev_jbegin[level[nc]]-1) {
+         if (i[nc] == lev_ibegin[level[nc]]){
+            nlft[nc] = nc;
+         } else {
+            nlft[nc] = nbot[nlft[ntop[nc]]];
+         }
+         if (i[nc] == lev_iend[level[nc]]){
+            nrht[nc] = nc;
+         } else {
+            nrht[nc] = nbot[nrht[ntop[nc]]];
+         }
+      }
+      // Boundary cell on the top side: fill in left and right
+      if (j[nc] == lev_jend[level[nc]]+1)   {
+         if (level[nc] <= level[nlft[nbot[nc]]]){
+            if (i[nc] == lev_ibegin[level[nc]]){
+               nlft[nc] = nc;
+            } else {
+               nlft[nc] = ntop[nlft[nbot[nc]]];
+            }
+            if (i[nc] == lev_iend[level[nc]]){
+               nrht[nc] = nc;
+            } else {
+               nrht[nc] = ntop[nrht[nbot[nc]]];
+            }
+         } else {
+            nlft[nc] = ntop[ntop[nlft[nbot[nc]]]];
+            nrht[nc] = ntop[ntop[nrht[nbot[nc]]]];
+         }
+      }
+   }
+   // Remember the interior cell count so remove_boundary_cells can undo this
+   save_ncells = ncells;
+   ncells = new_ncells;
+
+   cpu_timers[STATE_TIMER_APPLY_BCS] += cpu_timer_stop(tstart_cpu);
+}
+
+// Reflective boundary conditions for boundary cells whose interior neighbor
+// has an index below mesh->ncells. Each such boundary cell in the range
+// returned by mesh->get_bounds copies H, U and V from that neighbor and
+// negates the velocity component normal to the boundary.
+// Boundary cells whose neighbor index is at or above ncells are left untouched
+// (apply_boundary_conditions_ghost handles those).
+void State::apply_boundary_conditions_local(void)
+{
+   // Plain locals, as in apply_boundary_conditions: function-static pointers
+   // would be shared, and rewritten, by every caller of this routine.
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   size_t &ncells = mesh->ncells;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      // Left boundary: mirror the right neighbor, flip U
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         if (nr < (int)ncells) {
+            H[ic] =  H[nr];
+            U[ic] = -U[nr];
+            V[ic] =  V[nr];
+         }
+      }
+      // Right boundary: mirror the left neighbor, flip U
+      if (mesh->is_right_boundary(ic))  {
+         int nl = nlft[ic];
+         if (nl < (int)ncells) {
+            H[ic] =  H[nl];
+            U[ic] = -U[nl];
+            V[ic] =  V[nl];
+         }
+      }
+      // Bottom boundary: mirror the top neighbor, flip V
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         if (nt < (int)ncells) {
+            H[ic] =  H[nt];
+            U[ic] =  U[nt];
+            V[ic] = -V[nt];
+         }
+      }
+      // Top boundary: mirror the bottom neighbor, flip V
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         if (nb < (int)ncells) {
+            H[ic] =  H[nb];
+            U[ic] =  U[nb];
+            V[ic] = -V[nb];
+         }
+      }
+   }
+}
+
+// Reflective boundary conditions for boundary cells whose interior neighbor
+// has an index at or above mesh->ncells. Each such boundary cell in the range
+// returned by mesh->get_bounds copies H, U and V from that neighbor and
+// negates the velocity component normal to the boundary.
+// Boundary cells whose neighbor index is below ncells are left untouched
+// (apply_boundary_conditions_local handles those).
+void State::apply_boundary_conditions_ghost(void)
+{
+   // Plain locals, as in apply_boundary_conditions: function-static pointers
+   // would be shared, and rewritten, by every caller of this routine.
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   size_t &ncells = mesh->ncells;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      // Left boundary: mirror the right neighbor, flip U
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         if (nr >= (int)ncells) {
+            H[ic] =  H[nr];
+            U[ic] = -U[nr];
+            V[ic] =  V[nr];
+         }
+      }
+      // Right boundary: mirror the left neighbor, flip U
+      if (mesh->is_right_boundary(ic))  {
+         int nl = nlft[ic];
+         if (nl >= (int)ncells) {
+            H[ic] =  H[nl];
+            U[ic] = -U[nl];
+            V[ic] =  V[nl];
+         }
+      }
+      // Bottom boundary: mirror the top neighbor, flip V
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         if (nt >= (int)ncells) {
+            H[ic] =  H[nt];
+            U[ic] =  U[nt];
+            V[ic] = -V[nt];
+         }
+      }
+      // Top boundary: mirror the bottom neighbor, flip V
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         if (nb >= (int)ncells) {
+            H[ic] =  H[nb];
+            U[ic] =  U[nb];
+            V[ic] = -V[nb];
+         }
+      }
+   }
+}
+
+// Reflective boundary conditions for every boundary cell in the range returned
+// by mesh->get_bounds: the cell copies H, U and V from its interior neighbor
+// and negates the velocity component normal to the boundary. A cell that is a
+// boundary in more than one direction is overwritten by each matching case in
+// turn (left, right, bottom, top).
+void State::apply_boundary_conditions(void)
+{
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      // Left boundary: mirror the right neighbor, flip U
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         H[ic] =  H[nr];
+         U[ic] = -U[nr];
+         V[ic] =  V[nr];
+      }
+      // Right boundary: mirror the left neighbor, flip U
+      if (mesh->is_right_boundary(ic))  {
+         int nl = nlft[ic];
+         H[ic] =  H[nl];
+         U[ic] = -U[nl];
+         V[ic] =  V[nl];
+      }
+      // Bottom boundary: mirror the top neighbor, flip V
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         H[ic] =  H[nt];
+         U[ic] =  U[nt];
+         V[ic] = -V[nt];
+      }
+      // Top boundary: mirror the bottom neighbor, flip V
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         H[ic] =  H[nb];
+         U[ic] =  U[nb];
+         V[ic] = -V[nb];
+      }
+   }
+}
+
+// Undo add_boundary_cells: when mesh->have_boundary is false, shrink the state
+// and mesh arrays back to save_ncells entries and make every cell on the edge
+// of the mesh its own neighbor in the outward direction.
+// Does nothing when mesh->have_boundary is true.
+void State::remove_boundary_cells(void)
+{
+   if(! mesh->have_boundary) {
+
+      // In OpenMP builds only the master thread resizes; the barriers keep the
+      // other threads out of the arrays while that happens
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+         size_t &ncells = mesh->ncells;
+
+         // Resize to drop all the boundary cells
+         ncells = save_ncells;
+         H=(state_t *)state_memory.memory_realloc(save_ncells, H);
+         U=(state_t *)state_memory.memory_realloc(save_ncells, U);
+         V=(state_t *)state_memory.memory_realloc(save_ncells, V);
+         //printf("\nDEBUG remove_boundary cells\n"); 
+         //state_memory.memory_report();
+         //printf("DEBUG end remove_boundary cells\n\n"); 
+
+         mesh->i        = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->i);
+         mesh->j        = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->j);
+         mesh->level    = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->level);
+         mesh->celltype = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->celltype);
+         mesh->nlft     = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nlft);
+         mesh->nrht     = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nrht);
+         mesh->nbot     = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nbot);
+         mesh->ntop     = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->ntop);
+
+         // Shrink the per-cell mesh vectors to the same size
+         mesh->index.resize(save_ncells);
+         mesh->x.resize(save_ncells);
+         mesh->dx.resize(save_ncells);
+         mesh->y.resize(save_ncells);
+         mesh->dy.resize(save_ncells);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      mesh->set_bounds(mesh->ncells);
+
+      // Reset the neighbors due to the dropped boundary cells: a cell on the
+      // edge of the mesh becomes its own neighbor in that direction
+      int lowerBound, upperBound;
+      mesh->get_bounds(lowerBound, upperBound);
+      for (uint ic=lowerBound; ic<upperBound; ic++) {
+         if (mesh->i[ic] == mesh->lev_ibegin[mesh->level[ic]]) mesh->nlft[ic] = ic;
+         if (mesh->i[ic] == mesh->lev_iend[mesh->level[ic]])   mesh->nrht[ic] = ic;
+         if (mesh->j[ic] == mesh->lev_jbegin[mesh->level[ic]]) mesh->nbot[ic] = ic;
+         if (mesh->j[ic] == mesh->lev_jend[mesh->level[ic]])   mesh->ntop[ic] = ic;
+      }
+
+   } // end if (! mesh->have_boundary)
+}
+
+// Compute the timestep for the next cycle.
+// For every REAL_CELL in this thread's range the per-cell limit is
+//    sigma / ( (|U|+c)/lev_deltax + (|V|+c)/lev_deltay ),  c = sqrt(g*H)
+// and the smallest limit found is returned, capped at 1000. In OpenMP builds
+// the per-thread minima are merged in a critical section; in MPI builds with a
+// parallel mesh the result is further reduced with MPI_MIN over MPI_COMM_WORLD.
+//   g     -- gravitational constant used for the wave speed
+//   sigma -- numerator of the per-cell timestep limit
+// The elapsed time is added to the set-timestep CPU timer.
+double State::set_timestep(double g, double sigma)
+{
+   // Both results are function-static so that they are shared by all OpenMP
+   // threads. globalmindeltaT is assigned only inside the master region below;
+   // as a stack variable every other thread would return an indeterminate value.
+   static double globalmindeltaT;
+   static double mindeltaT;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   int lowerBounds, upperBounds;
+   mesh->set_bounds(mesh->ncells);
+   mesh->get_bounds(lowerBounds, upperBounds);
+
+   // The master thread resets the shared minimum before anyone merges into it
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      mindeltaT = 1000;
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   double mymindeltaT = 1000.0; // private for each thread
+
+   for (int ic=lowerBounds; ic<upperBounds; ic++) {
+      if (mesh->celltype[ic] == REAL_CELL) {
+         int lev = mesh->level[ic];
+         double wavespeed = sqrt(g*H[ic]);
+         double xspeed = (fabs(U[ic])+wavespeed)/mesh->lev_deltax[lev];
+         double yspeed = (fabs(V[ic])+wavespeed)/mesh->lev_deltay[lev];
+         double deltaT=sigma/(xspeed+yspeed);
+         if (deltaT < mymindeltaT) mymindeltaT = deltaT;
+      }
+   }
+
+   // Merge this thread's minimum into the shared one
+#ifdef _OPENMP
+#pragma omp critical
+   {
+#endif
+      if (mymindeltaT < mindeltaT) mindeltaT = mymindeltaT;
+#ifdef _OPENMP
+   } // End critical region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+
+      globalmindeltaT = mindeltaT;
+#ifdef HAVE_MPI
+      if (mesh->parallel) MPI_Allreduce(&mindeltaT, &globalmindeltaT, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+      cpu_timers[STATE_TIMER_SET_TIMESTEP] += cpu_timer_stop(tstart_cpu);
+#ifdef _OPENMP
+   } // End master region
+#pragma omp barrier
+#endif
+
+   // After the barrier every thread sees the value the master stored
+   return(globalmindeltaT);
+}
+
+#ifdef HAVE_OPENCL
+// GPU version of set_timestep. Launches set_timestep_cl over all cells and,
+// when more than one work group was needed, finish_reduction_min_cl to combine
+// the per-group results; the value is then read back from dev_deltaT. In MPI
+// builds with a parallel mesh it is reduced with MPI_MIN over MPI_COMM_WORLD.
+//   sigma -- passed to the kernel as its sigma argument
+// The elapsed time is added to the set-timestep GPU timer in nanoseconds.
+double State::gpu_set_timestep(double sigma)
+{
+   double deltaT, globalmindeltaT;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+#ifdef HAVE_MPI
+   int &parallel        = mesh->parallel;
+#endif
+   cl_mem &dev_level    = mesh->dev_level;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+
+   assert(dev_H);
+   assert(dev_U);
+   assert(dev_V);
+   assert(dev_level);
+   assert(dev_celltype);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   // Round the global size up to a whole number of work groups
+   size_t local_work_size = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+   size_t block_size     = global_work_size/local_work_size;
+
+   // One scratch entry per work group
+   cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+      /*
+      __kernel void set_timestep_cl(
+                       const int       ncells,     // 0  Total number of cells.
+                       const real_t    sigma,      // 1
+              __global const state_t  *H,          // 2
+              __global const state_t  *U,          // 3
+              __global const state_t  *V,          // 4
+              __global const int      *level,      // 5  Array of level information.
+              __global const int      *celltype,   // 6
+              __global const real_t   *lev_dx,     // 7
+              __global const real_t   *lev_dy,     // 8
+              __global       real_t   *redscratch, // 9
+              __global       real_t   *deltaT,     // 10
+              __local        real_t   *tile)       // 11
+      */
+
+   // The kernels declare these arguments as int, so they are passed through
+   // cl_int objects; handing over the address of a size_t with sizeof(cl_int)
+   // would only pick up the right bytes on little-endian hosts.
+   cl_int ncells_arg     = (cl_int)ncells;
+   cl_int block_size_arg = (cl_int)block_size;
+
+   real_t sigma_local = sigma;
+   ezcl_set_kernel_arg(kernel_set_timestep,  0, sizeof(cl_int),  (void *)&ncells_arg);
+   ezcl_set_kernel_arg(kernel_set_timestep,  1, sizeof(cl_real_t), (void *)&sigma_local);
+   ezcl_set_kernel_arg(kernel_set_timestep,  2, sizeof(cl_mem),  (void *)&dev_H);
+   ezcl_set_kernel_arg(kernel_set_timestep,  3, sizeof(cl_mem),  (void *)&dev_U);
+   ezcl_set_kernel_arg(kernel_set_timestep,  4, sizeof(cl_mem),  (void *)&dev_V);
+   ezcl_set_kernel_arg(kernel_set_timestep,  5, sizeof(cl_mem),  (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_set_timestep,  6, sizeof(cl_mem),  (void *)&dev_celltype);
+   ezcl_set_kernel_arg(kernel_set_timestep,  7, sizeof(cl_mem),  (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_set_timestep,  8, sizeof(cl_mem),  (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_set_timestep,  9, sizeof(cl_mem),  (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_set_timestep, 10, sizeof(cl_mem),  (void *)&dev_deltaT);
+   ezcl_set_kernel_arg(kernel_set_timestep, 11, local_work_size*sizeof(cl_real_t),  NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_set_timestep, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+   // A second pass is needed only when there is more than one work group
+   if (block_size > 1){
+         /*
+         __kernel void finish_reduction_min_cl(
+           const    int      isize,
+           __global real_t  *redscratch,
+           __global real_t  *deltaT,
+           __local  real_t  *tile)
+         */
+      ezcl_set_kernel_arg(kernel_reduction_min, 0, sizeof(cl_int),  (void *)&block_size_arg);
+      ezcl_set_kernel_arg(kernel_reduction_min, 1, sizeof(cl_mem),  (void *)&dev_redscratch);
+      ezcl_set_kernel_arg(kernel_reduction_min, 2, sizeof(cl_mem),  (void *)&dev_deltaT);
+      ezcl_set_kernel_arg(kernel_reduction_min, 3, local_work_size*sizeof(cl_real_t), NULL);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_min, 1, NULL, &local_work_size, &local_work_size, NULL);
+   }
+
+   // Blocking read of the single result value
+   real_t deltaT_local;
+   ezcl_enqueue_read_buffer(command_queue, dev_deltaT, CL_TRUE,  0, sizeof(cl_real_t), &deltaT_local, NULL);
+   deltaT = deltaT_local;
+
+   globalmindeltaT = deltaT;
+#ifdef HAVE_MPI
+   if (parallel) MPI_Allreduce(&deltaT, &globalmindeltaT, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+   ezcl_device_memory_delete(dev_redscratch);
+
+   gpu_timers[STATE_TIMER_SET_TIMESTEP] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+   return(globalmindeltaT);
+}
+#endif
+
+// Set the initial state: H is background everywhere and U, V are zero. Cells
+// returned by the circle-interior k-D tree query get H = fill_value, and cells
+// returned by the weighted circle-intersection query get H blended between
+// background and fill_value by the returned weight. The k-D tree is built
+// at the start of the queries and destroyed before returning.
+void State::fill_circle(double  circ_radius,//  Radius of circle in grid units.
+                        double  fill_value, //  Circle height for shallow water.
+                        double  background) //  Background height for shallow water.
+{
+   size_t &ncells = mesh->ncells;
+   vector<spatial_t> &xc   = mesh->x;
+   vector<spatial_t> &xlen = mesh->dx;
+   vector<spatial_t> &yc   = mesh->y;
+   vector<spatial_t> &ylen = mesh->dy;
+
+   // Flat surface at rest
+   for (uint cell = 0; cell < ncells; cell++) {
+      H[cell] = background;
+      V[cell] = 0.0;
+      U[cell] = V[cell];
+   }
+
+   //  Generate new k-D tree data (slow but necessary here).
+   mesh->kdtree_setup();
+
+   int nfound;
+   vector<int>    hits(ncells);
+   vector<double> weight(ncells);
+   int    *hits_ptr   = &(hits[0]);
+   double *weight_ptr = &(weight[0]);
+
+   // Pass 1: cells reported as interior to the circle
+#ifdef FULL_PRECISION
+   KDTree_QueryCircleInterior_Double(&mesh->tree, &nfound, hits_ptr, circ_radius, ncells,
+                                     &xc[0], &xlen[0],
+                                     &yc[0], &ylen[0]);
+#else
+   KDTree_QueryCircleInterior_Float(&mesh->tree, &nfound, hits_ptr, circ_radius, ncells,
+                                    &xc[0], &xlen[0],
+                                    &yc[0], &ylen[0]);
+#endif
+   for (int k = 0; k < nfound; ++k) {
+      H[hits[k]] = fill_value;
+   }
+
+   // Pass 2: cells reported as intersecting the circle, with a weight each
+#ifdef FULL_PRECISION
+   KDTree_QueryCircleIntersectWeighted_Double(&mesh->tree, &nfound, hits_ptr, weight_ptr,
+                              circ_radius, ncells,
+                              &xc[0], &xlen[0],
+                              &yc[0], &ylen[0]);
+#else
+   KDTree_QueryCircleIntersectWeighted_Float(&mesh->tree, &nfound, hits_ptr, weight_ptr,
+                              circ_radius, ncells,
+                              &xc[0], &xlen[0],
+                              &yc[0], &ylen[0]);
+#endif
+
+   for (int k = 0; k < nfound; ++k) {
+      H[hits[k]] = background + (fill_value - background) * weight[k];
+   }
+
+   KDTree_Destroy(&mesh->tree);
+}
+
+// Reorder the H, U and V arrays through the state memory manager using the
+// ordering in iorder, and store the pointers it returns.
+void State::state_reorder(vector<int> iorder)
+{
+   int *order = &iorder[0];
+
+   H = state_memory.memory_reorder(H, order);
+   U = state_memory.memory_reorder(U, order);
+   V = state_memory.memory_reorder(V, order);
+}
+
+// Rezone the mesh and the state arrays together. The work is delegated to
+// mesh->rezone_all, which is handed state_memory; afterwards the cached H, U
+// and V pointers are refreshed and the elapsed time is added to the rezone
+// CPU timer. In OpenMP builds the refresh and timer update run on the master
+// thread only.
+void State::rezone_all(int icount, int jcount, vector<int> mpot)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   mesh->rezone_all(icount, jcount, mpot, 1, state_memory);
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   // Pick up the current array pointers from state_memory after the rezone
+   memory_reset_ptrs();
+
+   cpu_timers[STATE_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
+#ifdef _OPENMP
+   } // end master region
+#endif
+}
+
+
+#ifdef HAVE_OPENCL
+// GPU version of rezone_all. The work is delegated to mesh->gpu_rezone_all,
+// which is handed dev_mpot and gpu_state_memory; afterwards the device buffer
+// handles dev_H, dev_U and dev_V are looked up again by name and the elapsed
+// time is added to the rezone GPU timer in nanoseconds.
+// localStencil is not used apart from the warning-suppression line below.
+void State::gpu_rezone_all(int icount, int jcount, bool localStencil)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- localStencil is %d\n",localStencil);
+
+   mesh->gpu_rezone_all(icount, jcount, dev_mpot, gpu_state_memory);
+   // Pick up the current buffer handles from gpu_state_memory after the rezone
+   dev_H = (cl_mem)gpu_state_memory.get_memory_ptr("dev_H");
+   dev_U = (cl_mem)gpu_state_memory.get_memory_ptr("dev_U");
+   dev_V = (cl_mem)gpu_state_memory.get_memory_ptr("dev_V");
+
+   gpu_timers[STATE_TIMER_REZONE_ALL] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+}
+#endif
+
+// Flux helper macros. Every macro below expands to an expression over names
+// that must be in scope where it is used: ghalf plus either the H/U/V arrays
+// (the forms taking ic) or local copies such as Hic, Ul, Vr, Hxminus, ...
+
+//define macro for squaring a number
+#define SQ(x) ((x)*(x))
+//define macro to find minimum of 3 values
+//#define MIN3(a,b,c) (min(min((a),(b)),(c)))
+
+// x-direction fluxes indexed by cell
+#define HXFLUX(ic)  ( U[ic] )
+#define UXFLUX(ic)  ( SQ(U[ic])/H[ic] + ghalf*SQ(H[ic]) )
+#define UVFLUX(ic)  ( U[ic]*V[ic]/H[ic] )
+
+// x-direction fluxes from local copies; the suffix names the set of locals
+// (IC -> Hic/Uic/Vic, NL -> Hl/Ul/Vl, NR -> ..r, NB -> ..b, NT -> ..t)
+#define HXFLUXIC ( Uic )
+#define HXFLUXNL ( Ul )
+#define HXFLUXNR ( Ur )
+#define HXFLUXNB ( Ub )
+#define HXFLUXNT ( Ut )
+
+#define UXFLUXIC ( SQ(Uic)/Hic + ghalf*SQ(Hic) )
+#define UXFLUXNL ( SQ(Ul)/Hl + ghalf*SQ(Hl) )
+#define UXFLUXNR ( SQ(Ur)/Hr + ghalf*SQ(Hr) )
+#define UXFLUXNB ( SQ(Ub)/Hb + ghalf*SQ(Hb) )
+#define UXFLUXNT ( SQ(Ut)/Ht + ghalf*SQ(Ht) )
+
+#define UVFLUXIC ( Uic*Vic/Hic )
+#define UVFLUXNL ( Ul*Vl/Hl )
+#define UVFLUXNR ( Ur*Vr/Hr )
+#define UVFLUXNB ( Ub*Vb/Hb )
+#define UVFLUXNT ( Ut*Vt/Ht )
+
+// y-direction fluxes indexed by cell
+#define HYFLUX(ic)  ( V[ic] )
+#define VUFLUX(ic)  ( V[ic]*U[ic]/H[ic] )
+#define VYFLUX(ic)  ( SQ(V[ic])/H[ic] + ghalf*SQ(H[ic]) )
+
+// y-direction fluxes from local copies (same suffix scheme as above)
+#define HYFLUXIC ( Vic )
+#define HYFLUXNL ( Vl )
+#define HYFLUXNR ( Vr )
+#define HYFLUXNB ( Vb )
+#define HYFLUXNT ( Vt )
+
+#define VUFLUXIC  ( Vic*Uic/Hic )
+#define VUFLUXNL  ( Vl*Ul/Hl )
+#define VUFLUXNR  ( Vr*Ur/Hr )
+#define VUFLUXNB  ( Vb*Ub/Hb )
+#define VUFLUXNT  ( Vt*Ut/Ht )
+
+#define VYFLUXIC  ( SQ(Vic)/Hic + ghalf*SQ(Hic) )
+#define VYFLUXNL  ( SQ(Vl)/Hl + ghalf*SQ(Hl) )
+#define VYFLUXNR  ( SQ(Vr)/Hr + ghalf*SQ(Hr) )
+#define VYFLUXNB  ( SQ(Vb)/Hb + ghalf*SQ(Hb) )
+#define VYFLUXNT  ( SQ(Vt)/Ht + ghalf*SQ(Ht) )
+
+
+// Fluxes built from the x-direction locals Hxminus/Uxminus/Vxminus and
+// Hxplus/Uxplus/Vxplus
+#define HNEWXFLUXMINUS  ( Uxminus )
+#define HNEWXFLUXPLUS   ( Uxplus )
+#define UNEWXFLUXMINUS  ( SQ(Uxminus)/Hxminus + ghalf*SQ(Hxminus) )
+#define UNEWXFLUXPLUS   ( SQ(Uxplus) /Hxplus +  ghalf*SQ(Hxplus)  )
+#define UVNEWFLUXMINUS  ( Uxminus*Vxminus/Hxminus )
+#define UVNEWFLUXPLUS   ( Uxplus *Vxplus /Hxplus  )
+
+// Fluxes built from the y-direction locals Hyminus/Uyminus/Vyminus and
+// Hyplus/Uyplus/Vyplus
+#define HNEWYFLUXMINUS  ( Vyminus )
+#define HNEWYFLUXPLUS   ( Vyplus  )
+#define VNEWYFLUXMINUS  ( SQ(Vyminus)/Hyminus + ghalf*SQ(Hyminus) )
+#define VNEWYFLUXPLUS   ( SQ(Vyplus) /Hyplus  + ghalf*SQ(Hyplus)  )
+#define VUNEWFLUXMINUS  ( Vyminus*Uyminus/Hyminus )
+#define VUNEWFLUXPLUS   ( Vyplus *Uyplus /Hyplus )
+
+// XXX ADDED XXX
+// Variants for the lt/rt/br/tr locals and for the "2" versions of the plus and
+// minus locals. These use SQR rather than SQ for squaring.
+#define HXFLUXNLT ( Ult )
+#define HXFLUXNRT ( Urt )
+#define UXFLUXNLT ( SQR(Ult)/Hlt + ghalf*SQR(Hlt) )
+#define UXFLUXNRT ( SQR(Urt)/Hrt + ghalf*SQR(Hrt) )
+#define UVFLUXNLT ( Ult*Vlt/Hlt )
+#define UVFLUXNRT ( Urt*Vrt/Hrt )
+#define HYFLUXNBR ( Vbr )
+#define HYFLUXNTR ( Vtr )
+#define VUFLUXNBR  ( Vbr*Ubr/Hbr )
+#define VUFLUXNTR  ( Vtr*Utr/Htr )
+#define VYFLUXNBR  ( SQR(Vbr)/Hbr + ghalf*SQR(Hbr) )
+#define VYFLUXNTR  ( SQR(Vtr)/Htr + ghalf*SQR(Htr) )
+#define HNEWXFLUXMINUS2  ( Uxminus2 )
+#define HNEWXFLUXPLUS2   ( Uxplus2 )
+#define UNEWXFLUXMINUS2  ( SQR(Uxminus2)/Hxminus2 + ghalf*SQR(Hxminus2) )
+#define UNEWXFLUXPLUS2   ( SQR(Uxplus2) /Hxplus2 +  ghalf*SQR(Hxplus2)  )
+#define UVNEWFLUXMINUS2  ( Uxminus2*Vxminus2/Hxminus2 )
+#define UVNEWFLUXPLUS2   ( Uxplus2 *Vxplus2 /Hxplus2  )
+#define HNEWYFLUXMINUS2  ( Vyminus2 )
+#define HNEWYFLUXPLUS2   ( Vyplus2  )
+#define VNEWYFLUXMINUS2  ( SQR(Vyminus2)/Hyminus2 + ghalf*SQR(Hyminus2) )
+#define VNEWYFLUXPLUS2   ( SQR(Vyplus2) /Hyplus2  + ghalf*SQR(Hyplus2)  )
+#define VUNEWFLUXMINUS2  ( Vyminus2*Uyminus2/Hyminus2 )
+#define VUNEWFLUXPLUS2   ( Vyplus2 *Uyplus2 /Hyplus2 )
+
+// Advances the state arrays H, U and V by one step of size deltaT.
+//
+// For every cell in the [lowerBound, upperBound) range returned by
+// mesh->get_bounds() the routine
+//   1. gathers the values of the cell, its left/right/bottom/top neighbours,
+//      their next neighbours, and (when a neighbour has a higher level than
+//      the cell) the second cell that shares that face,
+//   2. computes half-step face values with U_halfstep() and the fluxes built
+//      from them (the *NEW*FLUX* macros),
+//   3. computes the w_corrector() terms ("artificial viscosity corrections"),
+//   4. combines everything with U_fullstep() into H_new/U_new/V_new.
+// Afterwards H, U and V are exchanged for the new arrays through
+// state_memory.memory_replace() and the elapsed time is added to
+// cpu_timers[STATE_TIMER_FINITE_DIFFERENCE].
+//
+// deltaT: step size handed to U_halfstep(), w_corrector() and U_fullstep().
+//
+// NOTE(review): the "omp master"/"omp barrier" pragmas suggest this routine is
+// meant to be executed by every thread of an enclosing parallel region --
+// confirm against the callers.
+void State::calc_finite_difference(double deltaT){
+   real_t   g     = 9.80;   // gravitational constant
+   real_t   ghalf = 0.5*g;  // referenced by the *FLUX* macros
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   size_t ncells     = mesh->ncells;
+   size_t &ncells_ghost = mesh->ncells_ghost;
+   // Make sure the ghost count is never smaller than the real cell count;
+   // it is used below as the allocation size of the new state arrays.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   if (ncells_ghost < ncells) ncells_ghost = ncells;
+
+   //printf("\nDEBUG finite diff\n"); 
+
+#ifdef HAVE_MPI
+   // We need to populate the ghost regions since the calc neighbors has just been
+   // established for the mesh shortly before
+   if (mesh->numpe > 1) {
+      apply_boundary_conditions_local();
+
+      // NOTE(review): calc_finite_difference_via_faces places an "omp barrier"
+      // in front of its equivalent master region; confirm whether one is
+      // needed here as well before H/U/V are reallocated.
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      H=(state_t *)state_memory.memory_realloc(ncells_ghost, H);
+      U=(state_t *)state_memory.memory_realloc(ncells_ghost, U);
+      V=(state_t *)state_memory.memory_realloc(ncells_ghost, V);
+
+      L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      apply_boundary_conditions_ghost();
+   } else {
+      apply_boundary_conditions();
+   }
+#else
+   apply_boundary_conditions();
+#endif
+
+   // NOTE(review): presumably static so that every thread sees the pointers
+   // assigned inside the master-only allocation below -- confirm.
+   static state_t *H_new, *U_new, *V_new;
+   int *nlft, *nrht, *nbot, *ntop, *level;
+
+   nlft  = mesh->nlft;
+   nrht  = mesh->nrht;
+   nbot  = mesh->nbot;
+   ntop  = mesh->ntop;
+   level = mesh->level;
+
+   vector<real_t> &lev_deltax = mesh->lev_deltax;
+   vector<real_t> &lev_deltay = mesh->lev_deltay;
+
+   int flags = RESTART_DATA;
+#if defined (HAVE_J7)
+   if (mesh->parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+   // Allocate the output arrays once (master thread only under OpenMP); the
+   // barrier that follows keeps the other threads from using them too early.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   {
+      H_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "H_new", flags);
+      U_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "U_new", flags);
+      V_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "V_new", flags);
+   }
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+
+   for(int gix = lowerBound; gix < upperBound; gix++) {
+#if DEBUG >= 3
+      printf("%d: DEBUG gix is %d at line %d in file %s\n",mesh->mype,gix,__LINE__,__FILE__);
+#endif
+
+      // Level and the four face neighbours of the current cell.
+      int lvl     = level[gix];
+      int nl      = nlft[gix];
+      int nr      = nrht[gix];
+      int nt      = ntop[gix];
+      int nb      = nbot[gix];
+
+      real_t Hic     = H[gix];
+      real_t Uic     = U[gix];
+      real_t Vic     = V[gix];
+
+      // The DEBUG >= 3 diagnostics report neighbour indices outside
+      // [0, ncells_ghost). The indices are ints, hence %d.
+#if DEBUG >= 3
+      if (nl < 0 || nl >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nl %d\n",mesh->mype,__FILE__,__LINE__,nl);
+#endif
+      int nll     = nlft[nl];
+      real_t Hl      = H[nl];
+      real_t Ul      = U[nl];
+      real_t Vl      = V[nl];
+
+#if DEBUG >= 3
+      if (nr < 0 || nr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nr %d\n",mesh->mype,__FILE__,__LINE__,nr);
+#endif
+      int nrr     = nrht[nr];
+      real_t Hr      = H[nr];
+      real_t Ur      = U[nr];
+      real_t Vr      = V[nr];
+
+#if DEBUG >= 3
+      if (nt < 0 || nt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nt %d\n",mesh->mype,__FILE__,__LINE__,nt);
+#endif
+      int ntt     = ntop[nt];
+      real_t Ht      = H[nt];
+      real_t Ut      = U[nt];
+      real_t Vt      = V[nt];
+
+#if DEBUG >= 3
+      if (nb < 0 || nb >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nb %d\n",mesh->mype,__FILE__,__LINE__,nb);
+#endif
+      int nbb     = nbot[nb];
+      real_t Hb      = H[nb];
+      real_t Ub      = U[nb];
+      real_t Vb      = V[nb];
+
+      // Second cell along each face; only used when the face neighbour has a
+      // higher level than the current cell.
+      int nlt     = ntop[nl];
+      int nrt     = ntop[nr];
+      int ntr     = nrht[nt];
+      int nbr     = nrht[nb];
+
+#if DEBUG >= 3
+      if (nll < 0 || nll >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nll %d\n",mesh->mype,__FILE__,__LINE__,nll);
+#endif
+      real_t Hll     = H[nll];
+      real_t Ull     = U[nll];
+      //real_t Vll     = V[nll];
+
+#if DEBUG >= 3
+      if (nrr < 0 || nrr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nrr %d\n",mesh->mype,__FILE__,__LINE__,nrr);
+#endif
+      real_t Hrr     = H[nrr];
+      real_t Urr     = U[nrr];
+      //real_t Vrr     = V[nrr];
+
+#if DEBUG >= 3
+      if (ntt < 0 || ntt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with ntt %d\n",mesh->mype,__FILE__,__LINE__,ntt);
+#endif
+      real_t Htt     = H[ntt];
+      //real_t Utt     = U[ntt];
+      real_t Vtt     = V[ntt];
+
+#if DEBUG >= 3
+      if (nbb < 0 || nbb >= (int)ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with nbb %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbb); sleep(15); }
+#endif
+      real_t Hbb     = H[nbb];
+      //real_t Ubb     = U[nbb];
+      real_t Vbb     = V[nbb];
+
+#if DEBUG >= 3
+      if (lvl < 0 || lvl >= (int)lev_deltax.size() ) printf("%d: Problem at file %s line %d with lvl %d\n",mesh->mype,__FILE__,__LINE__,lvl);
+#endif
+      // Cell sizes of the current cell and of its neighbours, looked up by level.
+      real_t dxic    = lev_deltax[lvl];
+      real_t dyic    = lev_deltay[lvl];
+
+      real_t dxl     = lev_deltax[level[nl]];
+      real_t dxr     = lev_deltax[level[nr]];
+
+      real_t dyt     = lev_deltay[level[nt]];
+      real_t dyb     = lev_deltay[level[nb]];
+
+      real_t drl     = dxl;
+      real_t drr     = dxr;
+      real_t drt     = dyt;
+      real_t drb     = dyb;
+
+      real_t dric    = dxic;
+
+      // Left neighbour at a higher level: also load the cell above it (nlt)
+      // and that cell's left neighbour (nltl).
+      int nltl = 0;
+      real_t Hlt = 0.0, Ult = 0.0, Vlt = 0.0;
+      real_t Hll2 = 0.0;
+      real_t Ull2 = 0.0;
+      if(lvl < level[nl]) {
+#if DEBUG >= 3
+         if (nlt < 0 || nlt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nlt %d\n",mesh->mype,__FILE__,__LINE__,nlt);
+#endif
+         Hlt  = H[ ntop[nl] ];
+         Ult  = U[ ntop[nl] ];
+         Vlt  = V[ ntop[nl] ];
+         nltl = nlft[nlt];
+#if DEBUG >= 3
+         if (nltl < 0 || nltl >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nltl %d\n",mesh->mype,__FILE__,__LINE__,nltl);
+#endif
+         Hll2 = H[nltl];
+         Ull2 = U[nltl];
+      }
+
+      // Right neighbour at a higher level: same for nrt and nrtr.
+      int nrtr = 0;
+      real_t Hrt = 0.0, Urt = 0.0, Vrt = 0.0;
+      real_t Hrr2 = 0.0;
+      real_t Urr2 = 0.0;
+      if(lvl < level[nr]) {
+#if DEBUG >= 3
+         if (nrt < 0 || nrt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nrt %d\n",mesh->mype,__FILE__,__LINE__,nrt);
+#endif
+         Hrt  = H[ ntop[nr] ];
+         Urt  = U[ ntop[nr] ];
+         Vrt  = V[ ntop[nr] ];
+         nrtr = nrht[nrt];
+#if DEBUG >= 3
+         if (nrtr < 0 || nrtr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nrtr %d\n",mesh->mype,__FILE__,__LINE__,nrtr);
+#endif
+         Hrr2 = H[nrtr];
+         Urr2 = U[nrtr];
+      }
+
+      // Bottom neighbour at a higher level: cell to its right (nbr) and the
+      // cell below that one (nbrb).
+      int nbrb = 0;
+      real_t Hbr = 0.0, Ubr = 0.0, Vbr = 0.0;
+      real_t Hbb2 = 0.0;
+      real_t Vbb2 = 0.0;
+      if(lvl < level[nb]) {
+#if DEBUG >= 3
+         if (nbr < 0 || nbr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nbr %d\n",mesh->mype,__FILE__,__LINE__,nbr);
+#endif
+         Hbr  = H[ nrht[nb] ];
+         Ubr  = U[ nrht[nb] ];
+         Vbr  = V[ nrht[nb] ];
+         nbrb = nbot[nbr];
+#if DEBUG >= 3
+         if (nbrb < 0 || nbrb >= (int)ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with nbrb %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbrb); sleep(20);}
+#endif
+         Hbb2 = H[nbrb];
+         Vbb2 = V[nbrb];
+      }
+
+      // Top neighbour at a higher level: cell to its right (ntr) and the cell
+      // above that one (ntrt).
+      int ntrt = 0;
+      real_t Htr = 0.0, Utr = 0.0, Vtr = 0.0;
+      real_t Htt2 = 0.0;
+      real_t Vtt2 = 0.0;
+      if(lvl < level[nt]) {
+#if DEBUG >= 3
+         if (ntr < 0 || ntr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with ntr %d\n",mesh->mype,__FILE__,__LINE__,ntr);
+#endif
+         Htr  = H[ nrht[nt] ];
+         Utr  = U[ nrht[nt] ];
+         Vtr  = V[ nrht[nt] ];
+         ntrt = ntop[ntr];
+#if DEBUG >= 3
+         if (ntrt < 0 || ntrt >= (int)ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with ntrt %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,ntrt); sleep(20); }
+#endif
+         Htt2 = H[ntrt];
+         Vtt2 = V[ntrt];
+      }
+
+
+      // Half-step values on the four faces of the cell (minus = left/bottom,
+      // plus = right/top).
+      real_t Hxminus = U_halfstep(deltaT, Hl, Hic, HXFLUXNL, HXFLUXIC,
+                           dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+      real_t Uxminus = U_halfstep(deltaT, Ul, Uic, UXFLUXNL, UXFLUXIC,
+                           dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+      real_t Vxminus = U_halfstep(deltaT, Vl, Vic, UVFLUXNL, UVFLUXIC,
+                           dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+
+      real_t Hxplus  = U_halfstep(deltaT, Hic, Hr, HXFLUXIC, HXFLUXNR,
+                           dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+      real_t Uxplus  = U_halfstep(deltaT, Uic, Ur, UXFLUXIC, UXFLUXNR,
+                           dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+      real_t Vxplus  = U_halfstep(deltaT, Vic, Vr, UVFLUXIC, UVFLUXNR,
+                           dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+
+      real_t Hyminus = U_halfstep(deltaT, Hb, Hic, HYFLUXNB, HYFLUXIC,
+                           dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+      real_t Uyminus = U_halfstep(deltaT, Ub, Uic, VUFLUXNB, VUFLUXIC,
+                           dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+      real_t Vyminus = U_halfstep(deltaT, Vb, Vic, VYFLUXNB, VYFLUXIC,
+                           dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+
+      real_t Hyplus  = U_halfstep(deltaT, Hic, Ht, HYFLUXIC, HYFLUXNT,
+                           dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+      real_t Uyplus  = U_halfstep(deltaT, Uic, Ut, VUFLUXIC, VUFLUXNT,
+                           dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+      real_t Vyplus  = U_halfstep(deltaT, Vic, Vt, VYFLUXIC, VYFLUXNT,
+                           dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+
+      // Fluxes evaluated from the half-step values.
+      real_t Hxfluxminus = HNEWXFLUXMINUS;
+      real_t Uxfluxminus = UNEWXFLUXMINUS;
+      real_t Vxfluxminus = UVNEWFLUXMINUS;
+
+      real_t Hxfluxplus  = HNEWXFLUXPLUS;
+      real_t Uxfluxplus  = UNEWXFLUXPLUS;
+      real_t Vxfluxplus  = UVNEWFLUXPLUS;
+
+      real_t Hyfluxminus = HNEWYFLUXMINUS;
+      real_t Uyfluxminus = VUNEWFLUXMINUS;
+      real_t Vyfluxminus = VNEWYFLUXMINUS;
+
+      real_t Hyfluxplus  = HNEWYFLUXPLUS;
+      real_t Uyfluxplus  = VUNEWFLUXPLUS;
+      real_t Vyfluxplus  = VNEWYFLUXPLUS;
+
+      // Where a face neighbour has a higher level, compute a second half-step
+      // against the other cell on that face and average the two fluxes.
+      real_t Hxminus2 = 0.0;
+      real_t Uxminus2 = 0.0;
+      real_t Vxminus2 = 0.0;
+      if(lvl < level[nl]) {
+
+         Hxminus2 = U_halfstep(deltaT, Hlt, Hic, HXFLUXNLT, HXFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+         Uxminus2 = U_halfstep(deltaT, Ult, Uic, UXFLUXNLT, UXFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+         Vxminus2 = U_halfstep(deltaT, Vlt, Vic, UVFLUXNLT, UVFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+
+         Hxfluxminus = (Hxfluxminus + HNEWXFLUXMINUS2) * HALF;
+         Uxfluxminus = (Uxfluxminus + UNEWXFLUXMINUS2) * HALF;
+         Vxfluxminus = (Vxfluxminus + UVNEWFLUXMINUS2) * HALF;
+
+      }
+
+      real_t Hxplus2 = 0.0;
+      real_t Uxplus2 = 0.0;
+      real_t Vxplus2 = 0.0;
+      if(lvl < level[nr]) {
+
+         Hxplus2  = U_halfstep(deltaT, Hic, Hrt, HXFLUXIC, HXFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+         Uxplus2  = U_halfstep(deltaT, Uic, Urt, UXFLUXIC, UXFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+         Vxplus2  = U_halfstep(deltaT, Vic, Vrt, UVFLUXIC, UVFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+
+         Hxfluxplus  = (Hxfluxplus + HNEWXFLUXPLUS2) * HALF;
+         Uxfluxplus  = (Uxfluxplus + UNEWXFLUXPLUS2) * HALF;
+         Vxfluxplus  = (Vxfluxplus + UVNEWFLUXPLUS2) * HALF;
+
+      }
+
+      real_t Hyminus2 = 0.0;
+      real_t Uyminus2 = 0.0;
+      real_t Vyminus2 = 0.0;
+      if(lvl < level[nb]) {
+
+         Hyminus2 = U_halfstep(deltaT, Hbr, Hic, HYFLUXNBR, HYFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+         Uyminus2 = U_halfstep(deltaT, Ubr, Uic, VUFLUXNBR, VUFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+         Vyminus2 = U_halfstep(deltaT, Vbr, Vic, VYFLUXNBR, VYFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+
+         Hyfluxminus = (Hyfluxminus + HNEWYFLUXMINUS2) * HALF;
+         Uyfluxminus = (Uyfluxminus + VUNEWFLUXMINUS2) * HALF;
+         Vyfluxminus = (Vyfluxminus + VNEWYFLUXMINUS2) * HALF;
+
+      }
+
+      real_t Hyplus2 = 0.0;
+      real_t Uyplus2 = 0.0;
+      real_t Vyplus2 = 0.0;
+      if(lvl < level[nt]) {
+
+         Hyplus2  = U_halfstep(deltaT, Hic, Htr, HYFLUXIC, HYFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+         Uyplus2  = U_halfstep(deltaT, Uic, Utr, VUFLUXIC, VUFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+         Vyplus2  = U_halfstep(deltaT, Vic, Vtr, VYFLUXIC, VYFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+
+         Hyfluxplus  = (Hyfluxplus + HNEWYFLUXPLUS2) * HALF;
+         Uyfluxplus  = (Uyfluxplus + VUNEWFLUXPLUS2) * HALF;
+         Vyfluxplus  = (Vyfluxplus + VNEWYFLUXPLUS2) * HALF;
+
+      }
+
+      //if (DEBUG >= 2) {
+      // printf("1st pass x direction nz %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf\n",
+      //    gix, nl, nr,
+      //    Hxplus,Hxplus2,Uxplus,Uxplus2,Vxplus,Vxplus2);
+      //    //H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+      //}
+
+      ////////////////////////////////////////
+      /// Artificial Viscosity corrections ///
+      ////////////////////////////////////////
+
+      // Each w* term below is w_corrector(...) multiplied by the difference
+      // across the face. When the face neighbour has a higher level, the term
+      // for the second cell on the face is added and the sum is scaled by
+      // HALF*HALF.
+      // NOTE(review): the sum is scaled by HALF twice rather than once --
+      // confirm that this is the intended weighting.
+
+      // If the far-left cell has a higher level than the left neighbour,
+      // average it with the cell above it.
+      if(level[nl] < level[nll]) {
+#if DEBUG >= 3
+         int nllt = ntop[nll];
+         if (nllt < 0 || nllt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nllt %d\n",mesh->mype,__FILE__,__LINE__,nllt);
+#endif
+         Hll = (Hll + H[ ntop[nll] ]) * HALF;
+         Ull = (Ull + U[ ntop[nll] ]) * HALF;
+      }
+
+      real_t Hr2 = Hr;
+      real_t Ur2 = Ur;
+      if(lvl < level[nr]) {
+         Hr2 = (Hr2 + Hrt) * HALF;
+         Ur2 = (Ur2 + Urt) * HALF;
+      }
+
+      real_t wminusx_H = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Hic-Hl, Hl-Hll, Hr2-Hic);
+
+      wminusx_H *= Hic - Hl;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Hll2 = (Hll2 + H[ ntop[nltl] ]) * HALF;
+         wminusx_H = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Hic-Hlt, Hlt-Hll2, Hr2-Hic) *
+                      (Hic - Hlt)) + wminusx_H)*HALF*HALF;
+      }
+
+
+      if(level[nr] < level[nrr]) {
+#if DEBUG >= 3
+         int nrrt = ntop[nrr];
+         if (nrrt < 0 || nrrt >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nrrt %d\n",mesh->mype,__FILE__,__LINE__,nrrt);
+#endif
+         Hrr = (Hrr + H[ ntop[nrr] ]) * HALF;
+         Urr = (Urr + U[ ntop[nrr] ]) * HALF;
+      }
+
+      real_t Hl2 = Hl;
+      real_t Ul2 = Ul;
+      if(lvl < level[nl]) {
+         Hl2 = (Hl2 + Hlt) * HALF;
+         Ul2 = (Ul2 + Ult) * HALF;
+      }
+
+      real_t wplusx_H = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                           Hr-Hic, Hic-Hl2, Hrr-Hr);
+
+      wplusx_H *= Hr - Hic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Hrr2 = (Hrr2 + H[ ntop[nrtr] ]) * HALF;
+         wplusx_H = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Hrt-Hic, Hic-Hl2, Hrr2-Hrt) *
+                      (Hrt - Hic))+wplusx_H)*HALF*HALF;
+      }
+
+
+      real_t wminusx_U = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Uic-Ul, Ul-Ull, Ur2-Uic);
+
+      wminusx_U *= Uic - Ul;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Ull2 = (Ull2 + U[ ntop[nltl] ]) * HALF;
+         wminusx_U = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Uic-Ult, Ult-Ull2, Ur2-Uic) *
+                      (Uic - Ult))+wminusx_U)*HALF*HALF;
+      }
+
+
+      real_t wplusx_U = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                              Ur-Uic, Uic-Ul2, Urr-Ur);
+
+      wplusx_U *= Ur - Uic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Urr2 = (Urr2 + U[ ntop[nrtr] ]) * HALF;
+         wplusx_U = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Urt-Uic, Uic-Ul2, Urr2-Urt) *
+                      (Urt - Uic))+wplusx_U)*HALF*HALF;
+      }
+
+
+      if(level[nb] < level[nbb]) {
+#if DEBUG >= 3
+         int nbbr = nrht[nbb];
+         if (nbbr < 0 || nbbr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d gix %d %d with nbbr %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbbr);
+#endif
+         Hbb = (Hbb + H[ nrht[nbb] ]) * HALF;
+         Vbb = (Vbb + V[ nrht[nbb] ]) * HALF;
+      }
+
+      real_t Ht2 = Ht;
+      real_t Vt2 = Vt;
+      if(lvl < level[nt]) {
+         Ht2 = (Ht2 + Htr) * HALF;
+         Vt2 = (Vt2 + Vtr) * HALF;
+      }
+
+      real_t wminusy_H = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Hic-Hb, Hb-Hbb, Ht2-Hic);
+
+      wminusy_H *= Hic - Hb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Hbb2 = (Hbb2 + H[ nrht[nbrb] ]) * HALF;
+         wminusy_H = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Hic-Hbr, Hbr-Hbb2, Ht2-Hic) *
+                      (Hic - Hbr))+wminusy_H)*HALF*HALF;
+      }
+
+
+      if(level[nt] < level[ntt]) {
+#if DEBUG >= 3
+         int nttr = nrht[ntt];
+         if (nttr < 0 || nttr >= (int)ncells_ghost ) printf("%d: Problem at file %s line %d with nttr %d\n",mesh->mype,__FILE__,__LINE__,nttr);
+#endif
+         Htt = (Htt + H[ nrht[ntt] ]) * HALF;
+         Vtt = (Vtt + V[ nrht[ntt] ]) * HALF;
+      }
+
+      real_t Hb2 = Hb;
+      real_t Vb2 = Vb;
+      if(lvl < level[nb]) {
+         Hb2 = (Hb2 + Hbr) * HALF;
+         Vb2 = (Vb2 + Vbr) * HALF;
+      }
+
+      real_t wplusy_H = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                             Ht-Hic, Hic-Hb2, Htt-Ht);
+
+      wplusy_H *= Ht - Hic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Htt2 = (Htt2 + H[ nrht[ntrt] ]) * HALF;
+         wplusy_H = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Htr-Hic, Hic-Hb2, Htt2-Htr) *
+                      (Htr - Hic))+wplusy_H)*HALF*HALF;
+      }
+
+      real_t wminusy_V = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Vic-Vb, Vb-Vbb, Vt2-Vic);
+
+      wminusy_V *= Vic - Vb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Vbb2 = (Vbb2 + V[ nrht[nbrb] ]) * HALF;
+         wminusy_V = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Vic-Vbr, Vbr-Vbb2, Vt2-Vic) *
+                      (Vic - Vbr))+wminusy_V)*HALF*HALF;
+      }
+
+      real_t wplusy_V = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                           Vt-Vic, Vic-Vb2, Vtt-Vt);
+
+      wplusy_V *= Vt - Vic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Vtt2 = (Vtt2 + V[ nrht[ntrt] ]) * HALF;
+         wplusy_V = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Vtr-Vic, Vic-Vb2, Vtt2-Vtr) *
+                      (Vtr - Vic))+wplusy_V)*HALF*HALF;
+      }
+
+      // Full step: flux update plus the correction terms. H receives the x
+      // and y corrections, U only the x ones and V only the y ones.
+      H_new[gix] = U_fullstep(deltaT, dxic, Hic,
+                       Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus)
+                  - wminusx_H + wplusx_H - wminusy_H + wplusy_H;
+      U_new[gix] = U_fullstep(deltaT, dxic, Uic,
+                       Uxfluxplus, Uxfluxminus, Uyfluxplus, Uyfluxminus)
+                  - wminusx_U + wplusx_U;
+      V_new[gix] = U_fullstep(deltaT, dxic, Vic,
+                       Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus)
+                  - wminusy_V + wplusy_V;
+
+#if DEBUG >= 1
+      if (DEBUG >= 1) {
+         real_t U_tmp = U_new[gix];
+         real_t V_tmp = V_new[gix];
+         // Replace a negative zero by +0.0 so it is not printed as "-0.000000".
+         if (U_tmp == 0.0) U_tmp = 0.0;
+         if (V_tmp == 0.0) V_tmp = 0.0;
+         printf("DEBUG ic %d H_new %lf U_new %lf V_new %lf\n",gix,H_new[gix],U_tmp,V_tmp);
+      }
+#endif
+
+/*
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Hic, %lf Hxfluxplus, %lf Hxfluxminus, %lf Hyfluxplus, %lf Hyfluxminus %lf\n",
+         gix, deltaT, dxic, Hic, Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus);
+      printf("DEBUG ic %d wminusx_H %lf wplusx_H %lf wminusy_H %lf wplusy_H %lf\n",gix, wminusx_H, wplusx_H, wminusy_H, wplusy_H);
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Vic, %lf Vxfluxplus, %lf Vxfluxminus, %lf Vyfluxplus, %lf Vyfluxminus %lf\n",
+         gix, deltaT, dxic, Vic, Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus);
+      printf("DEBUG ic %d wminusy_V %lf wplusy_V %lf\n",gix, wminusy_V, wplusy_V);
+*/
+   } // cell loop
+
+   // Wait until every thread has finished its cells before the old arrays
+   // are replaced.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      // Replace H with H_new and deallocate H. New memory will have the characteristics
+      // of the new memory and the name of the old. Both return and arg1 will be reset to new memory
+      H = (state_t *)state_memory.memory_replace(H, H_new);
+      U = (state_t *)state_memory.memory_replace(U, U_new);
+      V = (state_t *)state_memory.memory_replace(V, V_new);
+
+      //state_memory.memory_report();
+      //printf("DEBUG end finite diff\n\n"); 
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   // Only the master thread adds to the timer under OpenMP.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += cpu_timer_stop(tstart_cpu);
+}
+
+void State::calc_finite_difference_via_faces(double deltaT){
+   real_t   g     = 9.80;   // gravitational constant
+   real_t   ghalf = HALF*g;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   size_t ncells     = mesh->ncells;
+   size_t &ncells_ghost = mesh->ncells_ghost;
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   if (ncells_ghost < ncells) ncells_ghost = ncells;
+
+   //printf("\nDEBUG finite diff\n");
+
+#ifdef HAVE_MPI
+   // We need to populate the ghost regions since the calc neighbors has just been
+   // established for the mesh shortly before
+   if (mesh->numpe > 1) {
+      apply_boundary_conditions_local();
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master 
+      {
+#endif
+         H=(state_t *)state_memory.memory_realloc(ncells_ghost, H);
+         U=(state_t *)state_memory.memory_realloc(ncells_ghost, U);
+         V=(state_t *)state_memory.memory_realloc(ncells_ghost, V);
+
+         L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+         L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+         L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      apply_boundary_conditions_ghost();
+   } else {
+      apply_boundary_conditions();
+   }
+#else
+   apply_boundary_conditions();
+#endif
+
+   int *nlft, *nrht, *nbot, *ntop, *level;
+
+   nlft  = mesh->nlft;
+   nrht  = mesh->nrht;
+   nbot  = mesh->nbot;
+   ntop  = mesh->ntop;
+   level = mesh->level;
+
+   vector<real_t> &lev_deltax = mesh->lev_deltax;
+   vector<real_t> &lev_deltay = mesh->lev_deltay;
+
+   int flags = 0;
+   flags = RESTART_DATA;
+#if defined (HAVE_J7)
+   if (mesh->parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   mesh->calc_face_list_wbidirmap();
+#ifdef _OPENMP
+   }
+#endif
+
+   static vector<state_t> Hx, Ux, Vx;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   Hx.resize(mesh->nxface);
+   Ux.resize(mesh->nxface);
+   Vx.resize(mesh->nxface);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for 
+#endif
+   for (int iface = 0; iface < mesh->nxface; iface++){
+      int cell_lower = mesh->map_xface2cell_lower[iface];
+      int cell_upper = mesh->map_xface2cell_upper[iface];
+      int level_lower = level[cell_lower];
+      int level_upper = level[cell_upper];
+      if (level_lower == level_upper) {
+         int lev = level_upper;
+         real_t Cxhalf = 0.5*deltaT/mesh->lev_deltax[lev];
+         Hx[iface]=HALF*(H[cell_upper]+H[cell_lower]) - Cxhalf*( HXFLUX(cell_upper)-HXFLUX(cell_lower) );
+         Ux[iface]=HALF*(U[cell_upper]+U[cell_lower]) - Cxhalf*( UXFLUX(cell_upper)-UXFLUX(cell_lower) );
+         Vx[iface]=HALF*(V[cell_upper]+V[cell_lower]) - Cxhalf*( UVFLUX(cell_upper)-UVFLUX(cell_lower) );
+      } else {
+         real_t dx_lower = mesh->lev_deltax[level[cell_lower]];
+         real_t dx_upper = mesh->lev_deltax[level[cell_upper]];
+
+         real_t FA_lower = dx_lower;
+         real_t FA_upper = dx_upper;
+         real_t FA_lolim = FA_lower*min(ONE, FA_upper/FA_lower);
+         real_t FA_uplim = FA_upper*min(ONE, FA_lower/FA_upper);
+
+         real_t CV_lower = SQ(dx_lower);
+         real_t CV_upper = SQ(dx_upper);
+         real_t CV_lolim = CV_lower*min(HALF, CV_upper/CV_lower);
+         real_t CV_uplim = CV_upper*min(HALF, CV_lower/CV_upper);
+
+         // Weighted half-step calculation
+         //
+         // (dx_lower*H[cell_upper]+dx_upper*H[cell_lower])
+         // -----------------------------------------------   -
+         //             (dx_lower+dx_upper)
+         //
+         //                ( (FA_uplim*HXFLUX(cell_upper))-(FA_lolim*HXFLUX(cell_lower)) )
+         // 0.5*deltaT  *  ----------------------------------------------------------------
+         //                                    (CV_uplim+CV_lolim)
+         //
+
+         Hx[iface]=(dx_lower*H[cell_upper]+dx_upper*H[cell_lower])/(dx_lower+dx_upper) -
+                   HALF*deltaT*( (FA_uplim*HXFLUX(cell_upper))-(FA_lolim*HXFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+         Ux[iface]=(dx_lower*U[cell_upper]+dx_upper*U[cell_lower])/(dx_lower+dx_upper) -
+                   HALF*deltaT*( (FA_uplim*UXFLUX(cell_upper))-(FA_lolim*UXFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+         Vx[iface]=(dx_lower*V[cell_upper]+dx_upper*V[cell_lower])/(dx_lower+dx_upper) -
+                   HALF*deltaT*( (FA_uplim*UVFLUX(cell_upper))-(FA_lolim*UVFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+      }
+#if DEBUG >= 2
+      if (DEBUG >= 2) {
+         printf("1st pass x direction iface %d i %d j %d lev %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
+            iface, mesh->xface_i[iface], mesh->xface_j[iface], mesh->xface_level[iface],
+            mesh->map_xface2cell_lower[iface], mesh->map_xface2cell_upper[iface],
+            Hx[iface],Ux[iface],Vx[iface],
+            H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+      }
+#endif
+   }
+#if DEBUG >= 2
+   if (DEBUG >= 2) {
+      printf("\n");
+   }
+#endif
+
+   static vector<state_t> Hy, Uy, Vy;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   Hy.resize(mesh->nyface);
+   Uy.resize(mesh->nyface);
+   Vy.resize(mesh->nyface);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for 
+#endif
+   for (int iface = 0; iface < mesh->nyface; iface++){
+      int cell_lower = mesh->map_yface2cell_lower[iface];
+      int cell_upper = mesh->map_yface2cell_upper[iface];
+      int level_lower = level[cell_lower];
+      int level_upper = level[cell_upper];
+      if (level_lower == level_upper) {
+         int lev = level_upper;
+         real_t Cyhalf = 0.5*deltaT/mesh->lev_deltay[lev];
+         Hy[iface]=HALF*(H[cell_upper]+H[cell_lower]) - Cyhalf*( HYFLUX(cell_upper)-HYFLUX(cell_lower) );
+         Uy[iface]=HALF*(U[cell_upper]+U[cell_lower]) - Cyhalf*( UVFLUX(cell_upper)-UVFLUX(cell_lower) );
+         Vy[iface]=HALF*(V[cell_upper]+V[cell_lower]) - Cyhalf*( VYFLUX(cell_upper)-VYFLUX(cell_lower) );
+      } else {
+         real_t dy_lower = mesh->lev_deltay[level[cell_lower]];
+         real_t dy_upper = mesh->lev_deltay[level[cell_upper]];
+
+         real_t FA_lower = dy_lower;
+         real_t FA_upper = dy_upper;
+         real_t FA_lolim = FA_lower*min(ONE, FA_upper/FA_lower);
+         real_t FA_uplim = FA_upper*min(ONE, FA_lower/FA_upper);
+
+         real_t CV_lower = SQ(dy_lower);
+         real_t CV_upper = SQ(dy_upper);
+         real_t CV_lolim = CV_lower*min(HALF, CV_upper/CV_lower);
+         real_t CV_uplim = CV_upper*min(HALF, CV_lower/CV_upper);
+
+         // Weighted half-step calculation
+         //
+         // (dy_lower*H[cell_upper]+dy_upper*H[cell_lower])
+         // -----------------------------------------------   -
+         //             (dy_lower+dy_upper)
+         //
+         //                ( (FA_uplim*HYFLUX(cell_upper))-(FA_lolim*HYFLUX(cell_lower)) )
+         // 0.5*deltaT  *  ----------------------------------------------------------------
+         //                                    (CV_uplim+CV_lolim)
+         //
+
+         Hy[iface]=(dy_lower*H[cell_upper]+dy_upper*H[cell_lower])/(dy_lower+dy_upper) -
+                   HALF*deltaT*( (FA_uplim*HYFLUX(cell_upper))-(FA_lolim*HYFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+         Uy[iface]=(dy_lower*U[cell_upper]+dy_upper*U[cell_lower])/(dy_lower+dy_upper) -
+                   HALF*deltaT*( (FA_uplim*UVFLUX(cell_upper))-(FA_lolim*UVFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+         Vy[iface]=(dy_lower*V[cell_upper]+dy_upper*V[cell_lower])/(dy_lower+dy_upper) -
+                   HALF*deltaT*( (FA_uplim*VYFLUX(cell_upper))-(FA_lolim*VYFLUX(cell_lower)) )/
+                   (CV_uplim+CV_lolim);
+
+      }
+
+#if DEBUG >= 2
+      if (DEBUG >= 2) {
+         printf("1st pass y direction iface %d i %d j %d lev %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
+            iface, mesh->yface_i[iface], mesh->yface_j[iface], mesh->yface_level[iface],
+            mesh->map_yface2cell_lower[iface], mesh->map_yface2cell_upper[iface],
+            Hy[iface],Uy[iface],Vy[iface],
+            H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+      }
+#endif
+   }
+#if DEBUG >= 2
+   if (DEBUG >= 2) {
+      printf("\n");
+   }
+#endif
+
+   static state_t *H_new, *U_new, *V_new;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      H_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "H_new", flags);
+      U_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "U_new", flags);
+      V_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "V_new", flags);
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   int lowerBound, upperBound;
+
+   mesh->get_bounds(lowerBound, upperBound);
+   for (int ic = lowerBound; ic < upperBound; ic++){
+
+      int lvl     = level[ic];
+      int nl      = nlft[ic];
+      int nr      = nrht[ic];
+      int nt      = ntop[ic];
+      int nb      = nbot[ic];
+
+      real_t Hic     = H[ic];
+      real_t Uic     = U[ic];
+      real_t Vic     = V[ic];
+
+      int nll     = nlft[nl];
+      real_t Hl      = H[nl];
+      real_t Ul      = U[nl];
+      //real_t Vl      = V[nl];
+
+      int nrr     = nrht[nr];
+      real_t Hr      = H[nr];
+      real_t Ur      = U[nr];
+      //real_t Vr      = V[nr];
+
+      int ntt     = ntop[nt];
+      real_t Ht      = H[nt];
+      //real_t Ut      = U[nt];
+      real_t Vt      = V[nt];
+
+      int nbb     = nbot[nb];
+      real_t Hb      = H[nb];
+      //real_t Ub      = U[nb];
+      real_t Vb      = V[nb];
+
+      int nlt     = ntop[nl];
+      int nrt     = ntop[nr];
+      int ntr     = nrht[nt];
+      int nbr     = nrht[nb];
+
+      real_t Hll     = H[nll];
+      real_t Ull     = U[nll];
+      //real_t Vll     = V[nll];
+
+      real_t Hrr     = H[nrr];
+      real_t Urr     = U[nrr];
+      //real_t Vrr     = V[nrr];
+
+      real_t Htt     = H[ntt];
+      //real_t Utt     = U[ntt];
+      real_t Vtt     = V[ntt];
+
+      real_t Hbb     = H[nbb];
+      //real_t Ubb     = U[nbb];
+      real_t Vbb     = V[nbb];
+
+      real_t dxic    = lev_deltax[lvl];
+      //real_t dyic    = lev_deltay[lvl];
+
+      real_t dxl     = lev_deltax[level[nl]];
+      real_t dxr     = lev_deltax[level[nr]];
+
+      real_t dyt     = lev_deltay[level[nt]];
+      real_t dyb     = lev_deltay[level[nb]];
+
+      //real_t drl     = dxl;
+      //real_t drr     = dxr;
+      //real_t drt     = dyt;
+      //real_t drb     = dyb;
+
+      real_t dric    = dxic;
+
+      int nltl = 0;
+      real_t Hlt = 0.0, Ult = 0.0; // Vlt = 0.0;
+      real_t Hll2 = 0.0;
+      real_t Ull2 = 0.0;
+      if(lvl < level[nl]) {
+         Hlt  = H[ ntop[nl] ];
+         Ult  = U[ ntop[nl] ];
+         //Vlt  = V[ ntop[nl] ];
+
+         nltl = nlft[nlt];
+         Hll2 = H[nltl];
+         Ull2 = U[nltl];
+      }
+
+      int nrtr = 0;
+      real_t Hrt = 0.0, Urt = 0.0; // Vrt = 0.0;
+      real_t Hrr2 = 0.0;
+      real_t Urr2 = 0.0;
+      if(lvl < level[nr]) {
+         Hrt  = H[ ntop[nr] ];
+         Urt  = U[ ntop[nr] ];
+         //Vrt  = V[ ntop[nr] ];
+
+         nrtr = nrht[nrt];
+         Hrr2 = H[nrtr];
+         Urr2 = U[nrtr];
+      }
+
+      int nbrb = 0;
+      real_t Hbr = 0.0, Vbr = 0.0; // Ubr = 0.0
+      real_t Hbb2 = 0.0;
+      real_t Vbb2 = 0.0;
+      if(lvl < level[nb]) {
+         Hbr  = H[ nrht[nb] ];
+         //Ubr  = U[ nrht[nb] ];
+         Vbr  = V[ nrht[nb] ];
+
+         nbrb = nbot[nbr];
+         Hbb2 = H[nbrb];
+         Vbb2 = V[nbrb];
+      }
+
+      int ntrt = 0;
+      real_t Htr = 0.0, Vtr = 0.0; // Utr = 0.0
+      real_t Htt2 = 0.0;
+      real_t Vtt2 = 0.0;
+      if(lvl < level[nt]) {
+         Htr  = H[ nrht[nt] ];
+         //Utr  = U[ nrht[nt] ];
+         Vtr  = V[ nrht[nt] ];
+
+         ntrt = ntop[ntr];
+         Htt2 = H[ntrt];
+         Vtt2 = V[ntrt];
+      }
+
+      ////////////////////////////////////////
+      /// Artificial Viscosity corrections ///
+      ////////////////////////////////////////
+
+      real_t Hxminus = H[ic];
+      real_t Uxminus = 0.0;
+      real_t Vxminus = 0.0;
+      if (mesh->map_xcell2face_left1[ic] >= 0){
+         Hxminus  = Hx[mesh->map_xcell2face_left1[ic]];
+         Uxminus  = Ux[mesh->map_xcell2face_left1[ic]];
+         Vxminus  = Vx[mesh->map_xcell2face_left1[ic]];
+      }
+
+      real_t Hxminus2 = 0.0;
+      if(lvl < level[nl]) Hxminus2 = H[ic];
+      real_t Uxminus2 = 0.0;
+      real_t Vxminus2 = 0.0;
+      if (mesh->map_xcell2face_left2[ic] >= 0) {
+         Hxminus2 = Hx[mesh->map_xcell2face_left2[ic]];
+         Uxminus2 = Ux[mesh->map_xcell2face_left2[ic]];
+         Vxminus2 = Vx[mesh->map_xcell2face_left2[ic]];
+      }
+
+      real_t Hxplus = H[ic];
+      real_t Uxplus = 0.0;
+      real_t Vxplus = 0.0;
+      if (mesh->map_xcell2face_right1[ic] >= 0){
+         Hxplus   = Hx[mesh->map_xcell2face_right1[ic]];
+         Uxplus   = Ux[mesh->map_xcell2face_right1[ic]];
+         Vxplus   = Vx[mesh->map_xcell2face_right1[ic]];
+      }
+
+      real_t Hxplus2 = 0.0;
+      if(lvl < level[nr]) Hxplus2 = H[ic];
+      real_t Uxplus2 = 0.0;
+      real_t Vxplus2 = 0.0;
+      if (mesh->map_xcell2face_right2[ic] >= 0){
+         Hxplus2  = Hx[mesh->map_xcell2face_right2[ic]];
+         Uxplus2  = Ux[mesh->map_xcell2face_right2[ic]];
+         Vxplus2  = Vx[mesh->map_xcell2face_right2[ic]];
+      }
+
+      if(level[nl] < level[nll]) {
+         Hll = (Hll + H[ ntop[nll] ]) * HALF;
+         Ull = (Ull + U[ ntop[nll] ]) * HALF;
+      }
+
+      real_t Hr2 = Hr;
+      real_t Ur2 = Ur;
+      if(lvl < level[nr]) {
+         Hr2 = (Hr2 + Hrt) * HALF;
+         Ur2 = (Ur2 + Urt) * HALF;
+      }
+
+      real_t wminusx_H = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Hic-Hl, Hl-Hll, Hr2-Hic);
+
+      wminusx_H *= Hic - Hl;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Hll2 = (Hll2 + H[ ntop[nltl] ]) * HALF;
+         wminusx_H = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Hic-Hlt, Hlt-Hll2, Hr2-Hic) *
+                      (Hic - Hlt)) + wminusx_H)*HALF*HALF;
+      }
+
+      if(level[nr] < level[nrr]) {
+         Hrr = (Hrr + H[ ntop[nrr] ]) * HALF;
+         Urr = (Urr + U[ ntop[nrr] ]) * HALF;
+      }
+
+      real_t Hl2 = Hl;
+      real_t Ul2 = Ul;
+      if(lvl < level[nl]) {
+         Hl2 = (Hl2 + Hlt) * HALF;
+         Ul2 = (Ul2 + Ult) * HALF;
+      }
+
+      real_t wplusx_H = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                           Hr-Hic, Hic-Hl2, Hrr-Hr);
+
+      wplusx_H *= Hr - Hic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Hrr2 = (Hrr2 + H[ ntop[nrtr] ]) * HALF;
+         wplusx_H = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Hrt-Hic, Hic-Hl2, Hrr2-Hrt) *
+                      (Hrt - Hic))+wplusx_H)*HALF*HALF;
+      }
+
+
+      real_t wminusx_U = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Uic-Ul, Ul-Ull, Ur2-Uic);
+
+      wminusx_U *= Uic - Ul;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Ull2 = (Ull2 + U[ ntop[nltl] ]) * HALF;
+         wminusx_U = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Uic-Ult, Ult-Ull2, Ur2-Uic) *
+                      (Uic - Ult))+wminusx_U)*HALF*HALF;
+      }
+
+
+      real_t wplusx_U = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                              Ur-Uic, Uic-Ul2, Urr-Ur);
+
+      wplusx_U *= Ur - Uic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Urr2 = (Urr2 + U[ ntop[nrtr] ]) * HALF;
+         wplusx_U = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Urt-Uic, Uic-Ul2, Urr2-Urt) *
+                      (Urt - Uic))+wplusx_U)*HALF*HALF;
+      }
+
+
+      if(level[nb] < level[nbb]) {
+         Hbb = (Hbb + H[ nrht[nbb] ]) * HALF;
+         Vbb = (Vbb + V[ nrht[nbb] ]) * HALF;
+      }
+
+      real_t Ht2 = Ht;
+      real_t Vt2 = Vt;
+      if(lvl < level[nt]) {
+         Ht2 = (Ht2 + Htr) * HALF;
+         Vt2 = (Vt2 + Vtr) * HALF;
+      }
+
+      real_t Hyminus = H[ic];
+      real_t Uyminus = 0.0;
+      real_t Vyminus = 0.0;
+      if (mesh->map_ycell2face_bot1[ic] >= 0){
+         Hyminus  = Hy[mesh->map_ycell2face_bot1[ic]];
+         Uyminus  = Uy[mesh->map_ycell2face_bot1[ic]];
+         Vyminus  = Vy[mesh->map_ycell2face_bot1[ic]];
+      }
+
+      real_t Hyminus2 = 0.0;
+      if(lvl < level[nb]) Hyminus2 = H[ic];
+      real_t Uyminus2 = 0.0;
+      real_t Vyminus2 = 0.0;
+      if (mesh->map_ycell2face_bot2[ic] >= 0){
+         Hyminus2 = Hy[mesh->map_ycell2face_bot2[ic]];
+         Uyminus2 = Uy[mesh->map_ycell2face_bot2[ic]];
+         Vyminus2 = Vy[mesh->map_ycell2face_bot2[ic]];
+      }
+
+      real_t Hyplus = H[ic];
+      real_t Uyplus = 0.0;
+      real_t Vyplus = 0.0;
+      if (mesh->map_ycell2face_top1[ic] >= 0){
+         Hyplus   = Hy[mesh->map_ycell2face_top1[ic]];
+         Uyplus   = Uy[mesh->map_ycell2face_top1[ic]];
+         Vyplus   = Vy[mesh->map_ycell2face_top1[ic]];
+      }
+
+      real_t Hyplus2 = 0.0;
+      if(lvl < level[nt]) Hyplus2 = H[ic];
+      real_t Uyplus2 = 0.0;
+      real_t Vyplus2 = 0.0;
+      if (mesh->map_ycell2face_top2[ic] >= 0){
+         Hyplus2  = Hy[mesh->map_ycell2face_top2[ic]];
+         Uyplus2  = Uy[mesh->map_ycell2face_top2[ic]];
+         Vyplus2  = Vy[mesh->map_ycell2face_top2[ic]];
+      }
+
+      real_t wminusy_H = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Hic-Hb, Hb-Hbb, Ht2-Hic);
+
+      wminusy_H *= Hic - Hb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Hbb2 = (Hbb2 + H[ nrht[nbrb] ]) * HALF;
+         wminusy_H = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Hic-Hbr, Hbr-Hbb2, Ht2-Hic) *
+                      (Hic - Hbr))+wminusy_H)*HALF*HALF;
+      }
+
+
+      if(level[nt] < level[ntt]) {
+         Htt = (Htt + H[ nrht[ntt] ]) * HALF;
+         Vtt = (Vtt + V[ nrht[ntt] ]) * HALF;
+      }
+
+      real_t Hb2 = Hb;
+      real_t Vb2 = Vb;
+      if(lvl < level[nb]) {
+         Hb2 = (Hb2 + Hbr) * HALF;
+         Vb2 = (Vb2 + Vbr) * HALF;
+      }
+
+      real_t wplusy_H = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                             Ht-Hic, Hic-Hb2, Htt-Ht);
+
+      wplusy_H *= Ht - Hic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Htt2 = (Htt2 + H[ nrht[ntrt] ]) * HALF;
+         wplusy_H = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Htr-Hic, Hic-Hb2, Htt2-Htr) *
+                      (Htr - Hic))+wplusy_H)*HALF*HALF;
+      }
+
+      real_t wminusy_V = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Vic-Vb, Vb-Vbb, Vt2-Vic);
+
+      wminusy_V *= Vic - Vb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Vbb2 = (Vbb2 + V[ nrht[nbrb] ]) * HALF;
+         wminusy_V = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Vic-Vbr, Vbr-Vbb2, Vt2-Vic) *
+                      (Vic - Vbr))+wminusy_V)*HALF*HALF;
+      }
+
+      real_t wplusy_V = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                           Vt-Vic, Vic-Vb2, Vtt-Vt);
+
+      wplusy_V *= Vt - Vic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Vtt2 = (Vtt2 + V[ nrht[ntrt] ]) * HALF;
+         wplusy_V = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Vtr-Vic, Vic-Vb2, Vtt2-Vtr) *
+                      (Vtr - Vic))+wplusy_V)*HALF*HALF;
+      }
+
+      real_t Hxfluxminus = HNEWXFLUXMINUS;
+      real_t Uxfluxminus = UNEWXFLUXMINUS;
+      real_t Vxfluxminus = UVNEWFLUXMINUS;
+
+      real_t Hxfluxplus  = HNEWXFLUXPLUS;
+      real_t Uxfluxplus  = UNEWXFLUXPLUS;
+      real_t Vxfluxplus  = UVNEWFLUXPLUS;
+
+      real_t Hyfluxminus = HNEWYFLUXMINUS;
+      real_t Uyfluxminus = VUNEWFLUXMINUS;
+      real_t Vyfluxminus = VNEWYFLUXMINUS;
+
+      real_t Hyfluxplus  = HNEWYFLUXPLUS;
+      real_t Uyfluxplus  = VUNEWFLUXPLUS;
+      real_t Vyfluxplus  = VNEWYFLUXPLUS;
+
+      if(lvl < level[nl]) {
+         Hxfluxminus = (Hxfluxminus + HNEWXFLUXMINUS2) * HALF;
+         Uxfluxminus = (Uxfluxminus + UNEWXFLUXMINUS2) * HALF;
+         Vxfluxminus = (Vxfluxminus + UVNEWFLUXMINUS2) * HALF;
+      }
+
+      if(lvl < level[nr]) {
+         Hxfluxplus  = (Hxfluxplus + HNEWXFLUXPLUS2) * HALF;
+         Uxfluxplus  = (Uxfluxplus + UNEWXFLUXPLUS2) * HALF;
+         Vxfluxplus  = (Vxfluxplus + UVNEWFLUXPLUS2) * HALF;
+      }
+
+      if(lvl < level[nb]) {
+         Hyfluxminus = (Hyfluxminus + HNEWYFLUXMINUS2) * HALF;
+         Uyfluxminus = (Uyfluxminus + VUNEWFLUXMINUS2) * HALF;
+         Vyfluxminus = (Vyfluxminus + VNEWYFLUXMINUS2) * HALF;
+      }
+
+      if(lvl < level[nt]) {
+         Hyfluxplus  = (Hyfluxplus + HNEWYFLUXPLUS2) * HALF;
+         Uyfluxplus  = (Uyfluxplus + VUNEWFLUXPLUS2) * HALF;
+         Vyfluxplus  = (Vyfluxplus + VNEWYFLUXPLUS2) * HALF;
+      }
+
+      H_new[ic] = U_fullstep(deltaT, dxic, Hic,
+                      Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus)
+                 - wminusx_H + wplusx_H - wminusy_H + wplusy_H;
+      U_new[ic] = U_fullstep(deltaT, dxic, Uic,
+                      Uxfluxplus, Uxfluxminus, Uyfluxplus, Uyfluxminus)
+                 - wminusx_U + wplusx_U;
+      V_new[ic] = U_fullstep(deltaT, dxic, Vic,
+                      Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus)
+                 - wminusy_V + wplusy_V;
+
+#if DEBUG >= 1
+      if (DEBUG >= 1) {
+         real_t U_tmp = U_new[ic];
+         real_t V_tmp = V_new[ic];
+         if (U_tmp == 0.0) U_tmp = 0.0;
+         if (V_tmp == 0.0) V_tmp = 0.0;
+         printf("DEBUG ic %d H_new %lf U_new %lf V_new %lf\n",ic,H_new[ic],U_tmp,V_tmp);
+      }
+#endif
+
+/*
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Hic, %lf Hxfluxplus, %lf Hxfluxminus, %lf Hyfluxplus, %lf Hyfluxminus %lf\n",
+         ic, deltaT, dxic, Hic, Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus);
+      printf("DEBUG ic %d wminusx_H %lf wplusx_H %lf wminusy_H %lf wplusy_H %lf\n",ic, wminusx_H, wplusx_H, wminusy_H, wplusy_H);
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Vic, %lf Vxfluxplus, %lf Vxfluxminus, %lf Vyfluxplus, %lf Vyfluxminus %lf\n",
+         ic, deltaT, dxic, Vic, Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus);
+      printf("DEBUG ic %d wminusy_V %lf wplusy_V %lf\n",ic, wminusy_V, wplusy_V);
+*/
+   }//end forloop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      // Replace H with H_new and deallocate H. New memory will have the characteristics
+      // of the new memory and the name of the old. Both return and arg1 will be reset to new memory
+      H = (state_t *)state_memory.memory_replace(H, H_new);
+      U = (state_t *)state_memory.memory_replace(U, U_new);
+      V = (state_t *)state_memory.memory_replace(V, V_new);
+
+      //state_memory.memory_report();
+      //printf("DEBUG end finite diff\n\n"); 
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+      cpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Advances the device-resident state (dev_H, dev_U, dev_V) by one time step
+ * using the OpenCL finite-difference kernel.
+ *
+ * Steps, in order:
+ *   1. Apply boundary conditions on the device. When built with HAVE_MPI and
+ *      mesh->numpe > 1 this is split into a "local" kernel, a run of
+ *      kernel_copy_state_data into the freshly allocated *_new buffers
+ *      followed by L7_Dev_Update on each array, and a "ghost" kernel;
+ *      otherwise a single boundary-condition kernel is enqueued.
+ *   2. Enqueue kernel_calc_finite_difference reading dev_H/U/V and writing
+ *      dev_H/U/V_new, then wait for its event.
+ *   3. Swap the new buffers in with gpu_state_memory.memory_replace().
+ *
+ * deltaT  Size of the time step; converted to real_t before it is passed to
+ *         the kernel.
+ *
+ * Side effects: raises mesh->ncells_ghost to mesh->ncells if it is smaller,
+ * replaces dev_H, dev_U and dev_V, and adds the time reported by
+ * cpu_timer_stop (scaled by 1.0e9) to gpu_timers[STATE_TIMER_FINITE_DIFFERENCE].
+ */
+void State::gpu_calc_finite_difference(double deltaT)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   //cl_mem dev_ptr = NULL;
+
+   size_t &ncells    = mesh->ncells;
+   size_t &ncells_ghost = mesh->ncells_ghost;
+   if (ncells_ghost < ncells) ncells_ghost = ncells;
+   int &levmx           = mesh->levmx;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_nlft     = mesh->dev_nlft;
+   cl_mem &dev_nrht     = mesh->dev_nrht;
+   cl_mem &dev_nbot     = mesh->dev_nbot;
+   cl_mem &dev_ntop     = mesh->dev_ntop;
+   cl_mem &dev_level    = mesh->dev_level;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+
+   // The kernels declare their cell count as a 32-bit int, and the argument
+   // setter copies exactly sizeof(cl_int) bytes from the address it is given.
+   // ncells is a size_t, so pointing at it directly only yields the right
+   // value when the low-order bytes are stored first (little-endian hosts).
+   // Hand the kernels a genuine cl_int instead.
+   cl_int ncells_arg = (cl_int)ncells;
+
+   assert(dev_H);
+   assert(dev_U);
+   assert(dev_V);
+   assert(dev_nlft);
+   assert(dev_nrht);
+   assert(dev_nbot);
+   assert(dev_ntop);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   cl_mem dev_H_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_H_new"), DEVICE_REGULAR_MEMORY);
+   cl_mem dev_U_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_U_new"), DEVICE_REGULAR_MEMORY);
+   cl_mem dev_V_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_V_new"), DEVICE_REGULAR_MEMORY);
+
+   // Round the global size up to a whole number of work groups.
+   size_t local_work_size = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+#ifdef HAVE_MPI
+   if (mesh->numpe > 1) {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 0, sizeof(cl_int), &ncells_arg);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_local,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+        /*
+        __kernel void copy_state_data_cl(
+                         const int    isize,         // 0
+                __global      state_t *H,            // 1
+                __global      state_t *U,            // 2
+                __global      state_t *V,            // 3
+                __global      state_t *H_new,        // 4
+                __global      state_t *U_new,        // 5
+                __global      state_t *V_new)        // 6
+        */
+
+      ezcl_set_kernel_arg(kernel_copy_state_data, 0, sizeof(cl_int), (void *)&ncells_arg);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 2, sizeof(cl_mem), (void *)&dev_U);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 3, sizeof(cl_mem), (void *)&dev_V);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 4, sizeof(cl_mem), (void *)&dev_H_new);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 5, sizeof(cl_mem), (void *)&dev_U_new);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 6, sizeof(cl_mem), (void *)&dev_V_new);
+
+      //ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_state_data,   1, NULL, &global_work_size, &local_work_size, &copy_state_data_event);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_state_data,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+      dev_H = (cl_mem)gpu_state_memory.memory_replace(dev_H, dev_H_new);
+      dev_U = (cl_mem)gpu_state_memory.memory_replace(dev_U, dev_U_new);
+      dev_V = (cl_mem)gpu_state_memory.memory_replace(dev_V, dev_V_new);
+
+      L7_Dev_Update(dev_H, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_U, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_V, L7_STATE_T, mesh->cell_handle);
+
+      // The first set of *_new buffers was consumed by memory_replace above;
+      // allocate a second set to receive the finite-difference result.
+      dev_H_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_H_new"), DEVICE_REGULAR_MEMORY);
+      dev_U_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_U_new"), DEVICE_REGULAR_MEMORY);
+      dev_V_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_V_new"), DEVICE_REGULAR_MEMORY);
+
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 0, sizeof(cl_int), &ncells_arg);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_ghost,   1, NULL, &global_work_size, &local_work_size, NULL);
+   } else {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells_arg);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions,   1, NULL, &global_work_size, &local_work_size, NULL);
+   }
+#else
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells_arg);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions,   1, NULL, &global_work_size, &local_work_size, NULL);
+#endif
+
+     /*
+     __kernel void calc_finite_difference_cl(
+                      const int     ncells,    // 0  Total number of cells.
+                      const int     lvmax,     // 1  Maximum level
+             __global       state_t *H,        // 2
+             __global       state_t *U,        // 3
+             __global       state_t *V,        // 4
+             __global       state_t *H_new,    // 5
+             __global       state_t *U_new,    // 6
+             __global       state_t *V_new,    // 7
+             __global const int     *nlft,     // 8  Array of left neighbors.
+             __global const int     *nrht,     // 9  Array of right neighbors.
+             __global const int     *ntop,     // 10  Array of top neighbors.
+             __global const int     *nbot,     // 11  Array of bottom neighbors.
+             __global const int     *level,    // 12  Array of level information.
+                      const real_t   deltaT,   // 13  Size of time step.
+             __global const real_t  *lev_dx,   // 14
+             __global const real_t  *lev_dy,   // 15
+             __local        state4_t *tile,    // 16  Tile size in state4.
+             __local        int8  *itile)      // 17  Tile size in int8.
+     */
+   cl_event calc_finite_difference_event;
+
+   // The kernel takes the time step as real_t, which need not be double.
+   real_t deltaT_local = deltaT;
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 0, sizeof(cl_int),  (void *)&ncells_arg);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 1, sizeof(cl_int),  (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 2, sizeof(cl_mem),  (void *)&dev_H);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 3, sizeof(cl_mem),  (void *)&dev_U);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 4, sizeof(cl_mem),  (void *)&dev_V);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 5, sizeof(cl_mem),  (void *)&dev_H_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 6, sizeof(cl_mem),  (void *)&dev_U_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 7, sizeof(cl_mem),  (void *)&dev_V_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 8, sizeof(cl_mem),  (void *)&dev_nlft);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 9, sizeof(cl_mem),  (void *)&dev_nrht);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,10, sizeof(cl_mem),  (void *)&dev_ntop);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,11, sizeof(cl_mem),  (void *)&dev_nbot);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,12, sizeof(cl_mem),  (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,13, sizeof(cl_real_t), (void *)&deltaT_local);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,14, sizeof(cl_mem),  (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,15, sizeof(cl_mem),  (void *)&dev_levdy);
+   // Arguments 16 and 17 are __local scratch: only a size is given, no host pointer.
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,16, local_work_size*sizeof(cl_state4_t),    NULL);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,17, local_work_size*sizeof(cl_int8),    NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_finite_difference,   1, NULL, &global_work_size, &local_work_size, &calc_finite_difference_event);
+
+   // Wait for the kernel before the old buffers are handed back below.
+   ezcl_wait_for_events(1, &calc_finite_difference_event);
+   ezcl_event_release(calc_finite_difference_event);
+
+   dev_H = (cl_mem)gpu_state_memory.memory_replace(dev_H, dev_H_new);
+   dev_U = (cl_mem)gpu_state_memory.memory_replace(dev_U, dev_U_new);
+   dev_V = (cl_mem)gpu_state_memory.memory_replace(dev_V, dev_V_new);
+
+   gpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+}
+#endif
+
+/*
+ * Compares every cell's state with that of its symmetry partner and reports
+ * the cells that disagree.
+ *
+ * string     Label printed at the start of every mismatch line.
+ * sym_index  For cell ic, sym_index[ic] is the index of the cell it is
+ *            compared against.
+ * eps        A difference must be strictly greater than this to be reported.
+ * sign_rule  Selects which velocity components are negated on the partner
+ *            before comparing: U for DIAG_RULE and X_RULE, V for DIAG_RULE
+ *            and Y_RULE.
+ * flag       Incremented once per reported mismatch (H, U and V are counted
+ *            separately); it is never reset here.
+ */
+void State::symmetry_check(const char *string, vector<int> sym_index, double eps,
+                           SIGN_RULE sign_rule, int &flag)
+{
+   const size_t &cell_count = mesh->ncells;
+
+   const double u_sign = (sign_rule == DIAG_RULE || sign_rule == X_RULE) ? -1.0 : 1.0;
+   const double v_sign = (sign_rule == DIAG_RULE || sign_rule == Y_RULE) ? -1.0 : 1.0;
+
+   for (uint ic = 0; ic < cell_count; ++ic) {
+      const int partner = sym_index[ic];
+
+      // Height is compared as-is; it carries no sign under any rule.
+      const double h_gap = fabs(H[ic] - H[partner]);
+      if (h_gap > eps) {
+         printf("%s ic %d sym %d H[ic] %lf Hsym %lf diff %lf\n",
+                string, ic, partner, H[ic], H[partner], h_gap);
+         flag += 1;
+      }
+
+      // The printed Usym/Vsym are the raw partner values; only the
+      // difference has the sign applied.
+      const double u_gap = fabs(U[ic] - u_sign*U[partner]);
+      if (u_gap > eps) {
+         printf("%s ic %d sym %d U[ic] %lf Usym %lf diff %lf\n",
+                string, ic, partner, U[ic], U[partner], u_gap);
+         flag += 1;
+      }
+
+      const double v_gap = fabs(V[ic] - v_sign*V[partner]);
+      if (v_gap > eps) {
+         printf("%s ic %d sym %d V[ic] %lf Vsym %lf diff %lf\n",
+                string, ic, partner, V[ic], V[partner], v_gap);
+         flag += 1;
+      }
+   }
+}
+
+size_t State::calc_refine_potential(vector<int> &mpot,int &icount, int &jcount)
+{
+   
+  struct timeval tstart_cpu;
+#ifdef _OPENMP
+#pragma omp parallel 
+{
+#endif
+
+  struct timeval tstart_lev2;
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+   cpu_timer_start(&tstart_cpu);
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+#ifdef _OPENMP
+}
+#endif
+
+   int *nlft, *nrht, *nbot, *ntop, *level;
+   
+   size_t ncells = mesh->ncells;
+   nlft  = mesh->nlft;
+   nrht  = mesh->nrht;
+   nbot  = mesh->nbot;
+   ntop  = mesh->ntop;
+   level = mesh->level;
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   icount=0;
+   jcount=0;
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef HAVE_MPI
+   // We need to update the ghost regions and boundary regions for the state
+   // variables since they were changed in the finite difference routine. We
+   // want to use the updated values for refinement decisions
+   if (mesh->numpe > 1) {
+      apply_boundary_conditions_local();
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+{
+#endif
+      L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+}
+#pragma omp barrier
+#endif
+      apply_boundary_conditions_ghost();
+   } else {
+      apply_boundary_conditions();
+   }
+#else
+   apply_boundary_conditions();
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+/*****HIGH LEVEL OMP******/
+
+   int lowerBound, upperBound;
+   //mesh->set_bounds(ncells);
+   mesh->get_bounds(lowerBound,upperBound);
+   for (int ic=lowerBound; ic<upperBound; ic++) {
+
+      if (mesh->celltype[ic] != REAL_CELL) continue;
+
+      state_t Hic = H[ic];
+      //state_t Uic = U[ic];
+      //state_t Vic = V[ic];
+
+      int nl = nlft[ic];
+      state_t Hl = H[nl];
+      //state_t Ul = U[nl];
+      //state_t Vl = V[nl];
+
+      if (level[nl] > level[ic]){
+         int nlt = ntop[nl];
+         Hl = REFINE_HALF * (Hl + H[nlt]);
+      }
+
+      int nr = nrht[ic];
+      state_t Hr = H[nr];
+      //state_t Ur = U[nr];
+      //state_t Vr = V[nr];
+
+      if (level[nr] > level[ic]){
+         int nrt = ntop[nr];
+         Hr = REFINE_HALF * (Hr + H[nrt]);
+      }
+
+      int nb = nbot[ic];
+      state_t Hb = H[nb];
+      //state_t Ub = U[nb];
+      //state_t Vb = V[nb];
+
+      if (level[nb] > level[ic]){
+         int nbr = nrht[nb];
+         Hb = REFINE_HALF * (Hb + H[nbr]);
+      }
+
+      int nt = ntop[ic];
+      state_t Ht = H[nt];
+      //state_t Ut = U[nt];
+      //state_t Vt = V[nt];
+
+      if (level[nt] > level[ic]){
+         int ntr = nrht[nt];
+         Ht = REFINE_HALF * (Ht + H[ntr]);
+      }
+
+      state_t duplus1; //, duplus2;
+      state_t duhalf1; //, duhalf2;
+      state_t duminus1; //, duminus2;
+
+      duplus1 = Hr-Hic;
+      //duplus2 = Ur-Uic;
+      duhalf1 = Hic-Hl;
+      //duhalf2 = Uic-Ul;
+
+      state_t qmax = REFINE_NEG_THOUSAND;
+
+      state_t qpot = max(fabs(duplus1/Hic), fabs(duhalf1/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      duminus1 = Hic-Hl;
+      //duminus2 = Uic-Ul;
+      duhalf1 = Hr-Hic;
+      //duhalf2 = Ur-Uic;
+
+      qpot = max(fabs(duminus1/Hic), fabs(duhalf1/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      duplus1 = Ht-Hic;
+      //duplus2 = Vt-Vic;
+      duhalf1 = Hic-Hb;
+      //duhalf2 = Vic-Vb;
+
+      qpot = max(fabs(duplus1/Hic), fabs(duhalf1/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      duminus1 = Hic-Hb;
+      //duminus2 = Vic-Vb;
+      duhalf1 = Ht-Hic;
+      //duhalf2 = Vt-Vic;
+
+      qpot = max(fabs(duminus1/Hic), fabs(duhalf1/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      mpot[ic]=0;
+      if (qmax > REFINE_GRADIENT && level[ic] < mesh->levmx) {
+         mpot[ic]=1;
+      } else if (qmax < COARSEN_GRADIENT && level[ic] > 0) {
+         mpot[ic] = -1;
+      }
+      //if (mpot[ic]) printf("DEBUG cpu cell is %d mpot %d\n",ic,mpot[ic]);
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+   if (TIMING_LEVEL >= 2) {
+      cpu_timers[STATE_TIMER_CALC_MPOT] += cpu_timer_stop(tstart_lev2);
+   }
+#ifdef _OPENMP
+}
+#endif
+
+#ifdef _OPENMP
+}
+#pragma omp barrier
+#endif
+   int newcount = mesh->refine_smooth(mpot, icount, jcount);
+   //printf("DEBUG -- after refine smooth in file %s line %d icount %d jcount %d newcount %d\n",__FILE__,__LINE__,icount,jcount,newcount);
+
+   cpu_timers[STATE_TIMER_REFINE_POTENTIAL] += cpu_timer_stop(tstart_cpu);
+
+   return(newcount);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Device (OpenCL) refinement-potential pass.
+ *
+ * Steps, in order:
+ *   1. Enqueue the boundary-condition kernel(s) on dev_H/dev_U/dev_V. When
+ *      built with HAVE_MPI and mesh->numpe > 1, the "local" kernel, three
+ *      L7_Dev_Update calls and the "ghost" kernel are issued instead of the
+ *      single combined kernel.
+ *   2. Allocate dev_mpot (mesh->ncells_ghost ints) plus two temporary
+ *      reduction buffers and enqueue kernel_refine_potential.
+ *   3. Call mesh->gpu_rezone_count2() and read the int2 result back into
+ *      icount / jcount.
+ *   4. Free the two temporary buffers and hand dev_mpot, icount and jcount
+ *      to mesh->gpu_refine_smooth().
+ *
+ * icount, jcount  Zeroed on entry, then overwritten with the two ints read
+ *                 from the device result buffer before the smoothing call.
+ * Returns         The value of mesh->gpu_refine_smooth(), cast to size_t.
+ *
+ * dev_mpot is allocated here and is not released in this function.
+ * Elapsed time is added to gpu_timers[STATE_TIMER_CALC_MPOT] (only when
+ * TIMING_LEVEL >= 2) and to gpu_timers[STATE_TIMER_REFINE_POTENTIAL].
+ */
+size_t State::gpu_calc_refine_potential(int &icount, int &jcount)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   struct timeval tstart_lev2;
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+   int &levmx           = mesh->levmx;
+   cl_mem &dev_nlft     = mesh->dev_nlft;
+   cl_mem &dev_nrht     = mesh->dev_nrht;
+   cl_mem &dev_nbot     = mesh->dev_nbot;
+   cl_mem &dev_ntop     = mesh->dev_ntop;
+   cl_mem &dev_i        = mesh->dev_i;
+   cl_mem &dev_j        = mesh->dev_j;
+   cl_mem &dev_level    = mesh->dev_level;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+
+   assert(dev_H);
+   assert(dev_U);
+   assert(dev_V);
+   assert(dev_nlft);
+   assert(dev_nrht);
+   assert(dev_nbot);
+   assert(dev_ntop);
+   assert(dev_i);
+   assert(dev_j);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   icount = 0;
+   jcount = 0;
+
+   // Round the global size up to a whole number of work groups.
+   size_t local_work_size = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+   size_t block_size = global_work_size/local_work_size;
+
+#ifdef HAVE_MPI
+   if (mesh->numpe > 1) {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_local,   1, NULL, &global_work_size, &local_work_size, NULL);
+
+      L7_Dev_Update(dev_H, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_U, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_V, L7_STATE_T, mesh->cell_handle);
+
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_ghost,   1, NULL, &global_work_size, &local_work_size, NULL);
+   } else {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions,   1, NULL, &global_work_size, &local_work_size, NULL);
+   }
+#else
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions,   1, NULL, &global_work_size, &local_work_size, NULL);
+#endif
+
+#ifdef BOUNDS_CHECK
+      {
+         // Debug-only: copy the neighbor, level and H arrays back to the host
+         // and print a warning for every suspicious entry.
+         const int nghost = (int)mesh->ncells_ghost;
+         vector<int> nlft_tmp(mesh->ncells_ghost);
+         vector<int> nrht_tmp(mesh->ncells_ghost);
+         vector<int> nbot_tmp(mesh->ncells_ghost);
+         vector<int> ntop_tmp(mesh->ncells_ghost);
+         vector<int> level_tmp(mesh->ncells_ghost);
+         vector<state_t> H_tmp(mesh->ncells_ghost);
+         ezcl_enqueue_read_buffer(command_queue, dev_nlft,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nlft_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nrht,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nrht_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_nbot,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nbot_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_ntop,  CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_int), &ntop_tmp[0],  NULL);
+         ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+         // H holds state values, not ints: size the transfer with the state
+         // element type so the whole of H_tmp is filled.
+         ezcl_enqueue_read_buffer(command_queue, dev_H,     CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_state_t), &H_tmp[0], NULL);
+         for (uint ic=0; ic<ncells; ic++){
+            // An index that was just reported as out of range must not be
+            // used to look up level_tmp[], hence the else branches.
+            int nl = nlft_tmp[ic];
+            if (nl<0 || nl>= nghost) {
+               printf("%d: Warning at line %d cell %d nlft %d\n",mesh->mype,__LINE__,ic,nl);
+            } else if (level_tmp[nl] > level_tmp[ic]){
+               int ntl = ntop_tmp[nl];
+               if (ntl<0 || ntl>= nghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mesh->mype,__LINE__,ic,ic+mesh->noffset,nl,ntl);
+            }
+            int nr = nrht_tmp[ic];
+            if (nr<0 || nr>= nghost) {
+               printf("%d: Warning at line %d cell %d nrht %d\n",mesh->mype,__LINE__,ic,nr);
+            } else if (level_tmp[nr] > level_tmp[ic]){
+               int ntr = ntop_tmp[nr];
+               if (ntr<0 || ntr>= nghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mesh->mype,__LINE__,ic,ntr);
+            }
+            int nb = nbot_tmp[ic];
+            if (nb<0 || nb>= nghost) {
+               printf("%d: Warning at line %d cell %d nbot %d\n",mesh->mype,__LINE__,ic,nb);
+            } else if (level_tmp[nb] > level_tmp[ic]){
+               int nrb = nrht_tmp[nb];
+               if (nrb<0 || nrb>= nghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mesh->mype,__LINE__,ic,nrb);
+            }
+            int nt = ntop_tmp[ic];
+            if (nt<0 || nt>= nghost) {
+               printf("%d: Warning at line %d cell %d ntop %d\n",mesh->mype,__LINE__,ic,nt);
+            } else if (level_tmp[nt] > level_tmp[ic]){
+               int nrt = nrht_tmp[nt];
+               if (nrt<0 || nrt>= nghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mesh->mype,__LINE__,ic,nrt);
+            }
+         }
+         for (uint ic=0; ic<mesh->ncells_ghost; ic++){
+            if (H_tmp[ic] < 1.0) printf("%d: Warning at line %d cell %d H %lf\n",mesh->mype,__LINE__,ic,H_tmp[ic]);
+         }
+      }
+#endif
+
+   // Temporary buffers for the (icount, jcount) reduction; freed below.
+   size_t result_size = 1;
+   cl_mem dev_result     = ezcl_malloc(NULL, const_cast<char *>("dev_result"),     &result_size,        sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size,         sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+
+   dev_mpot              = ezcl_malloc(NULL, const_cast<char *>("dev_mpot"),       &mesh->ncells_ghost, sizeof(cl_int),  CL_MEM_READ_WRITE, 0);
+
+     /*
+      * Arguments bound to kernel_refine_potential below, by index:
+      *    0  ncells          1  levmx
+      *    2  dev_H           3  dev_U           4  dev_V
+      *    5  dev_nlft        6  dev_nrht        7  dev_ntop        8  dev_nbot
+      *    9  dev_i          10  dev_j
+      *   11  dev_level      12  dev_celltype
+      *   13  dev_levdx      14  dev_levdy
+      *   15  dev_mpot       16  dev_redscratch 17  dev_result
+      *   18  size-only (NULL value): local_work_size * sizeof(cl_state_t)
+      *   19  size-only (NULL value): local_work_size * sizeof(cl_int8)
+      * NOTE(review): the kernel source is not visible from this file;
+      * confirm that this order matches the kernel's parameter list.
+      */
+
+   ezcl_set_kernel_arg(kernel_refine_potential, 0, sizeof(cl_int),  (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_refine_potential, 1, sizeof(cl_int),  (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_refine_potential, 2, sizeof(cl_mem),  (void *)&dev_H);
+   ezcl_set_kernel_arg(kernel_refine_potential, 3, sizeof(cl_mem),  (void *)&dev_U);
+   ezcl_set_kernel_arg(kernel_refine_potential, 4, sizeof(cl_mem),  (void *)&dev_V);
+   ezcl_set_kernel_arg(kernel_refine_potential, 5, sizeof(cl_mem),  (void *)&dev_nlft);
+   ezcl_set_kernel_arg(kernel_refine_potential, 6, sizeof(cl_mem),  (void *)&dev_nrht);
+   ezcl_set_kernel_arg(kernel_refine_potential, 7, sizeof(cl_mem),  (void *)&dev_ntop);
+   ezcl_set_kernel_arg(kernel_refine_potential, 8, sizeof(cl_mem),  (void *)&dev_nbot);
+   ezcl_set_kernel_arg(kernel_refine_potential, 9, sizeof(cl_mem),  (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_refine_potential,10, sizeof(cl_mem),  (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_refine_potential,11, sizeof(cl_mem),  (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_refine_potential,12, sizeof(cl_mem),  (void *)&dev_celltype);
+   ezcl_set_kernel_arg(kernel_refine_potential,13, sizeof(cl_mem),  (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_refine_potential,14, sizeof(cl_mem),  (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_refine_potential,15, sizeof(cl_mem),  (void *)&dev_mpot);
+   ezcl_set_kernel_arg(kernel_refine_potential,16, sizeof(cl_mem),  (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_refine_potential,17, sizeof(cl_mem),  (void *)&dev_result);
+   ezcl_set_kernel_arg(kernel_refine_potential,18, local_work_size*sizeof(cl_state_t),    NULL);
+   ezcl_set_kernel_arg(kernel_refine_potential,19, local_work_size*sizeof(cl_int8),    NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_refine_potential, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+   mesh->gpu_rezone_count2(block_size, local_work_size, dev_redscratch, dev_result);
+
+   // Blocking read of the int2 result: element 0 -> icount, element 1 -> jcount.
+   int count[2] = {0, 0};
+   ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int2), count, NULL);
+   icount  = count[0];
+   jcount  = count[1];
+
+   ezcl_device_memory_delete(dev_redscratch);
+   ezcl_device_memory_delete(dev_result);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[STATE_TIMER_CALC_MPOT] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+   }
+
+   int my_result = mesh->gpu_refine_smooth(dev_mpot, icount, jcount);
+
+   gpu_timers[STATE_TIMER_REFINE_POTENTIAL] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+   return((size_t)my_result);
+}
+#endif
+
+/*
+ * Host-side mass total: adds H[ic] * lev_deltax[level[ic]] * lev_deltay[level[ic]]
+ * over every cell in [0, mesh->ncells) whose celltype is REAL_CELL.
+ *
+ * enhanced_precision_sum selects the method:
+ *   SUM_KAHAN    compensated sum kept as a (sum, correction) pair; the
+ *                result is sum + correction.
+ *   SUM_REGULAR  plain running sum.
+ *   other value  no summation is performed and 0.0 is returned.
+ *
+ * When built with HAVE_MPI and mesh->parallel is set, the per-process value
+ * is combined with MPI_Allreduce over MPI_COMM_WORLD.
+ * Elapsed time is added to cpu_timers[STATE_TIMER_MASS_SUM].
+ */
+double State::mass_sum(int enhanced_precision_sum)
+{
+   size_t &ncells = mesh->ncells;
+   int *celltype = mesh->celltype;
+   int *level    = mesh->level;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   double plain_sum  = 0.0;
+   double mass_total = 0.0;
+
+   if (enhanced_precision_sum == SUM_KAHAN) {
+      struct esum_type partial;
+      partial.sum        = 0.0;
+      partial.correction = 0.0;
+
+      for (int ic = 0; ic < (int)ncells; ic++) {
+         //  Exclude boundary cells.
+         if (celltype[ic] != REAL_CELL) continue;
+
+         const int lev = level[ic];
+         // The previous correction is folded into the running sum, and the
+         // new correction is whatever part of (cell mass + old correction)
+         // did not end up in that sum.
+         double term    = H[ic]*mesh->lev_deltax[lev]*mesh->lev_deltay[lev] + partial.correction;
+         double running = partial.sum + partial.correction;
+         partial.correction = term - (running - partial.sum);
+         partial.sum        = running;
+      }
+
+      // Single-process answer; replaced below when a reduction is required.
+      mass_total = partial.sum + partial.correction;
+#ifdef HAVE_MPI
+      if (mesh->parallel) {
+         struct esum_type reduced;
+         MPI_Allreduce(&partial, &reduced, 1, MPI_TWO_DOUBLES, KNUTH_SUM, MPI_COMM_WORLD);
+         mass_total = reduced.sum + reduced.correction;
+      }
+#endif
+
+   } else if (enhanced_precision_sum == SUM_REGULAR) {
+      for (uint ic=0; ic < ncells; ic++){
+         if (celltype[ic] != REAL_CELL) continue;
+
+         const int lev = level[ic];
+         plain_sum += H[ic]*mesh->lev_deltax[lev]*mesh->lev_deltay[lev];
+      }
+
+      // Single-process answer; replaced below when a reduction is required.
+      mass_total = plain_sum;
+#ifdef HAVE_MPI
+      if (mesh->parallel) {
+         MPI_Allreduce(&plain_sum, &mass_total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      }
+#endif
+   }
+
+   cpu_timers[STATE_TIMER_MASS_SUM] += cpu_timer_stop(tstart_cpu);
+
+   return(mass_total);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Device (OpenCL) mass total, computed with a two-stage reduction.
+ *
+ * A non-zero enhanced_precision_sum selects the "epsum" kernels, which work
+ * on cl_real2_t elements read back as a (sum, correction) pair; zero selects
+ * the plain kernels working on cl_real_t elements.  Stage two is enqueued
+ * only when stage one used more than one work group.
+ *
+ * When built with HAVE_MPI the per-process value is combined with
+ * MPI_Allreduce over MPI_COMM_WORLD.
+ * Elapsed time is added to gpu_timers[STATE_TIMER_MASS_SUM].
+ */
+double State::gpu_mass_sum(int enhanced_precision_sum)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_level    = mesh->dev_level;
+
+   assert(dev_H);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+   assert(dev_celltype);
+
+   size_t single = 1;
+   cl_mem dev_total, dev_partials;
+   double mass_total;
+
+   // Global size is ncells rounded up to a whole number of work groups.
+   size_t group_size  = 128;
+   size_t global_size = ((ncells+group_size - 1) /group_size) * group_size;
+   size_t ngroups     = global_size/group_size;
+
+   if (enhanced_precision_sum) {
+      dev_total    = ezcl_malloc(NULL, const_cast<char *>("dev_mass_sum"), &single,    sizeof(cl_real2_t), CL_MEM_READ_WRITE, 0);
+      dev_partials = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &ngroups, sizeof(cl_real2_t), CL_MEM_READ_WRITE, 0);
+
+      // Stage 1 arguments: 0 ncells, 1 dev_H, 2 dev_level, 3 dev_levdx,
+      // 4 dev_levdy, 5 dev_celltype, 6 result buffer, 7 per-group scratch,
+      // 8 size-only (NULL value) argument of group_size cl_real2_t elements.
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 0, sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 2, sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 3, sizeof(cl_mem), (void *)&dev_levdx);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 4, sizeof(cl_mem), (void *)&dev_levdy);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 5, sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 6, sizeof(cl_mem), (void *)&dev_total);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 7, sizeof(cl_mem), (void *)&dev_partials);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 8, group_size*sizeof(cl_real2_t), NULL);
+
+      ezcl_enqueue_ndrange_kernel(queue, kernel_reduce_epsum_mass_stage1of2, 1, NULL, &global_size, &group_size, NULL);
+
+      if (ngroups > 1) {
+         // Stage 2 arguments: 0 number of groups, 1 result buffer,
+         // 2 per-group scratch, 3 size-only (NULL value) argument.
+         // It is launched as a single work group.
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 0, sizeof(cl_int), (void *)&ngroups);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 1, sizeof(cl_mem), (void *)&dev_total);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 2, sizeof(cl_mem), (void *)&dev_partials);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 3, group_size*sizeof(cl_real2_t), NULL);
+
+         ezcl_enqueue_ndrange_kernel(queue, kernel_reduce_epsum_mass_stage2of2, 1, NULL, &group_size, &group_size, NULL);
+      }
+
+      struct esum_type mine, combined;
+      real2_t host_pair;
+
+      // Blocking read of the single (sum, correction) element.
+      ezcl_enqueue_read_buffer(queue, dev_total, CL_TRUE, 0, 1*sizeof(cl_real2_t), &host_pair, NULL);
+
+      mine.sum        = host_pair.s0;
+      mine.correction = host_pair.s1;
+      // Start from this process's value; only an MPI build overwrites it.
+      combined.sum        = mine.sum;
+      combined.correction = mine.correction;
+#ifdef HAVE_MPI
+      MPI_Allreduce(&mine, &combined, 1, MPI_TWO_DOUBLES, KNUTH_SUM, MPI_COMM_WORLD);
+#endif
+      mass_total = combined.sum + combined.correction;
+   } else {
+      dev_total    = ezcl_malloc(NULL, const_cast<char *>("dev_mass_sum"), &single,    sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+      dev_partials = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &ngroups, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+      // Stage 1 arguments: same order as the enhanced branch, with
+      // cl_real_t elements for the size-only argument.
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 0, sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 2, sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 3, sizeof(cl_mem), (void *)&dev_levdx);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 4, sizeof(cl_mem), (void *)&dev_levdy);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 5, sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 6, sizeof(cl_mem), (void *)&dev_total);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 7, sizeof(cl_mem), (void *)&dev_partials);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 8, group_size*sizeof(cl_real_t), NULL);
+
+      ezcl_enqueue_ndrange_kernel(queue, kernel_reduce_sum_mass_stage1of2, 1, NULL, &global_size, &group_size, NULL);
+
+      if (ngroups > 1) {
+         // Stage 2: launched as a single work group.
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 0, sizeof(cl_int), (void *)&ngroups);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 1, sizeof(cl_mem), (void *)&dev_total);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 2, sizeof(cl_mem), (void *)&dev_partials);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 3, group_size*sizeof(cl_real_t), NULL);
+
+         ezcl_enqueue_ndrange_kernel(queue, kernel_reduce_sum_mass_stage2of2, 1, NULL, &group_size, &group_size, NULL);
+      }
+
+      double mine, combined;
+      real_t host_value;
+
+      // Blocking read of the single reduced element.
+      ezcl_enqueue_read_buffer(queue, dev_total, CL_TRUE, 0, 1*sizeof(cl_real_t), &host_value, NULL);
+
+      mine     = host_value;
+      // Start from this process's value; only an MPI build overwrites it.
+      combined = mine;
+#ifdef HAVE_MPI
+      MPI_Allreduce(&mine, &combined, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+      mass_total = combined;
+   }
+
+   ezcl_device_memory_delete(dev_partials);
+   ezcl_device_memory_delete(dev_total);
+
+   gpu_timers[STATE_TIMER_MASS_SUM] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+   return(mass_total);
+}
+#endif
+
+#ifdef HAVE_OPENCL
+/*
+ * Request three device buffers of ncells elements, each element
+ * sizeof(cl_state_t) bytes, from gpu_state_memory under the names "dev_H",
+ * "dev_U" and "dev_V", and store the handles in the members of the same
+ * names.  Any handle previously held in those members is overwritten
+ * without being released here.
+ */
+void State::allocate_device_memory(size_t ncells)
+{
+   const size_t elem_bytes = sizeof(cl_state_t);
+
+   dev_H = (cl_mem)gpu_state_memory.memory_malloc(ncells, elem_bytes, const_cast<char *>("dev_H"), DEVICE_REGULAR_MEMORY);
+   dev_U = (cl_mem)gpu_state_memory.memory_malloc(ncells, elem_bytes, const_cast<char *>("dev_U"), DEVICE_REGULAR_MEMORY);
+   dev_V = (cl_mem)gpu_state_memory.memory_malloc(ncells, elem_bytes, const_cast<char *>("dev_V"), DEVICE_REGULAR_MEMORY);
+}
+#endif
+
+/*
+ * Replace the device state buffers with fresh ones sized for ncells
+ * elements: dev_H, dev_U and dev_V are released through gpu_state_memory
+ * and then allocated again by allocate_device_memory().  The new buffers
+ * are not filled here.
+ *
+ * Without HAVE_OPENCL this function does nothing.
+ */
+void State::resize_old_device_memory(size_t ncells)
+{
+#ifdef HAVE_OPENCL
+   gpu_state_memory.memory_delete(dev_H);
+   gpu_state_memory.memory_delete(dev_U);
+   gpu_state_memory.memory_delete(dev_V);
+   // Same three allocations as at start-up, so share the code.
+   allocate_device_memory(ncells);
+#else
+   // Mark the parameter as intentionally unused in non-OpenCL builds.
+   (void)ncells;
+#endif
+}
+
+#ifdef HAVE_MPI
+/*
+ * Host-side load balance: forwards numcells and state_memory to
+ * mesh->do_load_balance_local() (the middle argument is passed as NULL),
+ * then calls memory_reset_ptrs().
+ */
+void State::do_load_balance_local(size_t &numcells)
+{
+   mesh->do_load_balance_local(numcells, NULL, state_memory);
+
+   memory_reset_ptrs();
+}
+#endif
+#ifdef HAVE_OPENCL
+#ifdef HAVE_MPI
+/*
+ * Device-side load balance: forwards numcells and gpu_state_memory to
+ * mesh->gpu_do_load_balance_local() (the middle argument is passed as NULL).
+ * When that call returns non-zero, dev_H, dev_U and dev_V are looked up
+ * again by name in gpu_state_memory; otherwise they are left untouched.
+ */
+void State::gpu_do_load_balance_local(size_t &numcells)
+{
+   if (! mesh->gpu_do_load_balance_local(numcells, NULL, gpu_state_memory)) return;
+
+   // Refresh the cached handles from the memory manager.
+   dev_H = (cl_mem)gpu_state_memory.get_memory_ptr("dev_H");
+   dev_U = (cl_mem)gpu_state_memory.get_memory_ptr("dev_U");
+   dev_V = (cl_mem)gpu_state_memory.get_memory_ptr("dev_V");
+}
+#endif
+#endif
+
+// Elapsed time of the most recent non-parallel CPU run reported through
+// output_timing_info(); used there as the numerator of the speedup ratio.
+static double reference_time = 0.0;
+
+/*
+ * Gather the per-device timer totals and pass them to output_timer_block().
+ *
+ * do_cpu_calc / do_gpu_calc  Non-zero to total and report that device.
+ * total_elapsed_time         Forwarded unchanged to output_timer_block().
+ *
+ * For each requested device three figures are built from the State and
+ * Mesh timers: compute time, elapsed time (for the GPU this adds the
+ * write and read timers) and mesh time.  The CPU block is emitted before
+ * the GPU block.
+ */
+void State::output_timing_info(int do_cpu_calc, int do_gpu_calc, double total_elapsed_time)
+{
+   const int is_parallel = mesh->parallel;
+
+   double cpu_compute = 0.0, cpu_elapsed = 0.0, cpu_mesh = 0.0;
+   double gpu_compute = 0.0, gpu_elapsed = 0.0, gpu_mesh = 0.0;
+
+   if (do_cpu_calc) {
+      cpu_compute = get_cpu_timer(STATE_TIMER_SET_TIMESTEP) +
+                    get_cpu_timer(STATE_TIMER_FINITE_DIFFERENCE) +
+                    get_cpu_timer(STATE_TIMER_REFINE_POTENTIAL) +
+                    get_cpu_timer(STATE_TIMER_REZONE_ALL) +
+                    mesh->get_cpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                    mesh->get_cpu_timer(MESH_TIMER_LOAD_BALANCE) +
+                    get_cpu_timer(STATE_TIMER_MASS_SUM) +
+                    mesh->get_cpu_timer(MESH_TIMER_CALC_SPATIAL_COORDINATES) +
+                    mesh->get_cpu_timer(MESH_TIMER_PARTITION);
+      // On the CPU there is no transfer cost, so elapsed equals compute.
+      cpu_elapsed = cpu_compute;
+      cpu_mesh    = mesh->get_cpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                    get_cpu_timer(STATE_TIMER_REZONE_ALL) +
+                    mesh->get_cpu_timer(MESH_TIMER_REFINE_SMOOTH) +
+                    mesh->get_cpu_timer(MESH_TIMER_LOAD_BALANCE);
+   }
+
+   if (do_gpu_calc) {
+      gpu_compute = get_gpu_timer(STATE_TIMER_APPLY_BCS) +
+                    get_gpu_timer(STATE_TIMER_SET_TIMESTEP) +
+                    get_gpu_timer(STATE_TIMER_FINITE_DIFFERENCE) +
+                    get_gpu_timer(STATE_TIMER_REFINE_POTENTIAL) +
+                    get_gpu_timer(STATE_TIMER_REZONE_ALL) +
+                    mesh->get_gpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                    mesh->get_gpu_timer(MESH_TIMER_LOAD_BALANCE) +
+                    get_gpu_timer(STATE_TIMER_MASS_SUM) +
+                    mesh->get_gpu_timer(MESH_TIMER_CALC_SPATIAL_COORDINATES) +
+                    mesh->get_gpu_timer(MESH_TIMER_COUNT_BCS);
+      gpu_elapsed = get_gpu_timer(STATE_TIMER_WRITE) + gpu_compute + get_gpu_timer(STATE_TIMER_READ);
+      gpu_mesh    = mesh->get_gpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                    get_gpu_timer(STATE_TIMER_REZONE_ALL) +
+                    mesh->get_gpu_timer(MESH_TIMER_REFINE_SMOOTH) +
+                    mesh->get_gpu_timer(MESH_TIMER_LOAD_BALANCE);
+   }
+
+   // A serial CPU run becomes the baseline for later comparisons.
+   if (do_cpu_calc && ! is_parallel) reference_time = cpu_elapsed;
+
+   // The GPU ratio takes precedence; the CPU ratio is only used for a
+   // parallel CPU run with no GPU calculation.
+   // NOTE(review): when both devices are requested the same (GPU) ratio is
+   // passed to the CPU block as well -- confirm that this is intended.
+   double ratio = 0.0;
+   if (reference_time > 0.0){
+      if (do_gpu_calc) {
+         ratio = reference_time/gpu_elapsed;
+      } else if (do_cpu_calc && is_parallel) {
+         ratio = reference_time/cpu_elapsed;
+      }
+   }
+
+   if (do_cpu_calc) {
+      output_timer_block(MESH_DEVICE_CPU, cpu_elapsed, cpu_mesh, cpu_compute, total_elapsed_time, ratio);
+   }
+   if (do_gpu_calc) {
+      output_timer_block(MESH_DEVICE_GPU, gpu_elapsed, gpu_mesh, gpu_compute, total_elapsed_time, ratio);
+   }
+}
+
+// Print the timing report for one device (CPU or GPU).
+//
+// The report is only emitted when TIMING is defined; without it the routine
+// performs the rank checks and returns.  In a non-parallel run built with
+// MPI, every rank other than 0 returns before printing anything.
+//
+//   device_type         selects the CPU or GPU timers and labels
+//   elapsed_time        total time attributed to this device (s)
+//   mesh_time           time spent in mesh operations (s)
+//   compute_time        device compute time (s)
+//   total_elapsed_time  overall elapsed time (s)
+//   speedup_ratio       printed only when greater than zero
+//
+// The parallel_output/timer_output calls are made by every rank that gets
+// past the early return; their order must not be changed.
+void State::output_timer_block(mesh_device_types device_type, double elapsed_time,
+   double mesh_time, double compute_time, double total_elapsed_time, double speedup_ratio)
+{
+   int mype  = mesh->mype;
+   int parallel = mesh->parallel;
+
+   int rank = mype;
+   if (! parallel) {
+      // We need to get rank info for check routines
+#ifdef HAVE_MPI
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+   }
+
+   if (! parallel && rank) return;
+
+#ifdef TIMING
+   // Only needed for the report, so it lives inside the TIMING section.
+   const char *device_string = (device_type == MESH_DEVICE_CPU) ? "CPU" : "GPU";
+
+   if (rank == 0) {
+      printf("\n");
+      printf("~~~~~~~~~~~~~~~~ Device timing information ~~~~~~~~~~~~~~~~~~\n");
+   }
+
+   if (rank == 0 && parallel) {
+      printf("\n%3s: Parallel timings\n\n",device_string);
+   }
+
+   if (device_type == MESH_DEVICE_GPU) {
+      mesh->parallel_output("GPU: Write to device          time was",  get_gpu_timer(STATE_TIMER_WRITE), 0, "s");
+      mesh->parallel_output("GPU: Read from device         time was",  get_gpu_timer(STATE_TIMER_READ),  0, "s");
+   }
+
+   // Indexed by device_type (CPU entry first, GPU entry second)
+   const char *device_compute_string[2] = {
+      "CPU: Device compute           time was",
+      "GPU: Device compute           time was"
+   };
+   mesh->parallel_output(device_compute_string[device_type], compute_time, 0, "s");
+
+   timer_output(STATE_TIMER_SET_TIMESTEP,                  device_type, 1);
+   timer_output(STATE_TIMER_FINITE_DIFFERENCE,             device_type, 1);
+   timer_output(STATE_TIMER_REFINE_POTENTIAL,              device_type, 1);
+   timer_output(STATE_TIMER_CALC_MPOT,                     device_type, 2);
+   mesh->timer_output(MESH_TIMER_REFINE_SMOOTH,            device_type, 2);
+   timer_output(STATE_TIMER_REZONE_ALL,                    device_type, 1);
+   mesh->timer_output(MESH_TIMER_PARTITION,                device_type, 1);
+   mesh->timer_output(MESH_TIMER_CALC_NEIGHBORS,           device_type, 1);
+   if (mesh->get_calc_neighbor_type() == HASH_TABLE) {
+      mesh->timer_output(MESH_TIMER_HASH_SETUP,            device_type, 2);
+      mesh->timer_output(MESH_TIMER_HASH_QUERY,            device_type, 2);
+      if (parallel) {
+         mesh->timer_output(MESH_TIMER_FIND_BOUNDARY,      device_type, 2);
+         mesh->timer_output(MESH_TIMER_PUSH_SETUP,         device_type, 2);
+         mesh->timer_output(MESH_TIMER_PUSH_BOUNDARY,      device_type, 2);
+         mesh->timer_output(MESH_TIMER_LOCAL_LIST,         device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER1,             device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER2,             device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER_LIST,         device_type, 2);
+         mesh->timer_output(MESH_TIMER_COPY_MESH_DATA,     device_type, 2);
+         mesh->timer_output(MESH_TIMER_FILL_MESH_GHOST,    device_type, 2);
+         mesh->timer_output(MESH_TIMER_FILL_NEIGH_GHOST,   device_type, 2);
+         mesh->timer_output(MESH_TIMER_SET_CORNER_NEIGH,   device_type, 2);
+         mesh->timer_output(MESH_TIMER_NEIGH_ADJUST,       device_type, 2);
+         mesh->timer_output(MESH_TIMER_SETUP_COMM,         device_type, 2);
+      }
+   } else {
+      mesh->timer_output(MESH_TIMER_KDTREE_SETUP,          device_type, 2);
+      mesh->timer_output(MESH_TIMER_KDTREE_QUERY,          device_type, 2);
+   }
+   timer_output(STATE_TIMER_MASS_SUM,                      device_type, 1);
+   if (parallel) {
+      mesh->timer_output(MESH_TIMER_LOAD_BALANCE,          device_type, 1);
+   }
+   mesh->timer_output(MESH_TIMER_CALC_SPATIAL_COORDINATES, device_type, 1);
+   if (! mesh->have_boundary) {
+      mesh->timer_output(MESH_TIMER_COUNT_BCS,             device_type, 1);
+   }
+   if (rank == 0) printf("=============================================================\n");
+
+   // Indexed by device_type (CPU entry first, GPU entry second)
+   const char *profile_string[2] = {
+      "Profiling: Total CPU          time was",
+      "Profiling: Total GPU          time was"
+   };
+   mesh->parallel_output(profile_string[device_type], elapsed_time, 0, "s");
+   if (elapsed_time > 600.0){
+      mesh->parallel_output("                                  or  ", elapsed_time/60.0, 0, "min");
+   }
+
+   // Avoid dividing by zero when no time was recorded for this device
+   double mesh_percent = (elapsed_time > 0.0) ? mesh_time/elapsed_time*100.0 : 0.0;
+
+   if (rank == 0) printf("-------------------------------------------------------------\n");
+   mesh->parallel_output("Mesh Ops (Neigh+rezone+smooth+balance) ",mesh_time, 0, "s");
+   mesh->parallel_output("Mesh Ops Percentage                    ",mesh_percent, 0, "percent");
+   if (rank == 0) printf("=============================================================\n");
+
+   mesh->parallel_output("Profiling: Total              time was",total_elapsed_time, 0, "s");
+   // The minutes line reports the total time, so it is gated on the total time
+   if (total_elapsed_time > 600.0){
+      mesh->parallel_output("                                  or  ",total_elapsed_time/60.0, 0, "min");
+   }
+
+   if (speedup_ratio > 0.0) {
+      mesh->parallel_output("Parallel Speed-up:                    ",speedup_ratio, 0, "Reference Serial CPU");
+   }
+
+   if (rank == 0) printf("=============================================================\n");
+#endif
+}
+
+// Report one state timer through mesh->parallel_output.
+//
+//   category     timer to report; also indexes state_timer_descriptor
+//   device_type  MESH_DEVICE_CPU reads the CPU timer, anything else the GPU timer
+//   timer_level  nesting level; the label is indented by 2*timer_level
+//                characters (at most the 10 blanks available)
+//
+// The label is only formatted on rank 0; the other ranks pass an empty string.
+void State::timer_output(state_timer_category category, mesh_device_types device_type, int timer_level)
+{
+   int mype = mesh->mype;
+
+   double local_time = 0.0;
+   if (device_type == MESH_DEVICE_CPU){
+      local_time = get_cpu_timer(category);
+   } else {
+      local_time = get_gpu_timer(category);
+   }
+
+   // Empty by default (the previous initializer was the two characters "/0")
+   char string[80] = "";
+
+   if (mype == 0) {
+      const char *blank="          ";
+
+      // Indexed by device_type (CPU entry first, GPU entry second)
+      const char *device_string[2] = {
+         "CPU",
+         "GPU"
+      };
+
+      snprintf(string, sizeof(string), "%3s: %.*s%-30.30s\t", device_string[device_type],
+         2*timer_level, blank, state_timer_descriptor[category]);
+   }
+
+   mesh->parallel_output(string, local_time, timer_level, "s");
+}
+
+#ifdef HAVE_OPENCL
+// Debug check: read dev_H, dev_U and dev_V back from the device and print a
+// DEBUG line for every cell whose host value differs from the device value
+// by more than STATE_EPS.
+//
+//   string  label inserted into each DEBUG line
+//   cycle   cycle number inserted into each DEBUG line
+//   ncells  number of cells to read back and compare
+void State::compare_state_gpu_global_to_cpu_global(const char* string, int cycle, uint ncells)
+{
+   // Nothing to compare; also avoids taking &v[0] of an empty vector
+   if (ncells == 0) return;
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   vector<state_t>H_check(ncells);
+   vector<state_t>U_check(ncells);
+   vector<state_t>V_check(ncells);
+   // Only the last read is issued with CL_TRUE
+   ezcl_enqueue_read_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), &H_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), &U_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_V, CL_TRUE,  0, ncells*sizeof(cl_state_t), &V_check[0], NULL);
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic]-H_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d H & H_check %u %lf %lf\n",string,cycle,ic,H[ic],H_check[ic]);
+      if (fabs(U[ic]-U_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d U & U_check %u %lf %lf\n",string,cycle,ic,U[ic],U_check[ic]);
+      if (fabs(V[ic]-V_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d V & V_check %u %lf %lf\n",string,cycle,ic,V[ic],V_check[ic]);
+   }
+}
+#endif
+
+// Debug check: gather the local H, U and V arrays from all ranks and print a
+// DEBUG line for every cell whose gathered value differs from state_global's
+// value by more than STATE_EPS.
+//
+//   state_global   state holding the reference arrays
+//   string         label inserted into each DEBUG line
+//   cycle          cycle number inserted into each DEBUG line
+//   ncells         number of local cells contributed by this rank
+//   ncells_global  length of the gathered arrays
+//   nsizes/ndispl  per-rank counts and displacements for MPI_Allgatherv
+//
+// NOTE(review): without HAVE_MPI nothing fills the check vectors, so the
+// comparison is made against their zero-initialized contents -- confirm
+// whether that is intended.
+void State::compare_state_cpu_local_to_cpu_global(State *state_global, const char* string, int cycle, uint ncells, uint ncells_global, int *nsizes, int *ndispl)
+{
+   state_t *H_global = state_global->H;
+   state_t *U_global = state_global->U;
+   state_t *V_global = state_global->V;
+
+   vector<state_t>H_check(ncells_global);
+   vector<state_t>U_check(ncells_global);
+   vector<state_t>V_check(ncells_global);
+#ifdef HAVE_MPI
+   MPI_Allgatherv(&H[0], ncells, MPI_STATE_T, &H_check[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&U[0], ncells, MPI_STATE_T, &U_check[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&V[0], ncells, MPI_STATE_T, &V_check[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+#else
+   // Just to block compiler warnings
+   if (1 == 2) printf("DEBUG -- ncells %u nsizes %d ndispl %d\n",ncells, nsizes[0],ndispl[0]);
+#endif
+
+   for (uint ic = 0; ic < ncells_global; ic++){
+      if (fabs(H_global[ic]-H_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d H & H_check %u %lf %lf\n",string,cycle,ic,H_global[ic],H_check[ic]);
+      if (fabs(U_global[ic]-U_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d U & U_check %u %lf %lf\n",string,cycle,ic,U_global[ic],U_check[ic]);
+      if (fabs(V_global[ic]-V_check[ic]) > STATE_EPS) printf("DEBUG %s at cycle %d V & V_check %u %lf %lf\n",string,cycle,ic,V_global[ic],V_check[ic]);
+   }
+}
+
+#ifdef HAVE_OPENCL
+// Debug check made of four comparisons, each printing a DEBUG line for every
+// cell that differs by more than STATE_EPS:
+//   1. local device arrays vs. local host arrays (every rank)
+//   2. gathered device arrays vs. state_global's host arrays (rank 0 prints)
+//   3. gathered host arrays vs. state_global's host arrays (rank 0 prints)
+//   4. state_global's device arrays vs. its host arrays (rank 0 prints)
+//
+//   state_global   state holding the reference host and device arrays
+//   ncells         number of local cells
+//   ncells_global  length of the gathered arrays
+//   mype           rank of the caller; also indexes nsizes
+//   ncycle         cycle number inserted into each DEBUG line
+//   nsizes/ndispl  per-rank counts and displacements for MPI_Allgatherv
+//
+// Without HAVE_MPI the routine does nothing.
+void State::compare_state_all_to_gpu_local(State *state_global, uint ncells, uint ncells_global, int mype, int ncycle, int *nsizes, int *ndispl)
+{
+#ifdef HAVE_MPI
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   state_t *H_global = state_global->H;
+   state_t *U_global = state_global->U;
+   state_t *V_global = state_global->V;
+   cl_mem &dev_H_global = state_global->dev_H;
+   cl_mem &dev_U_global = state_global->dev_U;
+   cl_mem &dev_V_global = state_global->dev_V;
+
+   // Need to compare dev_H to H, etc
+   vector<state_t>H_save(ncells);
+   vector<state_t>U_save(ncells);
+   vector<state_t>V_save(ncells);
+   ezcl_enqueue_read_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), &H_save[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), &U_save[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_V, CL_TRUE,  0, ncells*sizeof(cl_state_t), &V_save[0], NULL);
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic]-H_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d H & H_save %u %lf %lf \n",mype,ncycle,ic,H[ic],H_save[ic]);
+      if (fabs(U[ic]-U_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d U & U_save %u %lf %lf \n",mype,ncycle,ic,U[ic],U_save[ic]);
+      if (fabs(V[ic]-V_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d V & V_save %u %lf %lf \n",mype,ncycle,ic,V[ic],V_save[ic]);
+   }
+
+   // And compare dev_H gathered to H_global, etc
+   vector<state_t>H_save_global(ncells_global);
+   vector<state_t>U_save_global(ncells_global);
+   vector<state_t>V_save_global(ncells_global);
+   MPI_Allgatherv(&H_save[0], nsizes[mype], MPI_STATE_T, &H_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&U_save[0], nsizes[mype], MPI_STATE_T, &U_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&V_save[0], nsizes[mype], MPI_STATE_T, &V_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   if (mype == 0) {
+      for (uint ic = 0; ic < ncells_global; ic++){
+         if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d H_global & H_save_global %u %lf %lf \n",mype,ncycle,ic,H_global[ic],H_save_global[ic]);
+         if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d U_global & U_save_global %u %lf %lf \n",mype,ncycle,ic,U_global[ic],U_save_global[ic]);
+         if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d V_global & V_save_global %u %lf %lf \n",mype,ncycle,ic,V_global[ic],V_save_global[ic]);
+      }
+   }
+
+   // And compare H gathered to H_global, etc
+   MPI_Allgatherv(&H[0], nsizes[mype], MPI_STATE_T, &H_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&U[0], nsizes[mype], MPI_STATE_T, &U_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&V[0], nsizes[mype], MPI_STATE_T, &V_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   if (mype == 0) {
+      for (uint ic = 0; ic < ncells_global; ic++){
+         if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d H_global & H_save_global %u %lf %lf \n",ncycle,ic,H_global[ic],H_save_global[ic]);
+         if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d U_global & U_save_global %u %lf %lf \n",ncycle,ic,U_global[ic],U_save_global[ic]);
+         if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d V_global & V_save_global %u %lf %lf \n",ncycle,ic,V_global[ic],V_save_global[ic]);
+      }
+   }
+
+   // Now the global dev_H_global to H_global, etc
+   ezcl_enqueue_read_buffer(command_queue, dev_H_global, CL_FALSE, 0, ncells_global*sizeof(cl_state_t), &H_save_global[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_U_global, CL_FALSE, 0, ncells_global*sizeof(cl_state_t), &U_save_global[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_V_global, CL_TRUE,  0, ncells_global*sizeof(cl_state_t), &V_save_global[0], NULL);
+   if (mype == 0) {
+      for (uint ic = 0; ic < ncells_global; ic++){
+         if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d H_global & H_save_global %u %lf %lf \n",mype,ncycle,ic,H_global[ic],H_save_global[ic]);
+         if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d U_global & U_save_global %u %lf %lf \n",mype,ncycle,ic,U_global[ic],U_save_global[ic]);
+         if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d V_global & V_save_global %u %lf %lf \n",mype,ncycle,ic,V_global[ic],V_save_global[ic]);
+      }
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("%d: DEBUG -- ncells %u ncells_global %u ncycle %d nsizes[0] %d ndispl %d state_global %p\n",
+      mype,ncells,ncells_global,ncycle,nsizes[0],ndispl[0],(void *)state_global);
+#endif
+}
+#endif
+
+// Print a summary of the State object to stdout: with OpenCL, the handle,
+// element count and element size of each device buffer, followed by the
+// report of the state memory database.
+void State::print_object_info(void)
+{
+   printf(" ---- State object info -----\n");
+
+#ifdef HAVE_OPENCL
+   int num_elements, elsize;
+
+   // %p requires a void pointer, so the cl_mem handles are cast explicitly
+   num_elements = ezcl_get_device_mem_nelements(dev_H);
+   elsize = ezcl_get_device_mem_elsize(dev_H);
+   printf("dev_H       ptr : %p nelements %d elsize %d\n",(void *)dev_H,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_U);
+   elsize = ezcl_get_device_mem_elsize(dev_U);
+   printf("dev_U       ptr : %p nelements %d elsize %d\n",(void *)dev_U,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_V);
+   elsize = ezcl_get_device_mem_elsize(dev_V);
+   printf("dev_V       ptr : %p nelements %d elsize %d\n",(void *)dev_V,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_mpot);
+   elsize = ezcl_get_device_mem_elsize(dev_mpot);
+   printf("dev_mpot    ptr : %p nelements %d elsize %d\n",(void *)dev_mpot,num_elements,elsize);
+   //num_elements = ezcl_get_device_mem_nelements(dev_ioffset);
+   //elsize = ezcl_get_device_mem_elsize(dev_ioffset);
+   //printf("dev_ioffset ptr : %p nelements %d elsize %d\n",dev_ioffset,num_elements,elsize);
+#endif
+   state_memory.memory_report();
+   //printf("vector H    ptr : %p nelements %ld elsize %ld\n",&H[0],H.size(),sizeof(H[0]));
+   //printf("vector U    ptr : %p nelements %ld elsize %ld\n",&U[0],U.size(),sizeof(U[0]));
+   //printf("vector V    ptr : %p nelements %ld elsize %ld\n",&V[0],V.size(),sizeof(V[0]));
+}
+
+// Write a per-cell table to mesh->fp, opening the file "out<mype>" first if
+// no file is open yet.
+//
+// When the nlft array holds at least ncells_ghost entries the table lists the
+// mesh indices and neighbors; otherwise it lists H, U and V with the mesh
+// indices.  If the output file cannot be opened a message is written to
+// stderr and nothing is printed.
+void State::print(void)
+{
+   if (mesh->fp == NULL) {
+      char filename[32];
+      snprintf(filename,sizeof(filename),"out%1d",mesh->mype);
+      mesh->fp=fopen(filename,"w");
+      if (mesh->fp == NULL) {
+         fprintf(stderr,"State::print: unable to open %s for writing\n",filename);
+         return;
+      }
+   }
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mesh->mype);
+      // Cells [0, ncells) first, then [ncells, ncells_ghost)
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d  \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+// Version tag stored in slot 0 of the state checkpoint's integer values and
+// compared against the stored value when a checkpoint is restored.
+const int CRUX_STATE_VERSION = 102;
+// Number of integer values stored with a state checkpoint.
+const int num_int_vals       = 1;
+
+// Return the number of bytes counted for a checkpoint: three values per cell
+// (double with FULL_PRECISION, float otherwise), the integer scalars, and the
+// mesh's own checkpoint size.
+size_t State::get_checkpoint_size(void)
+{
+#ifdef FULL_PRECISION
+   const size_t value_bytes = sizeof(double);
+#else
+   const size_t value_bytes = sizeof(float);
+#endif
+
+   size_t total_bytes = mesh->ncells*3*value_bytes;
+   total_bytes += num_int_vals*sizeof(int);
+   total_bytes += mesh->get_checkpoint_size();
+
+   return total_bytes;
+}
+
+// Write the mesh checkpoint, then the state's memory database.  The version
+// scalar and both timer arrays are registered in the database only for the
+// duration of the store and are removed again afterwards.
+void State::store_checkpoint(Crux *crux)
+{
+   // The mesh goes first
+   mesh->store_checkpoint(crux);
+
+   // Scalar values; slot 0 carries the state checkpoint version
+   int scalar_vals[num_int_vals];
+   scalar_vals[0] = CRUX_STATE_VERSION;
+
+   // Temporarily register the scalars and timers so they are stored as well
+   state_memory.memory_add(scalar_vals, (size_t)num_int_vals, 4, "state_int_vals", RESTART_DATA | REPLICATED_DATA);
+   state_memory.memory_add(cpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_cpu_timers", RESTART_DATA);
+   state_memory.memory_add(gpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_gpu_timers", RESTART_DATA);
+
+   crux->store_MallocPlus(state_memory);
+
+   // Unregister again now that the checkpoint has been written
+   state_memory.memory_remove(scalar_vals);
+   state_memory.memory_remove(cpu_timers);
+   state_memory.memory_remove(gpu_timers);
+}
+
+// Restore the mesh checkpoint, allocate the state arrays using the restored
+// "storage" value, then restore the state's memory database and verify the
+// stored version against CRUX_STATE_VERSION.
+//
+// A version mismatch prints a message and terminates the program with a
+// non-zero exit status.
+void State::restore_checkpoint(Crux *crux)
+{
+   // Defined value in case the restore leaves it untouched
+   int storage = 0;
+   // Restore mesh data first
+   mesh->restore_checkpoint(crux);
+   crux->restore_named_ints("storage", 7, &storage, 1);
+
+   // Create memory for restoring data into
+   int int_vals[num_int_vals];
+
+   // allocate is a state method
+   allocate(storage);
+
+   // Add to memory database for restoring checkpoint
+   state_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "state_int_vals", RESTART_DATA | REPLICATED_DATA);
+   state_memory.memory_add(cpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_cpu_timers", RESTART_DATA);
+   state_memory.memory_add(gpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_gpu_timers", RESTART_DATA);
+
+   // Restore memory database
+   crux->restore_MallocPlus(state_memory);
+
+   // Check version number
+   if (int_vals[ 0] != CRUX_STATE_VERSION) {
+      printf("CRUX version mismatch for state data, version on file is %d, version in code is %d\n",
+         int_vals[0], CRUX_STATE_VERSION);
+      // This is a failure, so do not report success to the caller's shell
+      exit(1);
+   }
+
+   // NOTE(review): the two debug sections below are guarded by different
+   // macro names (DEBUG_RESTORE_VALS vs DEBUG_RESTORED_VALS) -- confirm
+   // which one is intended.
+#ifdef DEBUG_RESTORE_VALS
+   if (DEBUG_RESTORE_VALS) {
+      printf("\n");
+      printf("       === Restored state cpu timers ===\n");
+      for (int i = 0; i < STATE_TIMER_SIZE; i++){
+         printf("       %-30s %lg\n",state_timer_descriptor[i], cpu_timers[i]);
+      }
+      printf("       === Restored state cpu timers ===\n");
+      printf("\n");
+   }
+#endif
+
+#ifdef DEBUG_RESTORED_VALS
+   if (DEBUG_RESTORED_VALS) {
+      printf("\n");
+      printf("       === Restored state gpu timers ===\n");
+      for (int i = 0; i < STATE_TIMER_SIZE; i++){
+         printf("       %-30s %lld\n",state_timer_descriptor[i], gpu_timers[i]);
+      }
+      printf("       === Restored state gpu_timers ===\n");
+      printf("\n");
+   }
+#endif
+
+   // The temporary registrations are no longer needed
+   state_memory.memory_remove(int_vals);
+   state_memory.memory_remove(cpu_timers);
+   state_memory.memory_remove(gpu_timers);
+   
+   memory_reset_ptrs();
+}
+
+// Added overloaded print to get mesh information to print in each cycle
+// Brian Atkinson (5-29-14)
+//
+// Opens the file "iteration<iteration>", stores the handle in mesh->fp and
+// writes a header with the iteration and mass information followed by the
+// per-cell table.  When iteration_mass is exactly 0.0 only the initial mass
+// is reported; otherwise the iteration mass and the mass difference are
+// reported too.  If the file cannot be opened a message is written to stderr
+// and nothing is printed.
+//
+// NOTE(review): any handle previously held in mesh->fp is overwritten here
+// without being closed, and the new file is not closed by this routine --
+// confirm who owns mesh->fp.
+void State::print(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage)
+{
+   char filename[40];
+   snprintf(filename,sizeof(filename),"iteration%d",iteration);
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      fprintf(stderr,"State::print: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   // Header lines shared by both cases
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   if(iteration_mass == 0.0){
+      fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tSimulation Time: %lf\n", initial_mass, simTime);
+   }
+   else{
+      double mass_diff = iteration_mass - initial_mass;
+      fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+      fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+   }
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d  \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+// Write the local per-cell table for cycle ncycle to mesh->fp, opening the
+// file "out<mype>" first if no file is open yet.
+//
+// When mesh->nlft is set the table includes the neighbor columns, and the
+// H/U/V columns are left blank for cell indices at or beyond the size of the
+// H array.  If the output file cannot be opened a message is written to
+// stderr and nothing is printed.
+void State::print_local(int ncycle)
+{
+   if (mesh->fp == NULL) {
+      char filename[32];
+      snprintf(filename,sizeof(filename),"out%1d",mesh->mype);
+      mesh->fp=fopen(filename,"w");
+      if (mesh->fp == NULL) {
+         fprintf(stderr,"State::print_local: unable to open %s for writing\n",filename);
+         return;
+      }
+   }
+
+   fprintf(mesh->fp,"DEBUG in print_local ncycle is %d\n",ncycle);
+   if (mesh->nlft != NULL){
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev   nlft   nrht   nbot   ntop\n",mesh->mype);
+      uint state_size = state_memory.get_memory_size(H);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         if (ic >= state_size){
+            // No state values are available for this index
+            fprintf(mesh->fp,"%d: %6d                              %4d  %4d   %4d  %4d  %4d  %4d  %4d\n", mesh->mype,ic, mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+         } else {
+            fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d  %4d  %4d  %4d  %4d\n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+         }
+      }
+   } else {
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d\n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+// Open "failure.log", store the handle in mesh->fp and write the reason for
+// the failure (NaN when got_nan is set, mass difference otherwise), the
+// iteration and mass information, and the per-cell table.  If the file
+// cannot be opened a message is written to stderr and nothing is printed.
+void State::print_failure_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, bool got_nan){
+   const char *filename = "failure.log";
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      fprintf(stderr,"State::print_failure_log: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   double mass_diff = iteration_mass - initial_mass;
+   if(got_nan){
+      fprintf(mesh->fp,"Failed because of nan for H_sum was equal to NAN\n");
+   }
+   else{
+      fprintf(mesh->fp,"Failed because mass difference is outside of accepted percentage\n");
+   }
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+   fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d  \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+// Open "rollback<backup_attempt>.log", store the handle in mesh->fp and write
+// the reason for the rollback (NaN when error_status is STATUS_NAN, mass
+// difference otherwise), the attempt counters, the iteration and mass
+// information, and the per-cell table.  If the file cannot be opened a
+// message is written to stderr and nothing is printed.
+void State::print_rollback_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, int backup_attempt, int num_of_attempts, int error_status){
+   char filename[40];
+   snprintf(filename, sizeof(filename), "rollback%d.log",backup_attempt);
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      fprintf(stderr,"State::print_rollback_log: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   double mass_diff = iteration_mass - initial_mass;
+   if(error_status == STATUS_NAN){
+      fprintf(mesh->fp,"Rolling back because of nan for H_sum was equal to NAN\n");
+   }
+   else{
+      fprintf(mesh->fp,"Rolling back because mass difference is outside of accepted percentage\n");
+   }
+   fprintf(mesh->fp,"Rollback attempt %d of %d ---> Number of attempts left:%d\n", backup_attempt, num_of_attempts, num_of_attempts - backup_attempt);
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+   fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d:   index global  i     j     lev   nlft  nrht  nbot  ntop \n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d  %6d %4d  %4d   %4d  %4d  %4d  %4d  %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d:  index     H        U         V      i     j     lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d  %4d   %4d  \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/state.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,364 @@
+/*
+ *  Copyright (c) 2011-2013, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef STATE_H_
+#define STATE_H_
+
+#include <list>
+#include "MallocPlus.h"
+#include "mesh.h"
+#include "crux.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+//#include "l7/l7.h"
+
+#define STATUS_OK        0
+#define STATUS_NAN       1
+#define STATUS_MASS_LOSS 2
+
+#if !defined(FULL_PRECISION) && !defined(MIXED_PRECISION) && !defined(MINIMUM_PRECISION)
+#define FULL_PRECISION
+#endif // no precision mode requested: default to FULL_PRECISION
+#ifdef NO_CL_DOUBLE
+#undef  FULL_PRECISION
+#undef  MIXED_PRECISION
+#define MINIMUM_PRECISION
+#endif // NO_CL_DOUBLE replaces any other mode with MINIMUM_PRECISION
+
+#if defined(MINIMUM_PRECISION) // state and intermediate values are both float
+   typedef float state_t; // physics state variables, arrays ncells in size
+   typedef float real_t; // used for intermediate calculations
+   typedef struct
+   {
+      float s0;
+      float s1;
+   }  real2_t; // two-component value with the same element type as real_t
+#define CONSERVATION_EPS    15.0
+#ifdef HAVE_OPENCL
+   typedef cl_float  cl_state_t; // for gpu physics state variables
+   typedef cl_float4 cl_state4_t; // for gpu physics state variables
+   typedef cl_float  cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_float2 cl_real2_t; // for intermediate gpu physics state variables
+   typedef cl_float4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+   #define MPI_STATE_T MPI_FLOAT // MPI datatype matching state_t
+   #define MPI_REAL_T MPI_FLOAT // MPI datatype matching real_t (intermediate values)
+   #define L7_STATE_T L7_FLOAT
+   #define L7_REAL_T L7_FLOAT
+#endif
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+   typedef float state_t;
+   typedef double real_t;
+   typedef struct
+   {
+      double s0;
+      double s1;
+   }  real2_t; // two-component value with the same element type as real_t
+#define CONSERVATION_EPS    .02
+#ifdef HAVE_OPENCL
+   typedef cl_float   cl_state_t;
+   typedef cl_float4  cl_state4_t;
+   typedef cl_double  cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
+   typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+   #define MPI_STATE_T MPI_FLOAT
+   #define MPI_REAL_T MPI_DOUBLE
+   #define L7_STATE_T L7_FLOAT
+   #define L7_REAL_T L7_DOUBLE
+#endif
+
+#elif defined(FULL_PRECISION) // state and intermediate values are both double
+   typedef double state_t;
+   typedef double real_t;
+   typedef struct
+   {
+      double s0;
+      double s1;
+   }  real2_t; // two-component value with the same element type as real_t
+#define CONSERVATION_EPS    .02
+#ifdef HAVE_OPENCL
+   typedef cl_double  cl_state_t;
+   typedef cl_double4 cl_state4_t;
+   typedef cl_double  cl_real_t; // for intermediate gpu physics state variables
+   typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
+   typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+   #define MPI_STATE_T MPI_DOUBLE
+   #define MPI_REAL_T MPI_DOUBLE
+   #define L7_STATE_T L7_DOUBLE
+   #define L7_REAL_T L7_DOUBLE
+#endif
+#endif
+
+extern "C" void do_calc(void);
+
+enum SUM_TYPE { // NOTE(review): presumably the enhanced_precision_sum choice for mass_sum -- confirm
+   SUM_REGULAR,
+   SUM_KAHAN
+};
+
+// Type of the sign_rule argument of State::symmetry_check.
+enum SIGN_RULE {
+   DIAG_RULE,
+   X_RULE,
+   Y_RULE,
+};
+
+enum state_timers // indices into State::cpu_timers and State::gpu_timers
+{
+   STATE_TIMER_APPLY_BCS,
+   STATE_TIMER_SET_TIMESTEP,
+   STATE_TIMER_FINITE_DIFFERENCE,
+   STATE_TIMER_REFINE_POTENTIAL,
+   STATE_TIMER_CALC_MPOT,
+   STATE_TIMER_REZONE_ALL,
+   STATE_TIMER_MASS_SUM,
+   STATE_TIMER_READ,
+   STATE_TIMER_WRITE,
+   STATE_TIMER_SIZE // number of categories above; used as the length of both timer arrays
+};
+
+typedef enum state_timers   state_timer_category;
+
+using namespace std;
+
+class State {
+   // Holds the H, U and V state arrays associated with one Mesh, plus per-category timers.
+public:
+   MallocPlus state_memory;
+   MallocPlus gpu_state_memory;
+   Mesh *mesh;
+   state_t *H;
+   state_t *U;
+   state_t *V;
+
+#ifdef HAVE_OPENCL
+   cl_mem dev_H;
+   cl_mem dev_U;
+   cl_mem dev_V;
+
+   cl_mem dev_mass_sum;
+   cl_mem dev_deltaT;
+
+   cl_event apply_BCs_event;
+
+   cl_mem dev_mpot;
+   //cl_mem dev_ioffset;
+   cl_mem dev_result;
+#endif
+
+   double    cpu_timers[STATE_TIMER_SIZE];
+   long long gpu_timers[STATE_TIMER_SIZE];
+
+   // constructor -- allocates state arrays to size ncells
+   State(Mesh *mesh_in);
+
+   void init(int do_gpu_calc);
+   void terminate(void);
+
+   /* Memory routines for linked list of state arrays */
+   void allocate(size_t ncells);
+   void allocate_from_backup_file(FILE *fp);
+   void allocate_for_rollback(State *state_to_copy);
+   void resize(size_t ncells);
+   void memory_reset_ptrs(void);
+#ifdef HAVE_OPENCL
+   void allocate_device_memory(size_t ncells);
+#endif
+   void resize_old_device_memory(size_t ncells);
+
+   /* Accessor routines */
+   double get_cpu_timer(state_timer_category category)  {return(cpu_timers[category]); };
+   /* Scales gpu_timers by 1.0e-9: if they hold nanoseconds, this returns seconds (not msecs) */
+   double get_gpu_timer(state_timer_category category)  {return((double)(gpu_timers[category])*1.0e-9); };
+
+   /* Boundary routines -- not currently used */
+   void add_boundary_cells(void);
+   void apply_boundary_conditions(void);
+   void apply_boundary_conditions_local(void);
+   void apply_boundary_conditions_ghost(void);
+   void remove_boundary_cells(void);
+
+   /*******************************************************************
+   * set_timestep
+   *  Input
+   *    H, U, V -- from state object
+   *    celltype, level, lev_delta
+   *  Output
+   *    mindeltaT returned
+   *******************************************************************/
+   double set_timestep(double g, double sigma);
+#ifdef HAVE_OPENCL
+   double gpu_set_timestep(double sigma);
+#endif
+
+   /*******************************************************************
+   * calc finite difference
+   *      will add ghost region to H, U, V and fill at start of routine
+   *   Input
+   *      H, U, V -- from state object
+   *      nlft, nrht, nbot, ntop, level, celltype -- from mesh object
+   *   Output
+   *      H, U, V
+   *******************************************************************/
+   void calc_finite_difference(double deltaT);
+   void calc_finite_difference_via_faces(double deltaT);
+#ifdef HAVE_OPENCL
+   void gpu_calc_finite_difference(double deltaT);
+#endif
+
+   /*******************************************************************
+   * calc refine potential -- state has responsibility to calc initial
+   *      refinement potential array that is then passed to mesh for
+   *      smoothing and enforcing refinement rules
+   *  Input
+   *    H, U, V -- from state object
+   *  Output
+   *    mpot
+   *    ioffset
+   *    count
+   *******************************************************************/
+   size_t calc_refine_potential(vector<int> &mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+   size_t gpu_calc_refine_potential(int &icount, int &jcount);
+#endif
+
+   /*******************************************************************
+   * rezone all -- most of call is done in mesh
+   *  Input
+   *    Mesh and state variables
+   *  Output
+   *    New mesh and state variables on refined mesh
+   *******************************************************************/
+   void rezone_all(int icount, int jcount, vector<int> mpot);
+#ifdef HAVE_OPENCL
+   void gpu_rezone_all(int icount, int jcount, bool localStencil);
+#endif
+
+   /*******************************************************************
+   * load balance -- most of call is done in mesh, but pointers are
+   *    reset to newly allocated state arrays
+   *  Input
+   *    Mesh and state variables
+   *  Output
+   *    New mesh and state variables on refined mesh
+   *******************************************************************/
+#ifdef HAVE_MPI
+   void do_load_balance_local(size_t &numcells);
+#ifdef HAVE_OPENCL
+   void gpu_do_load_balance_local(size_t &numcells);
+#endif
+#endif
+
+   /*******************************************************************
+   * mass sum -- Conservation of mass check
+   *  Input
+   *    H from state object
+   *    Precision type for sum
+   *  Output
+   *    total mass is returned
+   *******************************************************************/
+   double mass_sum(int enhanced_precision_sum);
+#ifdef HAVE_OPENCL
+   double gpu_mass_sum(int enhanced_precision_sum);
+#endif
+   
+   void fill_circle(double circ_radius, double fill_value, double background);
+   void state_reorder(vector<int> iorder);
+
+   void symmetry_check(const char *string, vector<int> sym_index, double eps, 
+                       SIGN_RULE sign_rule, int &flag);
+
+   void output_timing_info(int do_cpu_calc, int do_gpu_calc, double total_elapsed_time);
+
+   /* state comparison routines */
+#ifdef HAVE_OPENCL
+   void compare_state_gpu_global_to_cpu_global(const char* string, int cycle, uint ncells);
+#endif
+   void compare_state_cpu_local_to_cpu_global(State *state_global, const char* string, int cycle, uint ncells, uint ncells_global, int *nsizes, int *ndispl);
+#ifdef HAVE_OPENCL
+   void compare_state_all_to_gpu_local(State *state_global, uint ncells, uint ncells_global, int mype, int ncycle, int *nsizes, int *ndispl);
+#endif
+
+   void output_timer_block(mesh_device_types device_type, double elapsed_time,
+      double mesh_time, double compute_time, double total_elapsed_time, double speedup_ratio);
+
+   void timer_output(state_timer_category category, mesh_device_types device_type, int timer_level);
+
+   void print(void);
+
+   size_t get_checkpoint_size(void);
+   void store_checkpoint(Crux *crux);
+   void restore_checkpoint(Crux *crux);
+   //Added for a second print on every iteration: Brian Atkinson (5-29-14)
+   void print(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage);  
+   void print_local(int ncycle);
+   void print_failure_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, bool got_nan);
+   void print_rollback_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, int backup_attempt, int num_of_attempts, int error_status);
+
+private:
+   State(const State&); // To block copy constructor so copies are not made inadvertently
+
+   void print_object_info(void);
+};
+
+#endif // ifndef STATE_H_
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/timer.c?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c Sun Sep  3 20:10:18 2017
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#include <sys/time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "timer.h"
+
+/* Store the current gettimeofday() reading in *tstart_cpu. */
+void cpu_timer_start(struct timeval *tstart_cpu){
+#ifdef _OPENMP
+   /* Inside a parallel region only the master thread takes the reading. */
+   if ( !omp_in_parallel() ) {
+      gettimeofday(tstart_cpu, NULL);
+      return;
+   }
+#pragma omp master
+   gettimeofday(tstart_cpu, NULL);
+#else
+   gettimeofday(tstart_cpu, NULL);
+#endif
+}
+
+/*
+ * Return the seconds elapsed between tstart_cpu (as filled in by
+ * cpu_timer_start) and now, using gettimeofday().
+ * Under OpenMP, inside a parallel region only the master thread reads the
+ * clock; the other threads now return 0.0 rather than an uninitialized
+ * value.
+ */
+double cpu_timer_stop(struct timeval tstart_cpu){
+   double result = 0.0;   /* defined return value for non-master threads */
+   struct timeval tstop_cpu;
+
+#ifdef _OPENMP
+   if ( omp_in_parallel() ) {
+#pragma omp master
+      {
+         gettimeofday(&tstop_cpu, NULL);
+         result = (double)(tstop_cpu.tv_sec  - tstart_cpu.tv_sec) +
+                  (double)(tstop_cpu.tv_usec - tstart_cpu.tv_usec)*1.0e-6;
+      }
+      return(result);
+   }
+#endif
+   gettimeofday(&tstop_cpu, NULL);
+   result = (double)(tstop_cpu.tv_sec  - tstart_cpu.tv_sec) +
+            (double)(tstop_cpu.tv_usec - tstart_cpu.tv_usec)*1.0e-6;
+   return(result);
+}
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/timer.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef _TIMER_H
+#define _TIMER_H
+
+#include <sys/time.h>   /* struct timeval, used by the prototypes below */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void cpu_timer_start(struct timeval *tstart_cpu);  /* store the current time in *tstart_cpu */
+double cpu_timer_stop(struct timeval tstart_cpu);  /* seconds elapsed since tstart_cpu */
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* _TIMER_H */
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/zorder.c?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c Sun Sep  3 20:10:18 2017
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include "s7.h"
+#include "zorder.h"
+
+#define DEBUG 0
+
+/* For each cell with level >= 0, store its bit-interleaved (i,j) key in z_index
+ * and its own position in z_order; then hand both arrays to S7_Index_Sort. */
+void calc_zorder(int size, int *i, int *j, int *level, int levmx, int ibase, int *z_index, int *z_order)
+{
+   int cell;
+   for (cell = 0; cell < size; cell++) {
+      if (level[cell] >= 0) {
+         unsigned long long xbits = index_to_bit(i[cell], level[cell], levmx, ibase);
+         unsigned long long ybits = index_to_bit(j[cell], level[cell], levmx, ibase);
+         z_index[cell] = twobit_to_index(xbits, ybits);
+         z_order[cell] = cell;
+      }
+   }
+   S7_Index_Sort(z_index, size, S7_INT, z_order);
+   if (!DEBUG) return;
+   /* Debug only: dump the mesh information after the sort. */
+   printf("orig index   i     j     lev     ibit       jbit       ijbit      z index  z order\n");
+   for (cell = 0; cell < size; cell++) {
+      unsigned long long xbits = index_to_bit(i[cell], level[cell], levmx, ibase);
+      unsigned long long ybits = index_to_bit(j[cell], level[cell], levmx, ibase);
+      printf(" %6d   %4d  %4d   %4d   ",cell+1, j[cell], i[cell], level[cell]);
+      printbits(ybits);
+      printf("   ");
+      printbits(xbits);
+      printf("   ");
+      printbits(xbits | (ybits << 1));
+      printf("   %6d     %5d\n",z_index[cell], z_order[cell]);
+   }
+}
+
+/*
+ * Spread the bits of a cell index so that bit k of the input lands on bit
+ * 2k of the result, leaving the odd bit positions zero.
+ *
+ * index - ibase is first scaled to the finest level by 2^(levmx - lev) when
+ * lev < levmx.  The 32-bit masks make the result meaningful only while the
+ * scaled value fits in 16 bits.
+ */
+unsigned long long index_to_bit(unsigned long long index,
+                        int lev,
+                        int levmx,
+                        int ibase)
+{
+   unsigned long long ibit = index - ibase;
+
+   /* Exact integer scaling; replaces a round trip through pow() and double. */
+   if (lev < levmx) ibit <<= (levmx - lev);
+
+   ibit = (ibit | (ibit << 8)) & 0x00FF00FFULL;  /* 00000000111111110000000011111111 */
+   ibit = (ibit | (ibit << 4)) & 0x0F0F0F0FULL;  /* 00001111000011110000111100001111 */
+   ibit = (ibit | (ibit << 2)) & 0x33333333ULL;  /* 00110011001100110011001100110011 */
+   ibit = (ibit | (ibit << 1)) & 0x55555555ULL;  /* 01010101010101010101010101010101 */
+   return (ibit);
+}
+
+/* Combine two bit patterns: ibit is kept as is, jbit is shifted left by one bit and OR-ed in. */
+unsigned long long twobit_to_index(unsigned long long ibit,
+                           unsigned long long jbit)
+{   return (ibit | (jbit << 1)); }
+
+//   Print n as a binary number, using as many whole bytes (8 digits each) as
+//   are needed to show it; 0 prints as "00000000".  n is treated as unsigned.
+void printbits(int n)
+{
+   //   Unsigned on purpose: the signed version set the sign bit of its mask
+   //   (undefined), and on typical targets printed nothing for positive n.
+   unsigned int value = (unsigned int)n;
+   unsigned int mask  = 1u << (sizeof(value) * 8 - 1);
+   unsigned int step  = ~0u >> 8;   //   Largest value needing one byte less.
+
+   if (0 == value)
+   {   //   For simplicity's sake, treat 0 as a special case.
+      printf("00000000");
+      return; }
+
+   //   Drop leading bytes that are entirely zero.
+   while (step >= value)
+   {   mask >>= 8;
+      step >>= 8; }
+
+   //   mask is now the top bit of the highest byte that value occupies.
+   for (; mask > 0; mask >>= 1)
+      putchar((value & mask) ? '1' : '0');
+}
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/zorder.h?rev=312463&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h Sun Sep  3 20:10:18 2017
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ *  All rights Reserved.
+ *
+ *  Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced 
+ *  under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National 
+ *  Laboratory (LANL), which is operated by Los Alamos National Security, LLC 
+ *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
+ *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR LOS 
+ *  ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
+ *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
+ *  to produce derivative works, such modified software should be clearly marked,
+ *  so as not to confuse it with the version available from LANL.
+ *
+ *  Additionally, redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the Los Alamos National Security, LLC, Los Alamos 
+ *       National Laboratory, LANL, the U.S. Government, nor the names of its 
+ *       contributors may be used to endorse or promote products derived from 
+ *       this software without specific prior written permission.
+ *  
+ *  THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND 
+ *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
+ *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *  
+ *  CLAMR -- LA-CC-11-094
+ *  This research code is being developed as part of the 
+ *  2011 X Division Summer Workshop for the express purpose
+ *  of a collaborative code for development of ideas in
+ *  the implementation of AMR codes for Exascale platforms
+ *  
+ *  AMR implementation of the Wave code previously developed
+ *  as a demonstration code for regular grids on Exascale platforms
+ *  as part of the Supercomputing Challenge and Los Alamos 
+ *  National Laboratory
+ *  
+ *  Authors: Bob Robey       XCP-2   brobey at lanl.gov
+ *           Neal Davis              davis68 at lanl.gov, davis68 at illinois.edu
+ *           David Nicholaeff        dnic at lanl.gov, mtrxknight at aol.com
+ *           Dennis Trujillo         dptrujillo at lanl.gov, dptru10 at gmail.com
+ * 
+ */
+#ifndef _ZORDER_H
+#define _ZORDER_H
+/* Prototypes of four functions; the include guard above makes repeated inclusion harmless. */
+#ifdef __cplusplus
+extern "C"   /* C linkage when this header is compiled as C++ */
+{
+#endif
+/* NOTE(review): calc_zorder's definition is not visible in this chunk; its parameter roles are undocumented -- confirm against its source. */
+void calc_zorder(int size, int *i, int *j, int *level, int levmx, int ibase, int *z_index, int *z_order);
+unsigned long long index_to_bit(unsigned long long index, int lev, int levmx, int ibase);   /* NOTE(review): appears to return (index - ibase), scaled up when lev < levmx, with its bits spread apart -- confirm */
+unsigned long long twobit_to_index(unsigned long long ibit, unsigned long long jbit);   /* returns ibit | (jbit << 1) */
+void printbits(int n);   /* prints n as binary digits with printf */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* _ZORDER_H */
+

Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CMakeLists.txt?rev=312463&r1=312462&r2=312463&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt Sun Sep  3 20:10:18 2017
@@ -1,3 +1,4 @@
 add_subdirectory(HPCCG)
 add_subdirectory(PENNANT)
 add_subdirectory(miniFE)
+add_subdirectory(CLAMR)  # CLAMR mini-app, added by this revision

Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/Makefile?rev=312463&r1=312462&r2=312463&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile Sun Sep  3 20:10:18 2017
@@ -1,6 +1,6 @@
 # MultiSource/DOE-ProxyApps-C++ Makefile: Build all subdirectories automatically
 
 LEVEL = ../../..
-PARALLEL_DIRS = HPCCG PENNANT miniFE
+PARALLEL_DIRS = HPCCG PENNANT miniFE CLAMR
 
 include $(LEVEL)/Makefile.programs