[test-suite] r312488 - Revert "Revert "[test-suite] Adding the CLAMR mini-app""
Hal Finkel via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 4 07:25:36 PDT 2017
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/mesh.cpp?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp Mon Sep 4 07:25:36 2017
@@ -0,0 +1,10456 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifdef HAVE_MPI
+#include "mpi.h"
+#endif
+
+#include <algorithm>
+#include <unistd.h>
+#include <limits.h>
+#include <time.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+//#include "hsfc.h"
+#include "KDTree.h"
+#include "mesh.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+#include "timer.h"
+#ifdef HAVE_MPI
+#include "l7/l7.h"
+#endif
+#include "reduce.h"
+#include "genmalloc.h"
+#include "hash.h"
+
+#define DEBUG 0
+//#define BOUNDS_CHECK 1
+
+#ifndef DEBUG // NOTE(review): DEBUG is defined unconditionally just above, so this guard never fires
+#define DEBUG 0
+#endif
+#define DEBUG_RESTORE_VALS 1
+
+typedef int scanInt;
+void scan ( scanInt *input , scanInt *output , scanInt length);
+
+#ifdef _OPENMP
+#undef REZONE_NO_OPTIMIZATION
+#else
+#define REZONE_NO_OPTIMIZATION 1
+#endif
+
+#define TIMING_LEVEL 2
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) // evaluates one of its arguments twice; avoid arguments with side effects
+
+#define IPOW2(a) (2 << (a)) // 2 shifted left by a, i.e. 2^(a+1); NOTE(review): the name suggests 2^a -- confirm at call sites
+
+#if defined(MINIMUM_PRECISION)
+#define CONSERVATION_EPS .1
+#define STATE_EPS 15.0
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+#define CONSERVATION_EPS .02
+#define STATE_EPS .025
+
+#elif defined(FULL_PRECISION)
+#define CONSERVATION_EPS .02
+#define STATE_EPS .025
+
+#endif // NOTE(review): CONSERVATION_EPS and STATE_EPS stay undefined when none of the three precision macros is set
+
+typedef unsigned int uint;
+#ifdef __APPLE_CC__
+typedef unsigned long ulong;
+#endif
+
+#define TWO 2
+#define HALF 0.5
+
+#define __NEW_STENCIL__
+//#define __OLD_STENCIL__
+//#define STENCIL_WARNING 1
+
+#ifdef STENCIL_WARNING // do_stencil_warning mirrors the compile-time STENCIL_WARNING switch as a run-time flag
+int do_stencil_warning=1;
+#else
+int do_stencil_warning=0;
+#endif
+
+#ifdef HAVE_OPENCL
+#include "mesh_kernel.inc"
+#endif
+
+extern bool localStencil;
+int calc_neighbor_type;
+bool dynamic_load_balance_on;
+bool neighbor_remap;
+
+#ifdef _OPENMP
+static bool iversion_flag = false;
+#endif
+
+static const char *mesh_timer_descriptor[MESH_TIMER_SIZE] = { // one label per mesh timer slot; presumably ordered like the timer enum in mesh.h -- verify
+ "mesh_timer_count_BCs",
+ "mesh_timer_calc_neighbors",
+ "mesh_timer_hash_setup",
+ "mesh_timer_hash_query",
+ "mesh_timer_find_boundary",
+ "mesh_timer_push_setup",
+ "mesh_timer_push_boundary",
+ "mesh_timer_local_list",
+ "mesh_timer_layer1",
+ "mesh_timer_layer2",
+ "mesh_timer_layer_list",
+ "mesh_timer_copy_mesh_data",
+ "mesh_timer_fill_mesh_ghost",
+ "mesh_timer_fill_neigh_ghost",
+ "mesh_timer_set_corner_neigh",
+ "mesh_timer_neigh_adjust",
+ "mesh_timer_setup_comm",
+ "mesh_timer_kdtree_setup",
+ "mesh_timer_kdtree_query",
+ "mesh_timer_refine_smooth",
+ "mesh_timer_rezone_all",
+ "mesh_timer_partition",
+ "mesh_timer_calc_spatial_coordinates",
+ "mesh_timer_load_balance"
+};
+
+#ifdef HAVE_OPENCL // file-scope OpenCL kernel handles, only declared in OpenCL builds; they are not initialized here
+cl_kernel kernel_hash_adjust_sizes;
+cl_kernel kernel_hash_setup;
+cl_kernel kernel_hash_setup_local;
+cl_kernel kernel_neighbor_init;
+cl_kernel kernel_calc_neighbors;
+cl_kernel kernel_calc_neighbors_local;
+cl_kernel kernel_calc_border_cells;
+cl_kernel kernel_calc_border_cells2;
+cl_kernel kernel_finish_scan;
+cl_kernel kernel_get_border_data;
+cl_kernel kernel_calc_layer1;
+cl_kernel kernel_calc_layer1_sethash;
+cl_kernel kernel_calc_layer2;
+cl_kernel kernel_get_border_data2;
+cl_kernel kernel_calc_layer2_sethash;
+cl_kernel kernel_copy_mesh_data;
+cl_kernel kernel_fill_mesh_ghost;
+cl_kernel kernel_fill_neighbor_ghost;
+cl_kernel kernel_set_corner_neighbor;
+cl_kernel kernel_adjust_neighbors_local;
+cl_kernel kernel_reduction_scan2;
+cl_kernel kernel_reduction_count;
+cl_kernel kernel_reduction_count2;
+cl_kernel kernel_hash_size;
+cl_kernel kernel_finish_hash_size;
+cl_kernel kernel_calc_spatial_coordinates;
+cl_kernel kernel_count_BCs;
+cl_kernel kernel_do_load_balance_lower;
+cl_kernel kernel_do_load_balance_middle;
+cl_kernel kernel_do_load_balance_upper;
+#ifndef MINIMUM_PRECISION // the double-precision load-balance kernel is omitted in minimum-precision builds
+cl_kernel kernel_do_load_balance_double;
+#endif
+cl_kernel kernel_do_load_balance_float;
+cl_kernel kernel_refine_smooth;
+cl_kernel kernel_coarsen_smooth;
+cl_kernel kernel_coarsen_check_block;
+cl_kernel kernel_rezone_all;
+cl_kernel kernel_rezone_neighbors;
+#ifndef MINIMUM_PRECISION // likewise for the double-precision rezone kernel
+cl_kernel kernel_rezone_one_double;
+#endif
+cl_kernel kernel_rezone_one_float;
+cl_kernel kernel_copy_mpot_ghost_data;
+cl_kernel kernel_set_boundary_refinement;
+#endif
+
+extern size_t hash_header_size; // defined in another translation unit (presumably the hash library -- confirm)
+extern int choose_hash_method;
+
+/**
+ * Dump the mesh to a text file named "grid<NN>.gph" (NN = ncycle, clamped to
+ * be non-negative) as a list of drawing commands: one "viewport" line, one
+ * "rect" per cell, a polyline ("line_init" followed by "line" entries)
+ * through the cell centers in index order, and one "text" label per cell
+ * carrying the cell index at the cell center.
+ *
+ * If the file cannot be opened an error is reported on stderr and nothing is
+ * written. An empty mesh produces only the viewport line.
+ */
+void Mesh::write_grid(int ncycle)
+{
+   char filename[20];
+
+   if (ncycle<0) ncycle=0;
+   // Bounded formatting: "grid" + at most 10 digits + ".gph" + NUL fits in 20 bytes.
+   snprintf(filename,sizeof(filename),"grid%02d.gph",ncycle);
+
+   FILE *fp=fopen(filename,"w");
+   if (fp == NULL) {
+      // Previously a failed open was passed straight to fprintf/fclose.
+      fprintf(stderr,"Error -- could not open %s for writing\n",filename);
+      return;
+   }
+
+   fprintf(fp,"viewport %lf %lf %lf %lf\n",xmin,ymin,xmax,ymax);
+   for (uint ic = 0; ic < ncells; ic++) {
+      fprintf(fp,"rect %lf %lf %lf %lf\n",x[ic],y[ic],x[ic]+dx[ic],y[ic]+dy[ic]);
+   }
+
+   // The polyline starts at cell 0, so it can only be emitted for a non-empty mesh.
+   if (ncells > 0) {
+      fprintf(fp,"line_init %lf %lf\n",x[0]+0.5*dx[0],y[0]+0.5*dy[0]);
+      for (uint ic = 1; ic < ncells; ic++){
+         fprintf(fp,"line %lf %lf\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic]);
+      }
+   }
+
+   for (uint ic = 0; ic < ncells; ic++){
+      fprintf(fp,"text %lf %lf %d\n",x[ic]+0.5*dx[ic],y[ic]+0.5*dy[ic],ic);
+   }
+
+   fclose(fp);
+}
+
+/**
+ * Construct a mesh from a text description read from fin.
+ *
+ * The header is read one line at a time in this order: "levmax", "cells",
+ * "numpe" (stored through the numpe pointer), "ndim", "xaxis", "yaxis", an
+ * optional "zaxis" (only when ndim == THREE_DIMENSIONAL) and one further line
+ * that is skipped. Every remaining line supplies "index i j level" for one
+ * cell. The process exits with status -1 if a header line is missing.
+ *
+ * After reading, spatial coordinates are computed, the KD-tree is
+ * initialized and the mesh is printed. A mismatch between the number of cell
+ * lines and the declared cell count is reported but is not fatal.
+ */
+Mesh::Mesh(FILE *fin, int *numpe)
+{
+   char string[80];
+   ibase = 1;
+
+   time_t trand;
+   time(&trand);
+   srand48((long)trand);
+
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"levmax %d",&levmx);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"cells %ld",&ncells);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"numpe %d",numpe);
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+   sscanf(string,"ndim %d",&ndim);
+
+   // The axis values are single precision under MINIMUM_PRECISION, so the
+   // conversion specifier has to follow the build. The y and z axes now use
+   // the same scheme as the x axis instead of forcing "%lf" through a
+   // (double*) cast, which writes 8 bytes into a 4-byte variable.
+   // Assumes ymin/deltay/zmin/deltaz have the same type as xmin/deltax -- confirm in mesh.h.
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+#ifdef MINIMUM_PRECISION
+   sscanf(string,"xaxis %f %f",&xmin, &deltax);
+#else
+   sscanf(string,"xaxis %lf %lf",&xmin, &deltax);
+#endif
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+#ifdef MINIMUM_PRECISION
+   sscanf(string,"yaxis %f %f",&ymin, &deltay);
+#else
+   sscanf(string,"yaxis %lf %lf",&ymin, &deltay);
+#endif
+   if (ndim == THREE_DIMENSIONAL){
+      if(fgets(string, 80, fin) == NULL) exit(-1);
+#ifdef MINIMUM_PRECISION
+      sscanf(string,"zaxis %f %f",&zmin, &deltaz);
+#else
+      sscanf(string,"zaxis %lf %lf",&zmin, &deltaz);
+#endif
+   }
+   if(fgets(string, 80, fin) == NULL) exit(-1);
+
+   index.resize(ncells);
+
+   allocate(ncells);
+
+   // Count every remaining line, but only store the first ncells of them so
+   // that a file with surplus lines cannot write past the allocated arrays.
+   uint ic=0;
+   while(fgets(string, 80, fin)!=NULL){
+      if (ic < ncells) {
+         sscanf(string, "%d %d %d %d", &(index[ic]), &(i[ic]), &(j[ic]), &(level[ic]));
+      }
+      ic++;
+   }
+
+   ibase = 0;
+   calc_spatial_coordinates(ibase);
+   KDTree_Initialize(&tree);
+
+
+   print();
+
+   if (ic != ncells) {
+      printf("Error -- cells read does not match number specified\n");
+   }
+   return;
+}
+
+/**
+ * Print one row per cell to stdout: running index, original index, i, j,
+ * level, the four neighbor indices and the cell's x and y extents.
+ * Asserts that the neighbor, coordinate and index arrays are present.
+ */
+void Mesh::print(void)
+{
+   assert(&nlft[0] != NULL);
+   assert(&x[0] != NULL);
+   assert(&index[0] != NULL);
+
+   printf("index orig index i j lev nlft nrht nbot ntop xlow xhigh ylow yhigh\n");
+
+   for (uint cell=0; cell<ncells; cell++) {
+      // Integer columns first, then the low/high coordinates on the same row.
+      printf("%6d %6d %4d %4d %4d %4d %4d %4d %4d ",
+             cell, index[cell], i[cell], j[cell], level[cell],
+             nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+      printf("%8.2lf %8.2lf %8.2lf %8.2lf\n",
+             x[cell], x[cell]+dx[cell], y[cell], y[cell]+dy[cell]);
+   }
+}
+
+/**
+ * Write this rank's cells (owned cells followed by ghost cells) to fp, every
+ * line prefixed with the rank number. The neighbor columns are included only
+ * when the nlft array is large enough to cover ncells_ghost entries;
+ * otherwise only index, i, j and level are written.
+ */
+void Mesh::print_local()
+{
+   if (mesh_memory.get_memory_size(nlft) < ncells_ghost) {
+      // Neighbor arrays do not cover the ghost cells: short form only.
+      fprintf(fp,"%d: index i j lev\n",mype);
+      for (uint cell=0; cell<ncells_ghost; cell++) {
+         fprintf(fp,"%d: %6d %4d %4d %4d \n", mype,cell, i[cell], j[cell], level[cell]);
+      }
+      return;
+   }
+
+   fprintf(fp,"%d: index global i j lev nlft nrht nbot ntop \n",mype);
+
+   // Owned cells.
+   for (uint cell=0; cell<ncells; cell++) {
+      fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n",
+              mype,cell, cell+noffset,i[cell], j[cell], level[cell],
+              nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+   }
+
+   // Ghost cells.
+   for (uint cell=ncells; cell<ncells_ghost; cell++) {
+      fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n",
+              mype,cell, cell+noffset,i[cell], j[cell], level[cell],
+              nlft[cell], nrht[cell], nbot[cell], ntop[cell]);
+   }
+}
+
+#ifdef HAVE_OPENCL
+/**
+ * Read the device copies of i, j, level and the four neighbor arrays back to
+ * the host (ncells_ghost entries each) and write one line per entry to fp,
+ * prefixed with the rank number.
+ *
+ * Only the last read is blocking; the reads are issued on a single command
+ * queue, so this assumes the queue executes in order -- confirm in ezcl.
+ */
+void Mesh::print_dev_local(void)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   vector<int>i_tmp(ncells_ghost);
+   vector<int>j_tmp(ncells_ghost);
+   vector<int>level_tmp(ncells_ghost);
+   vector<int>nlft_tmp(ncells_ghost);
+   vector<int>nrht_tmp(ncells_ghost);
+   vector<int>nbot_tmp(ncells_ghost);
+   vector<int>ntop_tmp(ncells_ghost);
+   ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+   fprintf(fp,"%d: index global i j lev nlft nrht nbot ntop \n",mype);
+   // Exactly ncells_ghost entries were read back, so that is the upper bound
+   // for printing. The previous bound MAX(ncells_ghost,ncells) indexed past
+   // the end of the host buffers whenever ncells exceeded ncells_ghost.
+   for (uint ic=0; ic<ncells_ghost; ic++) {
+      fprintf(fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mype,ic, ic+noffset,i_tmp[ic], j_tmp[ic], level_tmp[ic], nlft_tmp[ic], nrht_tmp[ic], nbot_tmp[ic], ntop_tmp[ic]);
+   }
+}
+
+/**
+ * Compare the device copies of i, j and level against the host arrays for
+ * all ncells_ghost entries and log every difference to fp. The neighbor
+ * arrays are also read back from the device but are not compared here.
+ */
+void Mesh::compare_dev_local_to_local(void)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells_ghost*sizeof(cl_int);
+
+   vector<int> dev_i_host(ncells_ghost);
+   vector<int> dev_j_host(ncells_ghost);
+   vector<int> dev_level_host(ncells_ghost);
+   vector<int> dev_nlft_host(ncells_ghost);
+   vector<int> dev_nrht_host(ncells_ghost);
+   vector<int> dev_nbot_host(ncells_ghost);
+   vector<int> dev_ntop_host(ncells_ghost);
+
+   // Same sequence and blocking flags as before: the level read and the
+   // final ntop read are blocking, all others are non-blocking.
+   ezcl_enqueue_read_buffer(queue, dev_i,     CL_FALSE, 0, nbytes, &dev_i_host[0],     NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,     CL_FALSE, 0, nbytes, &dev_j_host[0],     NULL);
+   ezcl_enqueue_read_buffer(queue, dev_level, CL_TRUE,  0, nbytes, &dev_level_host[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nlft,  CL_FALSE, 0, nbytes, &dev_nlft_host[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nrht,  CL_FALSE, 0, nbytes, &dev_nrht_host[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nbot,  CL_FALSE, 0, nbytes, &dev_nbot_host[0],  NULL);
+   ezcl_enqueue_read_buffer(queue, dev_ntop,  CL_TRUE,  0, nbytes, &dev_ntop_host[0],  NULL);
+
+   fprintf(fp,"\n%d: Comparing mesh for dev_local to local\n\n",mype);
+
+   for (uint cell=0; cell<ncells_ghost; cell++) {
+      if (dev_i_host[cell] != i[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_i %d i %d\n",mype,cell,dev_i_host[cell], i[cell]);
+      }
+      if (dev_j_host[cell] != j[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_j %d j %d\n",mype,cell,dev_j_host[cell], j[cell]);
+      }
+      if (dev_level_host[cell] != level[cell]) {
+         fprintf(fp,"%d: Error: cell %d dev_level %d level %d\n",mype,cell,dev_level_host[cell],level[cell]);
+      }
+   }
+
+   fprintf(fp,"\n%d: Finished comparing mesh for dev_local to local\n\n",mype);
+}
+
+/**
+ * Read the four device neighbor arrays back to the host (ncells entries
+ * each) and print a DEBUG line to stdout for every entry that differs from
+ * the corresponding host neighbor array.
+ */
+void Mesh::compare_neighbors_gpu_global_to_cpu_global()
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells*sizeof(cl_int);
+
+   vector<int> nlft_check(ncells);
+   vector<int> nrht_check(ncells);
+   vector<int> nbot_check(ncells);
+   vector<int> ntop_check(ncells);
+
+   // Only the last read blocks, as in the original call sequence.
+   ezcl_enqueue_read_buffer(queue, dev_nlft, CL_FALSE, 0, nbytes, &nlft_check[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nrht, CL_FALSE, 0, nbytes, &nrht_check[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_nbot, CL_FALSE, 0, nbytes, &nbot_check[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_ntop, CL_TRUE,  0, nbytes, &ntop_check[0], NULL);
+
+   for (uint cell=0; cell<ncells; cell++) {
+      if (nlft[cell] != nlft_check[cell]) {
+         printf("DEBUG -- nlft: ic %d nlft %d nlft_check %d\n",cell, nlft[cell], nlft_check[cell]);
+      }
+      if (nrht[cell] != nrht_check[cell]) {
+         printf("DEBUG -- nrht: ic %d nrht %d nrht_check %d\n",cell, nrht[cell], nrht_check[cell]);
+      }
+      if (nbot[cell] != nbot_check[cell]) {
+         printf("DEBUG -- nbot: ic %d nbot %d nbot_check %d\n",cell, nbot[cell], nbot_check[cell]);
+      }
+      if (ntop[cell] != ntop_check[cell]) {
+         printf("DEBUG -- ntop: ic %d ntop %d ntop_check %d\n",cell, ntop[cell], ntop_check[cell]);
+      }
+   }
+}
+#endif
+
+void Mesh::compare_neighbors_cpu_local_to_cpu_global(uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl)
+{ /* Debug check that this rank's neighbor arrays (nlft/nrht/nbot/ntop) agree
+   * with those of mesh_global. Every owned cell is tagged with mype*1000+ic;
+   * the tags reached through the local neighbor links (one step and two
+   * steps in each direction) are gathered from all ranks and compared with
+   * the tags reached through mesh_global's links. Mismatches are printed to
+   * stdout.
+   *
+   * ncells_ghost  - size of the local tag array; this parameter shadows the
+   *                 Mesh member of the same name
+   * ncells_global - number of cells in mesh_global
+   * mesh_global   - mesh whose neighbor arrays serve as the reference
+   * nsizes        - per-rank element counts passed to MPI_Allgatherv
+   * ndispl        - per-rank displacements passed to MPI_Allgatherv
+   *
+   * Without HAVE_MPI the function does nothing.
+   */
+
+#ifdef HAVE_MPI
+ int *nlft_global = mesh_global->nlft;
+ int *nrht_global = mesh_global->nrht;
+ int *nbot_global = mesh_global->nbot;
+ int *ntop_global = mesh_global->ntop;
+
+ vector<int> Test(ncells_ghost);
+ for(uint ic=0; ic<ncells; ic++){
+ Test[ic] = mype*1000 +ic; // NOTE(review): tags are unique across ranks only while each rank owns fewer than 1000 cells
+ }
+ if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle); // presumably fills the ghost entries of Test from the owning ranks -- confirm against L7
+
+ vector<int> Test_global(ncells_global);
+ MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD); // Test_global: the tags of all ranks, placed according to ndispl
+
+ vector<int> Test_check(ncells);
+ vector<int> Test_check_global(ncells_global);
+
+ // ==================== check left value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nlft[ic]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nlft_global[ic]] != Test_check_global[ic]) {
+ if (mype == 0) printf("%d: Error with nlft for cell %d -- nlft %d global %d check %d\n",mype,ic,nlft_global[ic],Test_global[nlft_global[ic]],Test_check_global[ic]); // one-step mismatches are reported by rank 0 only
+ }
+ }
+
+ // ==================== check left left value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nlft[nlft[ic]]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
+ printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n", // two-step mismatches are reported by every rank
+ mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
+ printf("%d: check %5d -- nlftl %5d nlftl nlftl %5d check %5d\n", // NOTE(review): ic is a global index here but is used to index the local nlft array -- verify
+ mype,ic,nlft[ic],nlft[nlft[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check right value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nrht[ic]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
+ if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check right right value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nrht[nrht[ic]]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
+ printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
+ mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
+ printf("%d: check %5d -- nrhtl %5d nrhtl nrhtl %5d check %5d\n",
+ mype,ic,nrht[ic],nrht[nrht[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check bottom value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nbot[ic]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
+ if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check bottom bottom value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[nbot[nbot[ic]]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
+ printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
+ mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
+ printf("%d: check %5d -- nbotl %5d nbotl nbotl %5d check %5d\n",
+ mype,ic,nbot[ic],nbot[nbot[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check top value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[ntop[ic]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
+ if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
+ }
+ }
+
+ // ==================== check top top value ====================
+ for (uint ic=0; ic<ncells; ic++){
+ Test_check[ic] = Test[ntop[ntop[ic]]];
+ }
+
+ MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+ for (uint ic=0; ic<ncells_global; ic++){
+ if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
+ printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
+ mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
+ printf("%d: check %5d -- ntopl %5d ntopl ntopl %5d check %5d\n",
+ mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
+ }
+ }
+#else
+ // Just to get rid of compiler warnings
+ if (1 == 2) printf("DEBUG -- ncells_global %d ncells_ghost %d mesh_global %p nsizes[0] %d ndispl[0] %d\n", // never executed; exists only so the parameters count as used
+ ncells_global,ncells_ghost,mesh_global,nsizes[0],ndispl[0]);
+#endif
+}
+
+#ifdef HAVE_OPENCL
+/**
+ * Debug check of the local neighbor arrays against both the global mesh and
+ * the device copies.
+ *
+ * Part 1 (CPU local vs. CPU global): every owned cell is tagged with
+ * mype*1000+ic. The tags reached through the local neighbor links are
+ * gathered from all ranks and compared with the tags reached through
+ * mesh_global's links. One-step mismatches are printed by rank 0 only;
+ * two-step mismatches are printed by every rank.
+ *
+ * Part 2 (CPU local vs. GPU local): the device neighbor arrays are read back
+ * (ncells_ghost entries) and compared entry by entry with the host arrays.
+ *
+ * mesh_global - mesh whose neighbor arrays serve as the reference
+ * nsizes      - per-rank element counts passed to MPI_Allgatherv
+ * ndispl      - per-rank displacements passed to MPI_Allgatherv
+ *
+ * Without HAVE_MPI the function does nothing.
+ *
+ * Changes from the previous version: the "top top" check and the GPU
+ * comparison used to be executed twice back to back (a copy-pasted block),
+ * so every such mismatch was reported twice; they now run once. The one-step
+ * left gather, whose comparison was commented out and whose result was never
+ * read, has been removed.
+ */
+void Mesh::compare_neighbors_all_to_gpu_local(Mesh *mesh_global, int *nsizes, int *ndispl)
+{
+#ifdef HAVE_MPI
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells_global = mesh_global->ncells;
+   int *nlft_global = mesh_global->nlft;
+   int *nrht_global = mesh_global->nrht;
+   int *nbot_global = mesh_global->nbot;
+   int *ntop_global = mesh_global->ntop;
+
+   // Checking CPU parallel to CPU global
+   vector<int> Test(ncells_ghost);
+   for(uint ic=0; ic<ncells; ic++){
+      // NOTE(review): tags are unique across ranks only while each rank owns fewer than 1000 cells
+      Test[ic] = mype*1000 +ic;
+   }
+   if (numpe > 1) L7_Update(&Test[0], L7_INT, cell_handle);
+
+   vector<int> Test_global(ncells_global);
+   MPI_Allgatherv(&Test[0], nsizes[mype], MPI_INT, &Test_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   vector<int> Test_check(ncells);
+   vector<int> Test_check_global(ncells_global);
+
+   // The one-step left comparison was disabled in this routine; nlft is
+   // still exercised by the two-step check below.
+
+   // ==================== check left left value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nlft[nlft[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nlft_global[nlft_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nlft nlft for cell %5d -- nlftg %5d nlftg nlftg %5d global %5d\n",
+                mype,ic,nlft_global[ic],nlft_global[nlft_global[ic]],Test_global[nlft_global[nlft_global[ic]]]);
+         printf("%d: check %5d -- nlftl %5d nlftl nlftl %5d check %5d\n",
+                mype,ic,nlft[ic],nlft[nlft[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nrht for cell %d -- %d %d\n",mype,ic,Test_global[nrht_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check right right value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nrht[nrht[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nrht_global[nrht_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nrht nrht for cell %5d -- nrhtg %5d nrhtg nrhtg %5d global %5d\n",
+                mype,ic,nrht_global[ic],nrht_global[nrht_global[ic]],Test_global[nrht_global[nrht_global[ic]]]);
+         printf("%d: check %5d -- nrhtl %5d nrhtl nrhtl %5d check %5d\n",
+                mype,ic,nrht[ic],nrht[nrht[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with nbot for cell %d -- %d %d\n",mype,ic,Test_global[nbot_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check bottom bottom value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[nbot[nbot[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[nbot_global[nbot_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with nbot nbot for cell %5d -- nbotg %5d nbotg nbotg %5d global %5d\n",
+                mype,ic,nbot_global[ic],nbot_global[nbot_global[ic]],Test_global[nbot_global[nbot_global[ic]]]);
+         printf("%d: check %5d -- nbotl %5d nbotl nbotl %5d check %5d\n",
+                mype,ic,nbot[ic],nbot[nbot[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ic]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ic]] != Test_check_global[ic]) {
+         if (mype == 0) printf("%d: Error with ntop for cell %d -- %d %d\n",mype,ic,Test_global[ntop_global[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // ==================== check top top value ====================
+   for (uint ic=0; ic<ncells; ic++){
+      Test_check[ic] = Test[ntop[ntop[ic]]];
+   }
+
+   MPI_Allgatherv(&Test_check[0], nsizes[mype], MPI_INT, &Test_check_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+   for (uint ic=0; ic<ncells_global; ic++){
+      if (Test_global[ntop_global[ntop_global[ic]]] != Test_check_global[ic]) {
+         printf("%d: Error with ntop ntop for cell %5d -- ntopg %5d ntopg ntopg %5d global %5d\n",
+                mype,ic,ntop_global[ic],ntop_global[ntop_global[ic]],Test_global[ntop_global[ntop_global[ic]]]);
+         printf("%d: check %5d -- ntopl %5d ntopl ntopl %5d check %5d\n",
+                mype,ic,ntop[ic],ntop[ntop[ic]],Test_check_global[ic]);
+      }
+   }
+
+   // checking gpu results
+   vector<int> nlft_check(ncells_ghost); vector<int> nrht_check(ncells_ghost);
+   vector<int> nbot_check(ncells_ghost); vector<int> ntop_check(ncells_ghost);
+   ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_check[0], NULL);
+
+   for (uint ic=0; ic<ncells_ghost; ic++){
+      if (nlft[ic] != nlft_check[ic]) printf("%d: Error with gpu calculated nlft for cell %d nlft %d check %d\n",mype,ic,nlft[ic],nlft_check[ic]);
+      if (nrht[ic] != nrht_check[ic]) printf("%d: Error with gpu calculated nrht for cell %d nrht %d check %d\n",mype,ic,nrht[ic],nrht_check[ic]);
+      if (nbot[ic] != nbot_check[ic]) printf("%d: Error with gpu calculated nbot for cell %d nbot %d check %d\n",mype,ic,nbot[ic],nbot_check[ic]);
+      if (ntop[ic] != ntop_check[ic]) printf("%d: Error with gpu calculated ntop for cell %d ntop %d check %d\n",mype,ic,ntop[ic],ntop_check[ic]);
+   }
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- mesh_global %p nsizes[0] %d ndispl[0] %d\n",
+                      mesh_global,nsizes[0],ndispl[0]);
+#endif
+}
+
+/**
+ * Read the device copies of i, j, level and celltype back to the host
+ * (ncells entries each) and print a DEBUG line to stdout for every entry
+ * that differs from the corresponding host array.
+ */
+void Mesh::compare_indices_gpu_global_to_cpu_global(void)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+   const size_t nbytes = ncells*sizeof(cl_int);
+
+   vector<int> i_check(ncells);
+   vector<int> j_check(ncells);
+   vector<int> level_check(ncells);
+   vector<int> celltype_check(ncells);
+
+   // Three non-blocking reads followed by one blocking read.
+   ezcl_enqueue_read_buffer(queue, dev_i,        CL_FALSE, 0, nbytes, &i_check[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,        CL_FALSE, 0, nbytes, &j_check[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_level,    CL_FALSE, 0, nbytes, &level_check[0],    NULL);
+   ezcl_enqueue_read_buffer(queue, dev_celltype, CL_TRUE,  0, nbytes, &celltype_check[0], NULL);
+
+   for (uint cell = 0; cell < ncells; cell++){
+      if (i[cell] != i_check[cell]) {
+         printf("DEBUG -- i: ic %d i %d i_check %d\n",cell, i[cell], i_check[cell]);
+      }
+      if (j[cell] != j_check[cell]) {
+         printf("DEBUG -- j: ic %d j %d j_check %d\n",cell, j[cell], j_check[cell]);
+      }
+      if (level[cell] != level_check[cell]) {
+         printf("DEBUG -- level: ic %d level %d level_check %d\n",cell, level[cell], level_check[cell]);
+      }
+      if (celltype[cell] != celltype_check[cell]) {
+         printf("DEBUG -- celltype: ic %d celltype %d celltype_check %d\n",cell, celltype[cell], celltype_check[cell]);
+      }
+   }
+}
+#endif
+
+ // Debug helper: in MPI builds, gathers the local celltype, i, j and level
+ // arrays from all ranks and prints one line for every global cell whose
+ // gathered value differs from the matching array of mesh_global.
+ //   ncells_global -- number of entries compared
+ //   mesh_global   -- mesh supplying the reference celltype/i/j/level arrays
+ //   nsizes/ndispl -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   cycle         -- cycle number, used only in the printed messages
+ // Without HAVE_MPI nothing is gathered, so the reference arrays are compared
+ // against the zero-filled vectors.
+ void Mesh::compare_indices_cpu_local_to_cpu_global(uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl, int cycle)
+{
+   // Reference data taken from the global mesh.
+   int *ref_celltype = mesh_global->celltype;
+   int *ref_i        = mesh_global->i;
+   int *ref_j        = mesh_global->j;
+   int *ref_level    = mesh_global->level;
+
+   // Receive buffers for the gathered local data.
+   vector<int> gathered_celltype(ncells_global);
+   vector<int> gathered_i(ncells_global);
+   vector<int> gathered_j(ncells_global);
+   vector<int> gathered_level(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(celltype, my_count, MPI_INT, &gathered_celltype[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(i,        my_count, MPI_INT, &gathered_i[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(j,        my_count, MPI_INT, &gathered_j[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(level,    my_count, MPI_INT, &gathered_level[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+#else
+   // Never executed; only references the otherwise unused parameters.
+   if (1 == 2) {
+      printf("DEBUG -- nsizes[0] %d ndispl[0] %d\n",
+             nsizes[0],ndispl[0]);
+   }
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (ref_celltype[cell] != gathered_celltype[cell]) {
+         printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d %d \n",cycle,cell,ref_celltype[cell],gathered_celltype[cell]);
+      }
+      if (ref_i[cell] != gathered_i[cell]) {
+         printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d %d \n",cycle,cell,ref_i[cell],gathered_i[cell]);
+      }
+      if (ref_j[cell] != gathered_j[cell]) {
+         printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d %d \n",cycle,cell,ref_j[cell],gathered_j[cell]);
+      }
+      if (ref_level[cell] != gathered_level[cell]) {
+         printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d %d \n",cycle,cell,ref_level[cell],gathered_level[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+ // Debug helper (does real work only when built with HAVE_MPI): cross-checks
+ // the celltype, i, j and level data four ways and prints a line per mismatch:
+ //   rezone 1 -- local device buffers vs. local host arrays
+ //   rezone 2 -- local device data gathered over all ranks vs. mesh_global host arrays
+ //   rezone 3 -- local host arrays gathered over all ranks vs. mesh_global host arrays
+ //   rezone 4 -- mesh_global device buffers vs. mesh_global host arrays
+ // Parameters:
+ //   mesh_global   -- mesh supplying the reference host arrays and device buffers
+ //   ncells_global -- number of entries in the global comparisons
+ //   nsizes/ndispl -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   ncycle        -- cycle number, used only in the rezone 3 and 4 messages
+ void Mesh::compare_indices_all_to_gpu_local(Mesh *mesh_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
+{
+#ifdef HAVE_MPI
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   // Reference host arrays of the global mesh.
+   int *ref_level    = mesh_global->level;
+   int *ref_celltype = mesh_global->celltype;
+   int *ref_i        = mesh_global->i;
+   int *ref_j        = mesh_global->j;
+
+   const size_t local_bytes  = ncells*sizeof(cl_int);
+   const size_t global_bytes = ncells_global*sizeof(cl_int);
+   const int    my_count     = nsizes[mype];
+
+   // --- rezone 1: local device buffers against local host arrays ---
+   vector<int> gpu_level(ncells);
+   vector<int> gpu_celltype(ncells);
+   vector<int> gpu_i(ncells);
+   vector<int> gpu_j(ncells);
+   // Only the last read is blocking (CL_TRUE).
+   ezcl_enqueue_read_buffer(queue, dev_level,    CL_FALSE, 0, local_bytes, &gpu_level[0],    NULL);
+   ezcl_enqueue_read_buffer(queue, dev_celltype, CL_FALSE, 0, local_bytes, &gpu_celltype[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_i,        CL_FALSE, 0, local_bytes, &gpu_i[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, dev_j,        CL_TRUE,  0, local_bytes, &gpu_j[0],        NULL);
+   for (uint cell = 0; cell < ncells; cell++){
+      if (level[cell] != gpu_level[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d level %d level_check %d\n",mype, cell, level[cell], gpu_level[cell]);
+      }
+      if (celltype[cell] != gpu_celltype[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d celltype %d celltype_check %d\n",mype, cell, celltype[cell], gpu_celltype[cell]);
+      }
+      if (i[cell] != gpu_i[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d i %d i_check %d\n",mype, cell, i[cell], gpu_i[cell]);
+      }
+      if (j[cell] != gpu_j[cell]) {
+         printf("%d: DEBUG rezone 1 cell %d j %d j_check %d\n",mype, cell, j[cell], gpu_j[cell]);
+      }
+   }
+
+   // --- rezone 2: gathered device data against the global host arrays ---
+   vector<int> all_celltype(ncells_global);
+   vector<int> all_i(ncells_global);
+   vector<int> all_j(ncells_global);
+   vector<int> all_level(ncells_global);
+   MPI_Allgatherv(&gpu_celltype[0], my_count, MPI_INT, &all_celltype[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&gpu_i[0],        my_count, MPI_INT, &all_i[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&gpu_j[0],        my_count, MPI_INT, &all_j[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(&gpu_level[0],    my_count, MPI_INT, &all_level[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (ref_level[cell] != all_level[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d level_global %d level_check_global %d\n",mype, cell, ref_level[cell], all_level[cell]);
+      }
+      if (ref_celltype[cell] != all_celltype[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d celltype_global %d celltype_check_global %d\n",mype, cell, ref_celltype[cell], all_celltype[cell]);
+      }
+      if (ref_i[cell] != all_i[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d i_global %d i_check_global %d\n",mype, cell, ref_i[cell], all_i[cell]);
+      }
+      if (ref_j[cell] != all_j[cell]) {
+         printf("%d: DEBUG rezone 2 cell %d j_global %d j_check_global %d\n",mype, cell, ref_j[cell], all_j[cell]);
+      }
+   }
+
+   // --- rezone 3: gathered host data against the global host arrays ---
+   MPI_Allgatherv(celltype, my_count, MPI_INT, &all_celltype[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(i,        my_count, MPI_INT, &all_i[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(j,        my_count, MPI_INT, &all_j[0],        nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   MPI_Allgatherv(level,    my_count, MPI_INT, &all_level[0],    nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (ref_celltype[cell] != all_celltype[cell]) {
+         printf("DEBUG rezone 3 at cycle %d celltype_global & celltype_check_global %d %d %d \n",ncycle,cell,ref_celltype[cell],all_celltype[cell]);
+      }
+      if (ref_i[cell] != all_i[cell]) {
+         printf("DEBUG rezone 3 at cycle %d i_global & i_check_global %d %d %d \n",ncycle,cell,ref_i[cell],all_i[cell]);
+      }
+      if (ref_j[cell] != all_j[cell]) {
+         printf("DEBUG rezone 3 at cycle %d j_global & j_check_global %d %d %d \n",ncycle,cell,ref_j[cell],all_j[cell]);
+      }
+      if (ref_level[cell] != all_level[cell]) {
+         printf("DEBUG rezone 3 at cycle %d level_global & level_check_global %d %d %d \n",ncycle,cell,ref_level[cell],all_level[cell]);
+      }
+   }
+
+   // --- rezone 4: global device buffers against the global host arrays ---
+   ezcl_enqueue_read_buffer(queue, mesh_global->dev_celltype, CL_FALSE, 0, global_bytes, &all_celltype[0], NULL);
+   ezcl_enqueue_read_buffer(queue, mesh_global->dev_i,        CL_FALSE, 0, global_bytes, &all_i[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, mesh_global->dev_j,        CL_FALSE, 0, global_bytes, &all_j[0],        NULL);
+   ezcl_enqueue_read_buffer(queue, mesh_global->dev_level,    CL_TRUE,  0, global_bytes, &all_level[0],    NULL);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (ref_celltype[cell] != all_celltype[cell]) {
+         printf("DEBUG rezone 4 at cycle %d celltype_global & celltype_check_global %d %d %d \n",ncycle,cell,ref_celltype[cell],all_celltype[cell]);
+      }
+      if (ref_i[cell] != all_i[cell]) {
+         printf("DEBUG rezone 4 at cycle %d i_global & i_check_global %d %d %d \n",ncycle,cell,ref_i[cell],all_i[cell]);
+      }
+      if (ref_j[cell] != all_j[cell]) {
+         printf("DEBUG rezone 4 at cycle %d j_global & j_check_global %d %d %d \n",ncycle,cell,ref_j[cell],all_j[cell]);
+      }
+      if (ref_level[cell] != all_level[cell]) {
+         printf("DEBUG rezone 4 at cycle %d level_global & level_check_global %d %d %d \n",ncycle,cell,ref_level[cell],all_level[cell]);
+      }
+   }
+#else
+   // Never executed; only references the otherwise unused parameters.
+   if (1 == 2) {
+      printf("DEBUG -- mesh_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
+             mesh_global,ncells_global,nsizes[0],ndispl[0],ncycle);
+   }
+#endif
+}
+
+ // Debug check of device coordinate and state data against the host copies
+ // (double-precision state).
+ //   dev_x, dev_dx, dev_y, dev_dy -- device buffers read as ncells cl_spatial_t values
+ //   dev_H                        -- device buffer read as ncells cl_double values
+ //   H                            -- host state array compared against dev_H
+ // The coordinate comparison is exact; H is compared with the tolerance
+ // CONSERVATION_EPS. On the first mismatch an error line is printed and the
+ // process terminates with EXIT_FAILURE so that a failed check is visible in
+ // the exit status.
+ void Mesh::compare_coordinates_gpu_global_to_cpu_global_double(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, double *H)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   vector<spatial_t>x_check(ncells);
+   vector<spatial_t>dx_check(ncells);
+   vector<spatial_t>y_check(ncells);
+   vector<spatial_t>dy_check(ncells);
+   vector<double>H_check(ncells);
+
+   // Only the last read is blocking (CL_TRUE).
+   ezcl_enqueue_read_buffer(command_queue, dev_x,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dx, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_y,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dy, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_H,  CL_TRUE,  0, ncells*sizeof(cl_double),    &H_check[0],  NULL);
+
+   // Spatial coordinates must match exactly.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
+         printf("Error -- mismatch in spatial coordinates for cell %d is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+
+   // The state variable is allowed to differ by up to CONSERVATION_EPS.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
+         printf("Error -- mismatch in H for cell %d is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+}
+
+ // Debug check of device coordinate and state data against the host copies
+ // (single-precision state).
+ //   dev_x, dev_dx, dev_y, dev_dy -- device buffers read as ncells cl_spatial_t values
+ //   dev_H                        -- device buffer read as ncells cl_float values
+ //   H                            -- host state array compared against dev_H
+ // The coordinate comparison is exact; H is compared with the tolerance
+ // CONSERVATION_EPS. On the first mismatch an error line is printed and the
+ // process terminates with EXIT_FAILURE so that a failed check is visible in
+ // the exit status.
+ void Mesh::compare_coordinates_gpu_global_to_cpu_global_float(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, float *H)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   vector<spatial_t>x_check(ncells);
+   vector<spatial_t>dx_check(ncells);
+   vector<spatial_t>y_check(ncells);
+   vector<spatial_t>dy_check(ncells);
+   vector<float>H_check(ncells);
+
+   // Only the last read is blocking (CL_TRUE).
+   ezcl_enqueue_read_buffer(command_queue, dev_x,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &x_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dx, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dx_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_y,  CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &y_check[0],  NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_dy, CL_FALSE, 0, ncells*sizeof(cl_spatial_t), &dy_check[0], NULL);
+   ezcl_enqueue_read_buffer(command_queue, dev_H,  CL_TRUE,  0, ncells*sizeof(cl_float),     &H_check[0],  NULL);
+
+   // Spatial coordinates must match exactly.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (x[ic] != x_check[ic] || dx[ic] != dx_check[ic] || y[ic] != y_check[ic] || dy[ic] != dy_check[ic] ) {
+         printf("Error -- mismatch in spatial coordinates for cell %d is gpu %lf %lf %lf %lf cpu %lf %lf %lf %lf\n",ic,x_check[ic],dx_check[ic],y_check[ic],dy_check[ic],x[ic],dx[ic],y[ic],dy[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+
+   // The state variable is allowed to differ by up to CONSERVATION_EPS.
+   for (uint ic = 0; ic < ncells; ic++){
+      if (fabs(H[ic] - H_check[ic]) > CONSERVATION_EPS) {
+         printf("Error -- mismatch in H for cell %d is gpu %lf cpu %lf\n",ic,H_check[ic],H[ic]);
+         exit(EXIT_FAILURE);
+      }
+   }
+}
+#endif
+
+ // Debug helper (double-precision state): in MPI builds, gathers the local
+ // x, dx, y, dy and H arrays from all ranks and prints a line for every global
+ // entry that differs from the supplied *_global array by more than STATE_EPS.
+ //   ncells_global        -- number of entries compared
+ //   nsizes/ndispl        -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   x, dx, y, dy, H      -- local arrays to gather
+ //   *_global             -- reference arrays
+ //   cycle                -- cycle number, used only in the printed messages
+ // Without HAVE_MPI nothing is gathered, so the reference arrays are compared
+ // against the zero-filled vectors.
+ void Mesh::compare_coordinates_cpu_local_to_cpu_global_double(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, double *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, double *H_global, int cycle)
+{
+   // Receive buffers for the gathered local data.
+   vector<spatial_t> x_all(ncells_global);
+   vector<spatial_t> dx_all(ncells_global);
+   vector<spatial_t> y_all(ncells_global);
+   vector<spatial_t> dy_all(ncells_global);
+   vector<double>    H_all(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(x,  my_count, MPI_SPATIAL_T, &x_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(dx, my_count, MPI_SPATIAL_T, &dx_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(y,  my_count, MPI_SPATIAL_T, &y_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(dy, my_count, MPI_SPATIAL_T, &dy_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(H,  my_count, MPI_DOUBLE,    &H_all[0],  nsizes, ndispl, MPI_DOUBLE,    MPI_COMM_WORLD);
+#else
+   // Never executed; only references the otherwise unused parameters.
+   if (1 == 2) {
+      printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
+             nsizes[0],ndispl[0],x,dx,y,dy,H);
+   }
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (fabs(x_global[cell] - x_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d x_global & x_check_global %d %lf %lf \n",cycle,cell,x_global[cell], x_all[cell]);
+      }
+      if (fabs(dx_global[cell] - dx_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n",cycle,cell,dx_global[cell],dx_all[cell]);
+      }
+      if (fabs(y_global[cell] - y_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d y_global & y_check_global %d %lf %lf \n",cycle,cell,y_global[cell], y_all[cell]);
+      }
+      if (fabs(dy_global[cell] - dy_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n",cycle,cell,dy_global[cell],dy_all[cell]);
+      }
+      if (fabs(H_global[cell] - H_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d H_global & H_check_global %d %lf %lf \n",cycle,cell,H_global[cell], H_all[cell]);
+      }
+   }
+}
+
+ // Debug helper (single-precision state): in MPI builds, gathers the local
+ // x, dx, y, dy and H arrays from all ranks and prints a line for every global
+ // entry that differs from the supplied *_global array by more than STATE_EPS.
+ //   ncells_global        -- number of entries compared
+ //   nsizes/ndispl        -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   x, dx, y, dy, H      -- local arrays to gather
+ //   *_global             -- reference arrays
+ //   cycle                -- cycle number, used only in the printed messages
+ // Without HAVE_MPI nothing is gathered, so the reference arrays are compared
+ // against the zero-filled vectors.
+ void Mesh::compare_coordinates_cpu_local_to_cpu_global_float(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, float *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, float *H_global, int cycle)
+{
+   // Receive buffers for the gathered local data.
+   vector<spatial_t> x_all(ncells_global);
+   vector<spatial_t> dx_all(ncells_global);
+   vector<spatial_t> y_all(ncells_global);
+   vector<spatial_t> dy_all(ncells_global);
+   vector<float>     H_all(ncells_global);
+
+#ifdef HAVE_MPI
+   const int my_count = nsizes[mype];
+   MPI_Allgatherv(x,  my_count, MPI_SPATIAL_T, &x_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(dx, my_count, MPI_SPATIAL_T, &dx_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(y,  my_count, MPI_SPATIAL_T, &y_all[0],  nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(dy, my_count, MPI_SPATIAL_T, &dy_all[0], nsizes, ndispl, MPI_SPATIAL_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(H,  my_count, MPI_FLOAT,     &H_all[0],  nsizes, ndispl, MPI_FLOAT,     MPI_COMM_WORLD);
+#else
+   // Never executed; only references the otherwise unused parameters.
+   if (1 == 2) {
+      printf("DEBUG -- nsizes[0] %d ndispl[0] %d x %p dx %p y %p dy %p H %p\n",
+             nsizes[0],ndispl[0],x,dx,y,dy,H);
+   }
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (fabs(x_global[cell] - x_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d x_global & x_check_global %d %lf %lf \n",cycle,cell,x_global[cell], x_all[cell]);
+      }
+      if (fabs(dx_global[cell] - dx_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dx_global & dx_check_global %d %lf %lf \n",cycle,cell,dx_global[cell],dx_all[cell]);
+      }
+      if (fabs(y_global[cell] - y_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d y_global & y_check_global %d %lf %lf \n",cycle,cell,y_global[cell], y_all[cell]);
+      }
+      if (fabs(dy_global[cell] - dy_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d dy_global & dy_check_global %d %lf %lf \n",cycle,cell,dy_global[cell],dy_all[cell]);
+      }
+      if (fabs(H_global[cell] - H_all[cell]) > STATE_EPS) {
+         printf("DEBUG graphics at cycle %d H_global & H_check_global %d %lf %lf \n",cycle,cell,H_global[cell], H_all[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+ // Debug helper: reads ncells cl_int values from dev_mpot with a blocking read
+ // and prints one line for every cell whose device value differs from the host
+ // mpot array.
+ //   mpot     -- host array used as the reference
+ //   dev_mpot -- device buffer that is read back
+ void Mesh::compare_mpot_gpu_global_to_cpu_global(int *mpot, cl_mem dev_mpot)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   vector<int> gpu_mpot(ncells);
+   ezcl_enqueue_read_buffer(queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &gpu_mpot[0], NULL);
+
+   for (uint cell = 0; cell < ncells; cell++) {
+      if (mpot[cell] == gpu_mpot[cell]) continue;
+      printf("DEBUG -- mpot: ic %d mpot %d mpot_check %d\n",cell, mpot[cell], gpu_mpot[cell]);
+   }
+}
+#endif
+
+ // Debug helper: in MPI builds, gathers the local mpot array from all ranks and,
+ // on rank 0 only, prints one line for every global cell whose gathered value
+ // differs from mpot_global.
+ //   ncells_global -- number of entries compared
+ //   nsizes/ndispl -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   mpot          -- local array to gather
+ //   mpot_global   -- reference array
+ //   cycle         -- cycle number, used only in the printed message
+ // Without HAVE_MPI nothing is gathered, so mpot_global is compared against the
+ // zero-filled vector.
+ void Mesh::compare_mpot_cpu_local_to_cpu_global(uint ncells_global, int *nsizes, int *ndispl, int *mpot, int *mpot_global, int cycle)
+{
+   vector<int>mpot_save_global(ncells_global);
+#ifdef HAVE_MPI
+   // The send count is this rank's entry of the receive-count table, as in the
+   // other gather-based checks in this file, so the two counts always agree.
+   MPI_Allgatherv(&mpot[0], nsizes[mype], MPI_INT, &mpot_save_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+#else
+   // Just to get rid of compiler warnings
+   if (1 == 2) printf("DEBUG -- nsizes[0] %d ndispl[0] %d mpot %p\n",
+                      nsizes[0],ndispl[0],mpot);
+#endif
+   for (uint ic = 0; ic < ncells_global; ic++){
+      if (mpot_global[ic] != mpot_save_global[ic]) {
+         if (mype == 0) printf("%d: DEBUG refine_potential 3 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",mype,cycle,ic,mpot_global[ic],mpot_save_global[ic]);
+      }
+   }
+
+}
+
+#ifdef HAVE_OPENCL
+ // Debug helper (does real work only when built with HAVE_MPI): cross-checks
+ // the mpot data four ways and prints a line per mismatch:
+ //   refine_potential 1 -- dev_mpot vs. mpot (printed on every rank)
+ //   refine_potential 2 -- dev_mpot gathered over all ranks vs. mpot_global (rank 0 only)
+ //   refine_potential 3 -- mpot gathered over all ranks vs. mpot_global (rank 0 only)
+ //   refine_potential 4 -- dev_mpot_global vs. mpot_global (rank 0 only)
+ // Parameters:
+ //   mpot, mpot_global         -- host arrays (local and global)
+ //   dev_mpot, dev_mpot_global -- device buffers (local and global)
+ //   ncells_global             -- number of entries in the global comparisons
+ //   nsizes/ndispl             -- per-rank counts and displacements handed to MPI_Allgatherv
+ //   ncycle                    -- cycle number, used only in the printed messages
+ void Mesh::compare_mpot_all_to_gpu_local(int *mpot, int *mpot_global, cl_mem dev_mpot, cl_mem dev_mpot_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle)
+{
+#ifdef HAVE_MPI
+   cl_command_queue queue = ezcl_get_command_queue();
+   const bool report_global = (mype == 0);
+
+   // 1: local device buffer against the local host array.
+   vector<int> gpu_mpot(ncells);
+   ezcl_enqueue_read_buffer(queue, dev_mpot, CL_TRUE, 0, ncells*sizeof(cl_int), &gpu_mpot[0], NULL);
+   for (uint cell = 0; cell < ncells; cell++){
+      if (mpot[cell] == gpu_mpot[cell]) continue;
+      printf("%d: DEBUG refine_potential 1 at cycle %d cell %d mpot & mpot_save %d %d \n",mype,ncycle,cell,mpot[cell],gpu_mpot[cell]);
+   }
+
+   // 2: gathered device data against the global host array.
+   vector<int> all_mpot(ncells_global);
+   MPI_Allgatherv(&gpu_mpot[0], nsizes[mype], MPI_INT, &all_mpot[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (report_global && mpot_global[cell] != all_mpot[cell]) {
+         printf("%d: DEBUG refine_potential 2 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],all_mpot[cell]);
+      }
+   }
+
+   // 3: gathered host data against the global host array.
+   MPI_Allgatherv(mpot, nsizes[mype], MPI_INT, &all_mpot[0], nsizes, ndispl, MPI_INT, MPI_COMM_WORLD);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (report_global && mpot_global[cell] != all_mpot[cell]) {
+         printf("%d: DEBUG refine_potential 3 at cycle %d cell %d mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],all_mpot[cell]);
+      }
+   }
+
+   // 4: global device buffer against the global host array.
+   ezcl_enqueue_read_buffer(queue, dev_mpot_global, CL_TRUE, 0, ncells_global*sizeof(cl_int), &all_mpot[0], NULL);
+   for (uint cell = 0; cell < ncells_global; cell++){
+      if (report_global && mpot_global[cell] != all_mpot[cell]) {
+         printf("%d: DEBUG refine_potential 4 at cycle %d cell %u mpot_global & mpot_save_global %d %d \n",mype,ncycle,cell,mpot_global[cell],all_mpot[cell]);
+      }
+   }
+#else
+   // Never executed; only references the otherwise unused parameters.
+   if (1 == 2) {
+      printf("DEBUG -- mpot %p mpot_global %p dev_mpot %p dev_mpot_global %p ncells_global %d nsizes[0] %d ndispl[0] %d ncycle %d\n",
+             mpot,mpot_global,dev_mpot,dev_mpot_global,ncells_global,nsizes[0],ndispl[0],ncycle);
+   }
+#endif
+}
+
+ // Debug helper: reads the per-block offsets from dev_ioffset and compares them
+ // with a running total recomputed on the host from mpot, celltype, i and j.
+ //   old_ncells -- number of cells walked, in tiles of TILE_SIZE
+ //   mpot       -- per-cell flag: negative, zero or positive (see counting below)
+ // For every tile the expected offset is the sum of the counts of all earlier
+ // tiles; a line is printed when it differs from the device value.
+ // NOTE(review): ioffset_check is sized from ncells while the loop runs over
+ // tiles of old_ncells -- presumably the two agree at the call site; confirm.
+ void Mesh::compare_ioffset_gpu_global_to_cpu_global(uint old_ncells, int *mpot)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t local_work_size  = MIN(ncells, TILE_SIZE);
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   //size_t block_size = (ncells + TILE_SIZE - 1) / TILE_SIZE; // For on-device global reduction kernel.
+   size_t block_size = global_work_size/local_work_size;
+
+   vector<int> ioffset_check(block_size);
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset_check[0], NULL);
+
+   int mcount, mtotal;
+   mtotal = 0;
+   for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
+      // Count the cells this tile contributes.
+      mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells) break;
+
+         if (mpot[ic] < 0) {
+            if (celltype[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i[ic],j[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
+            }
+         }
+         if (mpot[ic] >= 0) {
+            // A nonzero mpot adds 4 for a REAL_CELL and 2 otherwise; zero adds 1.
+            if (celltype[ic] == REAL_CELL){
+               mcount += mpot[ic] ? 4 : 1;
+            } else {
+               mcount += mpot[ic] ? 2 : 1;
+            }
+         }
+      }
+      // The value printed is the running total, so it is labelled mtotal
+      // (matching the message in compare_ioffset_all_to_gpu_local).
+      if (mtotal != ioffset_check[ig]) printf("DEBUG ig %d ioffset %d mtotal %d\n",ig,ioffset_check[ig],mtotal);
+      mtotal += mcount;
+   }
+}
+
+ // Debug helper: reads the local and the global per-block offsets from the
+ // device and compares each with a running total recomputed on the host.
+ //   old_ncells, old_ncells_global -- number of cells walked, in tiles of TILE_SIZE
+ //   block_size, block_size_global -- number of cl_int entries read from each buffer
+ //   mpot, mpot_global             -- per-cell flags used for the host-side count
+ //   dev_ioffset, dev_ioffset_global -- device buffers that are read back
+ //   ioffset, ioffset_global       -- host arrays that receive the device data
+ //   celltype_global, i_global, j_global -- global mesh arrays for the second pass
+ // The first pass uses the member arrays celltype, i and j. In the second pass
+ // the process terminates with EXIT_FAILURE once more than 10 mismatches have
+ // been printed, so that the failed check is visible in the exit status.
+ void Mesh::compare_ioffset_all_to_gpu_local(uint old_ncells, uint old_ncells_global, int block_size, int block_size_global, int *mpot, int *mpot_global, cl_mem dev_ioffset, cl_mem dev_ioffset_global, int *ioffset, int *ioffset_global, int *celltype_global, int *i_global, int *j_global)
+{
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // This compares ioffset for each block in the calculation
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, block_size*sizeof(cl_int), &ioffset[0], NULL);
+   int mtotal = 0;
+   for (uint ig=0; ig<(old_ncells+TILE_SIZE-1)/TILE_SIZE; ig++){
+      int mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells) break;
+
+         if (mpot[ic] < 0) {
+            if (celltype[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i[ic],j[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]) ) mcount++;
+            }
+         }
+         if (mpot[ic] >= 0) {
+            // A nonzero mpot adds 4 for a REAL_CELL and 2 otherwise; zero adds 1.
+            if (celltype[ic] == REAL_CELL){
+               mcount += mpot[ic] ? 4 : 1;
+            } else {
+               mcount += mpot[ic] ? 2 : 1;
+            }
+         }
+      }
+      if (mtotal != ioffset[ig]) printf("%d: DEBUG ig %d ioffset %d mtotal %d\n",mype,ig,ioffset[ig],mtotal);
+      mtotal += mcount;
+   }
+
+   // For global This compares ioffset for each block in the calculation
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset_global, CL_TRUE, 0, block_size_global*sizeof(cl_int), &ioffset_global[0], NULL);
+   mtotal = 0;
+   int count = 0;   // number of global mismatches printed so far
+   for (uint ig=0; ig<(old_ncells_global+TILE_SIZE-1)/TILE_SIZE; ig++){
+      int mcount = 0;
+      for (uint ic=ig*TILE_SIZE; ic<(ig+1)*TILE_SIZE; ic++){
+         if (ic >= old_ncells_global) break;
+
+         if (mpot_global[ic] < 0) {
+            if (celltype_global[ic] == REAL_CELL) {
+               // remove all but cell that will remain to get count right when split
+               // across processors
+               if (is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
+            } else {
+               // either upper right or lower left will remain for boundary cells
+               if (is_upper_right(i_global[ic],j_global[ic]) || is_lower_left(i_global[ic],j_global[ic]) ) mcount++;
+            }
+         }
+
+         if (mpot_global[ic] >= 0) {
+            if (celltype_global[ic] == REAL_CELL) {
+               mcount += mpot_global[ic] ? 4 : 1;
+            } else {
+               mcount += mpot_global[ic] ? 2 : 1;
+            }
+         }
+      }
+      if (mtotal != ioffset_global[ig]) {
+         printf("DEBUG global ig %d ioffset %d mtotal %d\n",ig,ioffset_global[ig],mtotal);
+         count++;
+      }
+      // Give up after more than 10 mismatches and report failure.
+      if (count > 10) exit(EXIT_FAILURE);
+      mtotal += mcount;
+   }
+}
+#endif
+
+ // Constructs the mesh bookkeeping for an nx by ny coarse grid.
+ //   nx, ny               -- coarse grid dimensions
+ //   levmx_in             -- stored in levmx; the per-level tables get levmx+1 entries
+ //   ndim_in              -- stored in ndim
+ //   deltax_in, deltay_in -- coarse cell sizes, stored in deltax/deltay
+ //   boundary             -- stored in have_boundary; when nonzero the physical
+ //                           extent is computed from nx+2 by ny+2 cells
+ //   parallel_in          -- stored in parallel; in MPI builds enables the
+ //                           rank/size query and the mesh_memory.pinit call
+ //   do_gpu_calc          -- when nonzero (and HAVE_OPENCL is defined) the level
+ //                           tables are also allocated on the device
+ // The constructor zeroes timers and counters, fills the per-level index and
+ // spacing tables, records the corner index lists and sets the neighbor and
+ // celltype pointers to NULL.
+ Mesh::Mesh(int nx, int ny, int levmx_in, int ndim_in, double deltax_in, double deltay_in, int boundary, int parallel_in, int do_gpu_calc)
+{
+   lowerBound_Global = NULL;
+   upperBound_Global = NULL;
+
+   // Reset all timers and counters.
+   for (int it = 0; it < MESH_TIMER_SIZE; it++){
+      cpu_timers[it] = 0.0;
+      gpu_timers[it] = 0L;
+   }
+   for (int ict = 0; ict < MESH_COUNTER_SIZE; ict++){
+      cpu_counters[ict] = 0;
+      gpu_counters[ict] = 0;
+   }
+
+   ndim  = ndim_in;
+   levmx = levmx_in;
+#ifdef HAVE_OPENCL
+   if (ndim == TWO_DIMENSIONAL) {
+      defines = "-DTWO_DIMENSIONAL -DCARTESIAN";
+   }
+#endif
+
+   offtile_ratio_local = 0;
+   offtile_local_count = 1;
+
+   // Serial defaults; overwritten below when MPI is up and parallel is set.
+   mype         = 0;
+   numpe        = 1;
+   ncells       = 0;
+   ncells_ghost = 0;
+   parallel     = parallel_in;
+   noffset      = 0;
+   mem_factor   = 1.0;
+
+#ifdef HAVE_MPI
+   int mpi_is_up;
+   MPI_Initialized(&mpi_is_up);
+   if (mpi_is_up && parallel){
+      MPI_Comm_rank(MPI_COMM_WORLD,&mype);
+      MPI_Comm_size(MPI_COMM_WORLD,&numpe);
+   }
+   // TODO add fini
+   // NOTE(review): 2L * 1024 * 1024 * 1024 does not fit in a 32-bit long --
+   // confirm that all target platforms have a 64-bit long.
+   if (parallel) {
+      mesh_memory.pinit(MPI_COMM_WORLD, 2L * 1024 * 1024 * 1024);
+   }
+#endif
+   cell_handle = 0;
+
+   if (numpe == 1) {
+      mem_factor = 1.0;
+   }
+
+   deltax = deltax_in;
+   deltay = deltay_in;
+
+   have_boundary = boundary;
+
+   // The index bounds are the same with or without a boundary; only the cell
+   // counts used for the physical extent differ.
+   imin = 0;
+   jmin = 0;
+   imax = nx + 1;
+   jmax = ny + 1;
+   const int ncols = have_boundary ? nx + 2 : nx;
+   const int nrows = have_boundary ? ny + 2 : ny;
+
+   // Physical extent, symmetric about the origin.
+   xmin = -deltax * 0.5 * (real_t)ncols;
+   ymin = -deltay * 0.5 * (real_t)nrows;
+   xmax =  deltax * 0.5 * (real_t)ncols;
+   ymax =  deltay * 0.5 * (real_t)nrows;
+
+   // One table entry per level, 0..levmx. Kept as a non-const size_t because
+   // its address is passed to ezcl_malloc below.
+   size_t nlevels = levmx + 1;
+
+   levtable.resize(nlevels);
+   lev_ibegin.resize(nlevels);
+   lev_jbegin.resize(nlevels);
+   lev_iend.resize(nlevels);
+   lev_jend.resize(nlevels);
+   lev_deltax.resize(nlevels);
+   lev_deltay.resize(nlevels);
+
+   // Level 0 values.
+   lev_ibegin[0] = imin + 1;
+   lev_iend[0]   = imax - 1;
+   lev_jbegin[0] = jmin + 1;
+   lev_jend[0]   = jmax - 1;
+   lev_deltax[0] = deltax;
+   lev_deltay[0] = deltay;
+
+   // Each finer level doubles the index range and halves the cell size.
+   for (int lev = 1; lev <= levmx; lev++) {
+      const int coarser = lev - 1;
+      lev_ibegin[lev] = lev_ibegin[coarser]*2;
+      lev_iend[lev]   = lev_iend  [coarser]*2 + 1;
+      lev_jbegin[lev] = lev_jbegin[coarser]*2;
+      lev_jend[lev]   = lev_jend  [coarser]*2 + 1;
+      lev_deltax[lev] = lev_deltax[coarser]*0.5;
+      lev_deltay[lev] = lev_deltay[coarser]*0.5;
+   }
+   for (uint lev = 0; lev < nlevels; lev++){
+      levtable[lev] = IPOW2(lev);
+   }
+
+   if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+      // The copy host ptr flag will have the data copied to the GPU as part of the allocation
+      const int copy_flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+      dev_levtable = ezcl_malloc(&levtable[0],   const_cast<char *>("dev_levtable"), &nlevels, sizeof(cl_int),    copy_flags, 0);
+      dev_levdx    = ezcl_malloc(&lev_deltax[0], const_cast<char *>("dev_levdx"),    &nlevels, sizeof(cl_real_t), copy_flags, 0);
+      dev_levdy    = ezcl_malloc(&lev_deltay[0], const_cast<char *>("dev_levdy"),    &nlevels, sizeof(cl_real_t), copy_flags, 0);
+      dev_levibeg  = ezcl_malloc(&lev_ibegin[0], const_cast<char *>("dev_levibeg"),  &nlevels, sizeof(cl_int),    copy_flags, 0);
+      dev_leviend  = ezcl_malloc(&lev_iend[0],   const_cast<char *>("dev_leviend"),  &nlevels, sizeof(cl_int),    copy_flags, 0);
+      dev_levjbeg  = ezcl_malloc(&lev_jbegin[0], const_cast<char *>("dev_levjbeg"),  &nlevels, sizeof(cl_int),    copy_flags, 0);
+      dev_levjend  = ezcl_malloc(&lev_jend[0],   const_cast<char *>("dev_levjend"),  &nlevels, sizeof(cl_int),    copy_flags, 0);
+#endif
+   }
+
+   ibase = 0;
+
+   // Record every (ii,jj) index pair inside the four blocks whose coarse
+   // indices are (0,0), (0,jmax), (imax,0) and (imax,jmax), each scaled by
+   // IPOW2(levmx).
+   const int ncorners = 4;
+   int corner_i[] = {   0,   0,imax,imax};
+   int corner_j[] = {   0,jmax,   0,jmax};
+
+   for (int ic = 0; ic < ncorners; ic++){
+      const int jj_first = corner_j[ic]*IPOW2(levmx);
+      const int jj_limit = (corner_j[ic]+1)*IPOW2(levmx);
+      const int ii_first = corner_i[ic]*IPOW2(levmx);
+      const int ii_limit = (corner_i[ic]+1)*IPOW2(levmx);
+      for (int jj = jj_first; jj < jj_limit; jj++) {
+         for (int ii = ii_first; ii < ii_limit; ii++) {
+            corners_i.push_back(ii);
+            corners_j.push_back(jj);
+         }
+      }
+   }
+
+   do_rezone     = true;
+   gpu_do_rezone = true;
+
+   // No cell or neighbor arrays are attached yet.
+   celltype = NULL;
+   nlft     = NULL;
+   nrht     = NULL;
+   nbot     = NULL;
+   ntop     = NULL;
+}
+
+void Mesh::init(int nx, int ny, real_t circ_radius, partition_method initial_order, int do_gpu_calc)
+{
+ if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+ cl_context context = ezcl_get_context();
+
+ hash_lib_init();
+ if (mype == 0) printf("Starting compile of kernels in mesh\n");
+ char *bothsources = (char *)malloc(strlen(mesh_kern_source)+strlen(get_hash_kernel_source_string())+1);
+ strcpy(bothsources, get_hash_kernel_source_string());
+ strcat(bothsources, mesh_kern_source);
+ strcat(bothsources, "\0");
+ const char *defines = NULL;
+ cl_program program = ezcl_create_program_wsource(context, defines, bothsources);
+ free(bothsources);
+
+ kernel_reduction_scan2 = ezcl_create_kernel_wprogram(program, "finish_reduction_scan2_cl");
+ kernel_reduction_count = ezcl_create_kernel_wprogram(program, "finish_reduction_count_cl");
+ kernel_reduction_count2 = ezcl_create_kernel_wprogram(program, "finish_reduction_count2_cl");
+ kernel_hash_adjust_sizes = ezcl_create_kernel_wprogram(program, "hash_adjust_sizes_cl");
+ kernel_hash_setup = ezcl_create_kernel_wprogram(program, "hash_setup_cl");
+ kernel_hash_setup_local = ezcl_create_kernel_wprogram(program, "hash_setup_local_cl");
+ kernel_neighbor_init = ezcl_create_kernel_wprogram(program, "neighbor_init_cl");
+ kernel_calc_neighbors = ezcl_create_kernel_wprogram(program, "calc_neighbors_cl");
+ kernel_calc_neighbors_local = ezcl_create_kernel_wprogram(program, "calc_neighbors_local_cl");
+ kernel_calc_border_cells = ezcl_create_kernel_wprogram(program, "calc_border_cells_cl");
+ kernel_calc_border_cells2 = ezcl_create_kernel_wprogram(program, "calc_border_cells2_cl");
+ kernel_finish_scan = ezcl_create_kernel_wprogram(program, "finish_scan_cl");
+ kernel_get_border_data = ezcl_create_kernel_wprogram(program, "get_border_data_cl");
+ kernel_calc_layer1 = ezcl_create_kernel_wprogram(program, "calc_layer1_cl");
+ kernel_calc_layer1_sethash = ezcl_create_kernel_wprogram(program, "calc_layer1_sethash_cl");
+ kernel_calc_layer2 = ezcl_create_kernel_wprogram(program, "calc_layer2_cl");
+ kernel_get_border_data2 = ezcl_create_kernel_wprogram(program, "get_border_data2_cl");
+ kernel_calc_layer2_sethash = ezcl_create_kernel_wprogram(program, "calc_layer2_sethash_cl");
+ kernel_copy_mesh_data = ezcl_create_kernel_wprogram(program, "copy_mesh_data_cl");
+ kernel_fill_mesh_ghost = ezcl_create_kernel_wprogram(program, "fill_mesh_ghost_cl");
+ kernel_fill_neighbor_ghost = ezcl_create_kernel_wprogram(program, "fill_neighbor_ghost_cl");
+ kernel_set_corner_neighbor = ezcl_create_kernel_wprogram(program, "set_corner_neighbor_cl");
+ kernel_adjust_neighbors_local = ezcl_create_kernel_wprogram(program, "adjust_neighbors_local_cl");
+ kernel_hash_size = ezcl_create_kernel_wprogram(program, "calc_hash_size_cl");
+ kernel_finish_hash_size = ezcl_create_kernel_wprogram(program, "finish_reduction_minmax4_cl");
+ kernel_calc_spatial_coordinates = ezcl_create_kernel_wprogram(program, "calc_spatial_coordinates_cl");
+ kernel_do_load_balance_lower = ezcl_create_kernel_wprogram(program, "do_load_balance_lower_cl");
+ kernel_do_load_balance_middle = ezcl_create_kernel_wprogram(program, "do_load_balance_middle_cl");
+ kernel_do_load_balance_upper = ezcl_create_kernel_wprogram(program, "do_load_balance_upper_cl");
+#ifndef MINIMUM_PRECISION
+ kernel_do_load_balance_double = ezcl_create_kernel_wprogram(program, "do_load_balance_double_cl");
+#endif
+ kernel_do_load_balance_float = ezcl_create_kernel_wprogram(program, "do_load_balance_float_cl");
+ kernel_refine_smooth = ezcl_create_kernel_wprogram(program, "refine_smooth_cl");
+ kernel_coarsen_smooth = ezcl_create_kernel_wprogram(program, "coarsen_smooth_cl");
+ kernel_coarsen_check_block = ezcl_create_kernel_wprogram(program, "coarsen_check_block_cl");
+ kernel_rezone_all = ezcl_create_kernel_wprogram(program, "rezone_all_cl");
+ kernel_rezone_neighbors = ezcl_create_kernel_wprogram(program, "rezone_neighbors_cl");
+#ifndef MINIMUM_PRECISION
+ kernel_rezone_one_double = ezcl_create_kernel_wprogram(program, "rezone_one_double_cl");
+#endif
+ kernel_rezone_one_float = ezcl_create_kernel_wprogram(program, "rezone_one_float_cl");
+ kernel_copy_mpot_ghost_data = ezcl_create_kernel_wprogram(program, "copy_mpot_ghost_data_cl");
+ kernel_set_boundary_refinement = ezcl_create_kernel_wprogram(program, "set_boundary_refinement");
+ init_kernel_2stage_sum();
+ init_kernel_2stage_sum_int();
+ if (! have_boundary){
+ kernel_count_BCs = ezcl_create_kernel_wprogram(program, "count_BCs_cl");
+ }
+
+ ezcl_program_release(program);
+ if (mype == 0) printf("Finishing compile of kernels in mesh\n");
+#endif
+ }
+
+ //KDTree_Initialize(&tree);
+ if (ncells > 0) { // this is a restart.
+ nsizes.resize (numpe);
+ ndispl.resize (numpe);
+ if (parallel && numpe > 1) {
+#ifdef HAVE_MPI
+ int ncells_int = ncells;
+ MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ noffset=ndispl[mype];
+ ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
+#endif
+ } else {
+ noffset = 0;
+ ncells_global = ncells;
+ proc.resize (ncells);
+ calc_distribution(numpe);
+ }
+ calc_celltype(ncells);
+
+ } else {
+ int istart = 1,
+ jstart = 1,
+ iend = nx,
+ jend = ny,
+ nxx = nx,
+ nyy = ny;
+ if (have_boundary) {
+ istart = 0;
+ jstart = 0;
+ iend = nx + 1;
+ jend = ny + 1;
+ nxx = nx + 2;
+ nyy = ny + 2;
+ }
+
+ if (ndim == TWO_DIMENSIONAL) ncells = nxx * nyy - have_boundary * 4;
+ else ncells = nxx * nyy;
+
+ noffset = 0;
+ if (parallel) {
+ ncells_global = ncells;
+ nsizes.resize(numpe);
+ ndispl.resize(numpe);
+
+ for (int ip=0; ip<numpe; ip++){
+ nsizes[ip] = ncells_global/numpe;
+ if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+ }
+
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ ncells= nsizes[mype];
+ noffset=ndispl[mype];
+ }
+
+ allocate(ncells);
+ index.resize(ncells);
+
+ int ic = 0;
+
+ for (int jj = jstart; jj <= jend; jj++) {
+ for (int ii = istart; ii <= iend; ii++) {
+ if (have_boundary && ii == 0 && jj == 0 ) continue;
+ if (have_boundary && ii == 0 && jj == jend) continue;
+ if (have_boundary && ii == iend && jj == 0 ) continue;
+ if (have_boundary && ii == iend && jj == jend) continue;
+
+ if (ic >= (int)noffset && ic < (int)(ncells+noffset)){
+ int iclocal = ic-noffset;
+ index[iclocal] = ic;
+ i[iclocal] = ii;
+ j[iclocal] = jj;
+ level[iclocal] = 0;
+ }
+ ic++;
+ }
+ }
+
+ //if (numpe > 1 && (initial_order != HILBERT_SORT && initial_order != HILBERT_PARTITION) ) mem_factor = 2.0;
+ partition_cells(numpe, index, initial_order);
+
+ calc_celltype(ncells);
+ calc_spatial_coordinates(0);
+
+ // Start lev loop here
+ for (int ilevel=1; ilevel<=levmx; ilevel++) {
+
+ //int old_ncells = ncells;
+
+ ncells_ghost = ncells;
+ calc_neighbors_local();
+
+ kdtree_setup();
+
+ int nez;
+ vector<int> ind(ncells);
+
+ #ifdef FULL_PRECISION
+ KDTree_QueryCircleIntersect_Double(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
+ #else
+ KDTree_QueryCircleIntersect_Float(&tree, &nez, &(ind[0]), circ_radius, ncells, &x[0], &dx[0], &y[0], &dy[0]);
+ #endif
+
+ vector<int> mpot(ncells_ghost,0);
+
+ for (int ic=0; ic<nez; ++ic){
+ if (level[ind[ic]] < levmx) mpot[ind[ic]] = 1;
+ }
+
+ KDTree_Destroy(&tree);
+ // Refine the cells.
+ int icount = 0;
+ int jcount = 0;
+ int new_ncells = refine_smooth(mpot, icount, jcount);
+
+ MallocPlus dummy;
+ rezone_all(icount, jcount, mpot, 0, dummy);
+
+ ncells = new_ncells;
+
+ calc_spatial_coordinates(0);
+
+ #ifdef HAVE_MPI
+ if (parallel && numpe > 1) {
+ int ncells_int = ncells;
+ MPI_Allgather(&ncells_int, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ noffset=ndispl[mype];
+ ncells_global = ndispl[numpe-1] + nsizes[numpe-1];
+ }
+ #endif
+ } // End lev loop here
+ index.clear();
+ ncells_ghost = ncells;
+ }
+ int ncells_corners = 4;
+ int i_corner[] = { 0, 0,imax,imax};
+ int j_corner[] = { 0,jmax, 0,jmax};
+
+ for(int ic=0; ic<ncells_corners; ic++){
+ for (int jj = j_corner[ic]*IPOW2(levmx); jj < (j_corner[ic]+1)*IPOW2(levmx); jj++) {
+ for (int ii = i_corner[ic]*IPOW2(levmx); ii < (i_corner[ic]+1)*IPOW2(levmx); ii++) {
+ corners_i.push_back(ii);
+ corners_j.push_back(jj);
+ }
+ }
+ }
+}
+
+size_t Mesh::refine_smooth(vector<int> &mpot, int &icount, int &jcount)
+{ // Smooths the per-cell rezone flags in mpot (>0 refine, <0 coarsen), updates icount/jcount and do_rezone, and returns ncells + icount + jcount.
+ vector<int> mpot_old;
+ // newcount is this rank's count for the current pass; newcount_global is its MPI-wide sum when parallel is set.
+ int newcount;
+ int newcount_global;
+
+ struct timeval tstart_lev2;
+ // Initial icount/jcount derived from the incoming flags.
+ rezone_count(mpot, icount, jcount);
+
+#ifdef _OPENMP
+#pragma omp parallel
+{ //START Parallel Region
+#endif
+ // The master thread seeds the counters and starts the timer; the barrier that follows makes them visible to the team.
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+ newcount = icount;
+ newcount_global = newcount;
+
+ if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+#ifdef HAVE_MPI
+ if (parallel) {
+ MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+ // Refinement propagation runs only if some rank flagged cells and more than one refinement level exists.
+ if(newcount_global > 0 && levmx > 1) {
+
+ size_t my_ncells=ncells;
+ if (parallel) my_ncells=ncells_ghost;
+
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+ cpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
+
+ mpot_old.resize(my_ncells);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+ int levcount = 1;
+ // Repeat until no rank flags new cells; levcount starts at 1, so at most levmx-1 passes are made.
+ while (newcount_global > 0 && levcount < levmx){
+
+ levcount++;
+#ifdef _OPENMP
+#pragma omp master
+{//MASTER START
+#endif
+ // The flags from the previous pass move into mpot_old; mpot is rewritten cell by cell in the loop below.
+ mpot.swap(mpot_old);
+ newcount=0;
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Update(&mpot_old[0], L7_INT, cell_handle);
+ }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+ // NOTE(review): get_bounds is called as (upperBound, lowerBound) here but as (lowerBounds, upperBounds) in calc_spatial_coordinates; the values are unused in this loop.
+ int upperBound, lowerBound;
+ get_bounds(upperBound, lowerBound);
+ int mynewcount = newcount; //All threads get a mynewcount
+ // NOTE(review): the loop body never modifies newcount, so the reduction clause contributes nothing; the per-thread tally is merged by the atomic add after the loop.
+#ifdef _OPENMP
+#pragma omp for reduction(+:newcount)
+#endif
+ for(uint ic = 0; ic < ncells; ic++) {
+ // for(uint ic = lowerBound; ic < upperBound; ic++){
+ int lev = level[ic];
+ mpot[ic] = mpot_old[ic];
+ if(mpot_old[ic] > 0) continue;  // already flagged for refinement
+ // Left neighbour: flag this cell if the neighbour, counting its pending refinement, would be more than one level finer.
+ int nl = nlft[ic];
+ if (nl >= 0 && nl < (int)ncells_ghost) {
+ int ll = level[nl];
+ if(mpot_old[nl] > 0) ll++;
+
+ if(ll - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+ // Left neighbour is already finer: also test the cell above it (presumably the second fine cell along this face -- confirm).
+ ll = level[nl];
+ if (ll > lev) {
+ int nlt = ntop[nl];
+ if (nlt >= 0 && nlt < (int)ncells_ghost) {
+ int llt = level[nlt];
+ if(mpot_old[nlt] > 0) llt++;
+
+ if(llt - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+ }
+ }
+ }
+ // Right neighbour: same test, then the cell above it when the neighbour is finer.
+ int nr = nrht[ic];
+ if (nr >= 0 && nr < (int)ncells_ghost) {
+ int lr = level[nr];
+ if(mpot_old[nr] > 0) lr++;
+
+ if(lr - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+
+ lr = level[nr];
+ if (lr > lev) {
+ int nrt = ntop[nr];
+ if (nrt >= 0 && nrt < (int)ncells_ghost) {
+ int lrt = level[nrt];
+ if(mpot_old[nrt] > 0) lrt++;
+
+ if(lrt - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+ }
+ }
+ }
+ // Top neighbour: same test, then the cell to its right when the neighbour is finer.
+ int nt = ntop[ic];
+ if (nt >= 0 && nt < (int)ncells_ghost) {
+ int lt = level[nt];
+ if(mpot_old[nt] > 0) lt++;
+
+ if(lt - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+
+ lt = level[nt];
+ if (lt > lev) {
+ int ntr = nrht[nt];
+ if (ntr >= 0 && ntr < (int)ncells_ghost) {
+ int ltr = level[ntr];
+ if(mpot_old[ntr] > 0) ltr++;
+
+ if(ltr - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+ }
+ }
+ }
+ // Bottom neighbour: same test, then the cell to its right when the neighbour is finer.
+ int nb = nbot[ic];
+ if (nb >= 0 && nb < (int)ncells_ghost) {
+ int lb = level[nb];
+ if(mpot_old[nb] > 0) lb++;
+
+ if(lb - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+
+ lb = level[nb];
+ if (lb > lev) {
+ int nbr = nrht[nb];
+ if (nbr >= 0 && nbr < (int)ncells_ghost) {
+ int lbr = level[nbr];
+ if(mpot_old[nbr] > 0) lbr++;
+
+ if(lbr - lev > 1) {
+ mpot[ic]=1;
+ mynewcount++;
+ continue;
+ }
+ }
+ }
+ }
+ }
+#ifdef _OPENMP
+#pragma omp atomic
+#endif
+ newcount += mynewcount;
+ // After the barrier newcount holds the sum over threads; the master folds it into icount and reduces it across ranks.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+{
+#endif
+ icount += newcount;
+ newcount_global = newcount;
+
+#ifdef HAVE_MPI
+ if (parallel) {
+ MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+
+ } // while (newcount_global > 0 && levcount < levmx);
+
+ }
+
+ // Coarsening checks start here: refresh the ghost flags, then move the current flags into mpot_old.
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Update(&mpot[0], L7_INT, cell_handle);
+ }
+#endif
+ // After clear/resize/swap, mpot_old holds the current flags and mpot is a zero-filled vector of ncells_ghost entries.
+ mpot_old.clear();
+ mpot_old.resize(ncells_ghost);
+
+ mpot_old.swap(mpot);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+ // Pass 1: for cells with a negative flag above -1000000, cancel the flag when one of the two neighbours selected by the cell's quadrant position is, or will become, finer. NOTE(review): the meaning of -1000000 and the validity of the neighbour indices are not checked here.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(uint ic=0; ic<ncells; ic++) {
+ mpot[ic] = mpot_old[ic];
+ if (mpot_old[ic] >= 0) continue;
+ if (mpot_old[ic] <= -1000000) continue;
+ if ( is_upper_right(i[ic],j[ic]) ) {
+ int nr = nrht[ic];
+ int lr = level[nr];
+ if (mpot_old[nr] > 0) lr++;
+ int nt = ntop[ic];
+ int lt = level[nt];
+ if (mpot_old[nt] > 0) lt++;
+ if (lr > level[ic] || lt > level[ic]) mpot[ic] = 0;
+ } else if ( is_upper_left(i[ic],j[ic] ) ) {
+ int nl = nlft[ic];
+ int ll = level[nl];
+ if (mpot_old[nl] > 0) ll++;
+ int nt = ntop[ic];
+ int lt = level[nt];
+ if (mpot_old[nt] > 0) lt++;
+ if (ll > level[ic] || lt > level[ic]) mpot[ic] = 0;
+ } else if ( is_lower_right(i[ic],j[ic] ) ) {
+ int nr = nrht[ic];
+ int lr = level[nr];
+ if (mpot_old[nr] > 0) lr++;
+ int nb = nbot[ic];
+ int lb = level[nb];
+ if (mpot_old[nb] > 0) lb++;
+ if (lr > level[ic] || lb > level[ic]) mpot[ic] = 0;
+ } else if ( is_lower_left(i[ic],j[ic] ) ) {
+ int nl = nlft[ic];
+ int ll = level[nl];
+ if (mpot_old[nl] > 0) ll++;
+ int nb = nbot[ic];
+ int lb = level[nb];
+ if (mpot_old[nb] > 0) lb++;
+ if (ll > level[ic] || lb > level[ic]) mpot[ic] = 0;
+ }
+ }
+ // Exchange the pass-1 result, then swap so that mpot_old holds it for pass 2.
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Update(&mpot[0], L7_INT, cell_handle);
+ }
+#endif
+
+ mpot_old.swap(mpot);
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+ // Pass 2: a negative flag survives only if the three other cells n1, n2, n3 (presumably the rest of the 2x2 block -- confirm) are all flagged -1 and sit at this cell's level.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(uint ic=0; ic<ncells; ic++) {
+ int n1=0, n2=0, n3=0;
+ mpot[ic] = mpot_old[ic];
+ if (mpot_old[ic] >= 0) continue;
+ if (mpot_old[ic] <= -1000000) continue;
+ if ( is_upper_right(i[ic],j[ic]) ) {
+ n1 = nbot[ic];
+ n2 = nlft[ic];
+ n3 = nlft[n1];
+ } else if ( is_upper_left(i[ic],j[ic] ) ) {
+ n1 = nbot[ic];
+ n2 = nrht[ic];
+ n3 = nrht[n1];
+ } else if ( is_lower_right(i[ic],j[ic] ) ) {
+ n1 = ntop[ic];
+ n2 = nlft[ic];
+ n3 = nlft[n1];
+ } else if ( is_lower_left(i[ic],j[ic] ) ) {
+ n1 = ntop[ic];
+ n2 = nrht[ic];
+ n3 = nrht[n1];
+ }
+ if (n3 < 0) {
+ mpot[ic] = 0;
+ } else {
+ int lev1 = level[n1];
+ int lev2 = level[n2];
+ int lev3 = level[n3];
+ if (mpot_old[n1] > 0) lev1++;
+ if (mpot_old[n2] > 0) lev2++;
+ if (mpot_old[n3] > 0) lev3++;
+
+ if (mpot_old[n1] != -1 || lev1 != level[ic] ||
+ mpot_old[n2] != -1 || lev2 != level[ic] ||
+ mpot_old[n3] != -1 || lev3 != level[ic]) {
+ mpot[ic] = 0;
+ }
+ }
+ }
+
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Update(&mpot[0], L7_INT, cell_handle);
+ }
+#endif
+
+#ifdef _OPENMP
+}//END MASTER
+#pragma omp barrier
+#endif
+ // Cells with a negative celltype that matches one of the four boundary constants copy the flag of the neighbour on the opposite side.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++) {
+ if (celltype[ic] < 0) {
+ switch (celltype[ic]) {
+ case LEFT_BOUNDARY:
+ mpot[ic] = mpot[nrht[ic]];
+ break;
+ case RIGHT_BOUNDARY:
+ mpot[ic] = mpot[nlft[ic]];
+ break;
+ case BOTTOM_BOUNDARY:
+ mpot[ic] = mpot[ntop[ic]];
+ break;
+ case TOP_BOUNDARY:
+ mpot[ic] = mpot[nbot[ic]];
+ break;
+ }
+ }
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+}//END Parallel Region
+#endif
+ // Recount with the final flags; rezone_count returns icount+jcount, so newcount becomes ncells plus the net change.
+ newcount = ncells + rezone_count(mpot, icount, jcount);
+
+#ifdef HAVE_MPI
+ int icount_global = icount;
+ int jcount_global = jcount;
+ if (parallel) {
+ int count[2], global_count[2];
+ count[0] = icount;
+ count[1] = jcount;
+ MPI_Allreduce(&count, &global_count, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ icount_global = global_count[0];
+ jcount_global = global_count[1];
+ }
+ do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
+#else
+ do_rezone = (icount != 0 || jcount != 0) ? true : false;
+#endif
+
+ // tstart_lev2 is only started when TIMING_LEVEL >= 2, matching the guard on the stop call.
+ if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_REFINE_SMOOTH] += cpu_timer_stop(tstart_lev2);
+
+ return(newcount);
+}
+
+#ifdef HAVE_OPENCL
+int Mesh::gpu_refine_smooth(cl_mem &dev_mpot, int &icount, int &jcount)
+{ // Runs the refine/coarsen smoothing and boundary-refinement kernels on dev_mpot, updates icount/jcount from device reductions, sets gpu_do_rezone and returns ncells+icount-jcount.
+ struct timeval tstart_lev2;
+ if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+ cl_command_queue command_queue = ezcl_get_command_queue();
+ // The global size is ncells rounded up to a whole number of 128-item work-groups; block_size is the number of work-groups.
+ size_t local_work_size = 128;
+ size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+ size_t block_size = global_work_size/local_work_size;
+
+ int icount_global = icount;
+ int jcount_global = jcount;
+
+#ifdef HAVE_MPI
+ if (parallel) {
+ int count[2], count_global[2];
+ count[0] = icount;
+ count[1] = jcount;
+ MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ icount_global = count_global[0];
+ jcount_global = count_global[1];
+ }
+#endif
+ // levcount starts at 1, so the refinement passes below run only when levmx > 1.
+ int levcount = 1;
+ //int which_smooth=0;
+
+ if(icount_global > 0 && levcount < levmx) {
+ size_t result_size = 1;
+ cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ // NOTE(review): newcount starts as icount, but after the first pass it holds a difference, so later passes compute icount minus the previous delta -- confirm this is intended.
+ int newcount = icount;
+ int newcount_global = icount_global;
+ while (newcount_global > 0 && levcount < levmx) {
+ levcount++;
+
+ gpu_counters[MESH_COUNTER_REFINE_SMOOTH]++;
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+ }
+#endif
+ // The swap makes the current flags available as dev_mpot_old (arg 9) while dev_mpot (arg 10) receives the kernel output -- presumably; the kernel source is not visible here.
+ if (icount_global) {
+ ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+ // NOTE(review): args 0 and 1 send sizeof(cl_int) bytes from &ncells / &ncells_ghost; if those members are size_t (ncells_ghost is passed to ezcl_malloc in the same position as the size_t block_size), only the first 4 bytes are sent -- confirm on big-endian targets.
+ ezcl_set_kernel_arg(kernel_refine_smooth, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 1, sizeof(cl_int), (void *)&ncells_ghost);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 2, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 3, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 4, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 5, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 6, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 7, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 8, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_refine_smooth, 9, sizeof(cl_mem), (void *)&dev_mpot_old);
+ ezcl_set_kernel_arg(kernel_refine_smooth,10, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_refine_smooth,11, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_refine_smooth,12, sizeof(cl_mem), (void *)&dev_result);
+ ezcl_set_kernel_arg(kernel_refine_smooth,13, local_work_size*sizeof(cl_int), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_refine_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
+ // gpu_rezone_count launches kernel_reduction_count on dev_redscratch and dev_result.
+ gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
+ // Blocking read (CL_TRUE) of the single cl_int in dev_result.
+ int result;
+ ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
+
+ //printf("result = %d after %d refine smooths\n",result,which_smooth);
+ //which_smooth++;
+
+ icount = result;
+ }
+
+ newcount = icount-newcount;
+ newcount_global = newcount;
+#ifdef HAVE_MPI
+ if (parallel) {
+ MPI_Allreduce(&newcount, &newcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ }
+#endif
+ icount_global += newcount_global;
+ //printf("DEBUG -- icount %d icount_global %d newcount %d newcount_global %d\n",icount,icount_global,newcount,newcount_global);
+ }
+ // Release the buffers allocated for the refinement passes.
+ ezcl_device_memory_delete(dev_mpot_old);
+ ezcl_device_memory_delete(dev_redscratch);
+ ezcl_device_memory_delete(dev_result);
+ }
+ // Coarsening stage: entered when the summed jcount is non-zero; the kernels inside run only when the local jcount is non-zero.
+ if (jcount_global) {
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+ }
+#endif
+ // Allocated regardless of the local jcount and released at the end of this branch.
+ cl_mem dev_mpot_old = ezcl_malloc(NULL, const_cast<char *>("dev_mpot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ if (jcount) {
+ ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+ // First coarsening kernel: kernel_coarsen_smooth, with dev_mpot_old as arg 8 and dev_mpot as arg 9.
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 1, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 2, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 3, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 4, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 5, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 6, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 7, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 8, sizeof(cl_mem), (void *)&dev_mpot_old);
+ ezcl_set_kernel_arg(kernel_coarsen_smooth, 9, sizeof(cl_mem), (void *)&dev_mpot);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_smooth, 1, NULL, &global_work_size, &local_work_size, NULL);
+ }
+ // Ghost exchange between the two coarsening kernels (multi-rank runs only).
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+ }
+#endif
+
+ if (jcount) {
+ size_t result_size = 1;
+ cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_device_memory_swap(&dev_mpot_old, &dev_mpot);
+ // Second coarsening kernel: kernel_coarsen_check_block, which also takes the reduction buffers used to obtain the new jcount.
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 1, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 2, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 3, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 4, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 5, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 6, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 7, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 8, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block, 9, sizeof(cl_mem), (void *)&dev_mpot_old);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block,10, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block,11, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block,12, sizeof(cl_mem), (void *)&dev_result);
+ ezcl_set_kernel_arg(kernel_coarsen_check_block,13, local_work_size*sizeof(cl_int), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_coarsen_check_block, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ gpu_rezone_count(block_size, local_work_size, dev_redscratch, dev_result);
+ // Blocking read of the single cl_int result, which becomes the local jcount.
+ int result;
+ ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &result, NULL);
+
+ //printf("result = %d after coarsen smooth\n",result);
+
+ jcount = result;
+
+ ezcl_device_memory_delete(dev_redscratch);
+ ezcl_device_memory_delete(dev_result);
+ }
+
+ jcount_global = jcount;
+
+#ifdef HAVE_MPI
+ if (parallel) {
+ MPI_Allreduce(&jcount, &jcount_global, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ }
+#endif
+
+ ezcl_device_memory_delete(dev_mpot_old);
+ }
+ // Final stage: if any rank still has work, run kernel_set_boundary_refinement and reduce both counts; otherwise release dev_mpot.
+ if (icount_global || jcount_global) {
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ L7_Dev_Update(dev_mpot, L7_INT, cell_handle);
+ }
+#endif
+ // The reduction buffers use cl_int2 elements here because two counts are read back together (my_result[0] and my_result[1]).
+ size_t result_size = 1;
+ cl_mem dev_result = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+ dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+ // NOTE(review): dev_ioffset is not declared in this function and is not released here; presumably a later rezone step consumes and frees it -- confirm.
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 1, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 2, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 3, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 4, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 5, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 6, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 7, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 8, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 9, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 10, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 11, sizeof(cl_mem), (void *)&dev_result);
+ ezcl_set_kernel_arg(kernel_set_boundary_refinement, 12, local_work_size*sizeof(cl_int2), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_set_boundary_refinement, 1, NULL, &global_work_size, &local_work_size, NULL);
+ // gpu_rezone_count2 launches kernel_reduction_count2 on the cl_int2 buffers.
+ gpu_rezone_count2(block_size, local_work_size, dev_redscratch, dev_result);
+ // Blocking read of one cl_int2: element 0 becomes icount, element 1 becomes jcount.
+ int my_result[2];
+ ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, 1*sizeof(cl_int2), &my_result, NULL);
+ //printf("Result is %lu icount %d jcount %d\n", ncells+my_result[0]-my_result[1],my_result[0],my_result[1]);
+ icount = my_result[0];
+ jcount = my_result[1];
+
+ icount_global = icount;
+ jcount_global = jcount;
+#ifdef HAVE_MPI
+ if (parallel) {
+ int count[2], count_global[2];
+ count[0] = icount;
+ count[1] = jcount;
+ MPI_Allreduce(&count, &count_global, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ icount_global = count_global[0];
+ jcount_global = count_global[1];
+ }
+#endif
+ // gpu_rezone_scan launches kernel_reduction_scan2 on dev_ioffset and dev_result.
+ gpu_rezone_scan(block_size, local_work_size, dev_ioffset, dev_result);
+
+ //ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int), &my_result, NULL);
+ //printf("After scan, Result is %d\n", my_result[0]);
+
+ ezcl_device_memory_delete(dev_result);
+ ezcl_device_memory_delete(dev_redscratch);
+
+ } else {
+ ezcl_device_memory_delete(dev_mpot);
+ dev_mpot = NULL;  // dev_mpot is a reference, so the caller's handle is cleared as well
+ }
+
+ gpu_do_rezone = (icount_global != 0 || jcount_global != 0) ? true : false;
+ // cpu_timer_stop's result is scaled by 1.0e9 and truncated to long before being added to gpu_timers.
+ if (TIMING_LEVEL >= 2) gpu_timers[MESH_TIMER_REFINE_SMOOTH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+
+ return ncells+icount-jcount;
+}
+#endif
+
+void Mesh::terminate(void)
+{ // Releases the host mesh arrays and, when built with OpenCL, the device buffers and kernel objects; finalizes mesh_memory under J7+MPI.
+ mesh_memory.memory_delete(i);
+ mesh_memory.memory_delete(j);
+ mesh_memory.memory_delete(level);
+ mesh_memory.memory_delete(celltype);
+ if (neighbor_remap) {  // the neighbor arrays are released only when neighbor_remap is set
+ mesh_memory.memory_delete(nlft);
+ mesh_memory.memory_delete(nrht);
+ mesh_memory.memory_delete(nbot);
+ mesh_memory.memory_delete(ntop);
+ }
+ // OpenCL teardown: hash library first, then the per-level tables, the per-cell device arrays, and finally the kernels.
+#ifdef HAVE_OPENCL
+ hash_lib_terminate();
+ // Per-level lookup tables on the device.
+ ezcl_device_memory_delete(dev_levtable);
+ ezcl_device_memory_delete(dev_levdx);
+ ezcl_device_memory_delete(dev_levdy);
+ ezcl_device_memory_delete(dev_levibeg);
+ ezcl_device_memory_delete(dev_leviend);
+ ezcl_device_memory_delete(dev_levjbeg);
+ ezcl_device_memory_delete(dev_levjend);
+ // Per-cell device arrays.
+ ezcl_device_memory_delete(dev_level);
+ ezcl_device_memory_delete(dev_i);
+ ezcl_device_memory_delete(dev_j);
+ ezcl_device_memory_delete(dev_celltype);
+ if (neighbor_remap && dev_nlft != NULL){  // only dev_nlft is NULL-checked; the other three are presumably allocated together with it -- confirm
+ ezcl_device_memory_delete(dev_nlft);
+ ezcl_device_memory_delete(dev_nrht);
+ ezcl_device_memory_delete(dev_nbot);
+ ezcl_device_memory_delete(dev_ntop);
+ }
+ // The release list below follows the same order as the kernel creation list earlier in this file, including the MINIMUM_PRECISION and have_boundary conditions.
+ ezcl_kernel_release(kernel_reduction_scan2);
+ ezcl_kernel_release(kernel_reduction_count);
+ ezcl_kernel_release(kernel_reduction_count2);
+ ezcl_kernel_release(kernel_hash_adjust_sizes);
+ ezcl_kernel_release(kernel_hash_setup);
+ ezcl_kernel_release(kernel_hash_setup_local);
+ ezcl_kernel_release(kernel_neighbor_init);
+ ezcl_kernel_release(kernel_calc_neighbors);
+ ezcl_kernel_release(kernel_calc_neighbors_local);
+ ezcl_kernel_release(kernel_calc_border_cells);
+ ezcl_kernel_release(kernel_calc_border_cells2);
+ ezcl_kernel_release(kernel_finish_scan);
+ ezcl_kernel_release(kernel_get_border_data);
+ ezcl_kernel_release(kernel_calc_layer1);
+ ezcl_kernel_release(kernel_calc_layer1_sethash);
+ ezcl_kernel_release(kernel_calc_layer2);
+ ezcl_kernel_release(kernel_get_border_data2);
+ ezcl_kernel_release(kernel_calc_layer2_sethash);
+ //ezcl_kernel_release(kernel_calc_neighbors_local2);
+ ezcl_kernel_release(kernel_copy_mesh_data);
+ ezcl_kernel_release(kernel_fill_mesh_ghost);
+ ezcl_kernel_release(kernel_fill_neighbor_ghost);
+ ezcl_kernel_release(kernel_set_corner_neighbor);
+ ezcl_kernel_release(kernel_adjust_neighbors_local);
+ //ezcl_kernel_release(kernel_copy_ghost_data);
+ //ezcl_kernel_release(kernel_adjust_neighbors);
+ ezcl_kernel_release(kernel_hash_size);
+ ezcl_kernel_release(kernel_finish_hash_size);
+ ezcl_kernel_release(kernel_calc_spatial_coordinates);
+ ezcl_kernel_release(kernel_do_load_balance_lower);
+ ezcl_kernel_release(kernel_do_load_balance_middle);
+ ezcl_kernel_release(kernel_do_load_balance_upper);
+#ifndef MINIMUM_PRECISION
+ ezcl_kernel_release(kernel_do_load_balance_double);
+#endif
+ ezcl_kernel_release(kernel_do_load_balance_float);
+ ezcl_kernel_release(kernel_refine_smooth);
+ ezcl_kernel_release(kernel_coarsen_smooth);
+ ezcl_kernel_release(kernel_coarsen_check_block);
+ ezcl_kernel_release(kernel_rezone_all);
+ ezcl_kernel_release(kernel_rezone_neighbors);
+#ifndef MINIMUM_PRECISION
+ ezcl_kernel_release(kernel_rezone_one_double);
+#endif
+ ezcl_kernel_release(kernel_rezone_one_float);
+ ezcl_kernel_release(kernel_copy_mpot_ghost_data);
+ ezcl_kernel_release(kernel_set_boundary_refinement);
+ terminate_kernel_2stage_sum();
+ terminate_kernel_2stage_sum_int();
+ if (! have_boundary){  // kernel_count_BCs is only created when have_boundary is false
+ ezcl_kernel_release(kernel_count_BCs);
+ }
+#endif
+#if defined(HAVE_J7) && defined(HAVE_MPI)
+ if (parallel) mesh_memory.pfini();
+#endif
+}
+
+int Mesh::rezone_count(vector<int> mpot, int &icount, int &jcount)
+{
+   // Tally how many cells the flags in mpot would add (positive flags) and
+   // remove (negative flags) over the ncells local cells.
+   //   icount receives the number of cells added (3 per real cell, 1 otherwise).
+   //   jcount receives the number of cells removed, as a value <= 0.
+   // Returns icount + jcount.
+   int cells_added = 0;
+   int cells_removed = 0;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction (+:cells_removed,cells_added)
+#endif
+   for (uint ic = 0; ic < ncells; ++ic) {
+      const int flag = mpot[ic];
+
+      if (flag < 0) {
+         // Exactly one cell of each coarsened group is kept, so the count
+         // stays right when the group is split across processors: the
+         // lower-left cell for real cells, the upper-right or lower-left
+         // cell for boundary cells.
+         bool stays;
+         if (celltype[ic] == REAL_CELL) {
+            stays = is_lower_left(i[ic],j[ic]);
+         } else {
+            stays = is_upper_right(i[ic],j[ic]) || is_lower_left(i[ic],j[ic]);
+         }
+         if (! stays) cells_removed--;
+      } else if (flag > 0) {
+         // A refined real cell gains three cells; any other cell type gains one.
+         cells_added += (celltype[ic] == REAL_CELL) ? 3 : 1;
+      }
+   }
+
+   icount = cells_added;
+   jcount = cells_removed;
+
+   return icount + jcount;
+}
+
+#ifdef HAVE_OPENCL
+void Mesh::gpu_rezone_count2(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
+{
+   // Launch kernel_reduction_count2 as a single work-group of local_work_size
+   // items over dev_redscratch, writing to dev_result.
+   //   block_size      - element count passed to the kernel as argument 0 (isize)
+   //   local_work_size - work-group size; also sizes the cl_int2 local tile
+   //   dev_redscratch  - device scratch buffer passed as argument 1
+   //   dev_result      - device result buffer passed as argument 2
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   /*
+   __kernel void finish_reduction_count2_cl(
+                    const int   isize,      // 0
+           __global       int  *redscratch, // 1
+           __global       int  *result,     // 2
+           __local        int  *tile)       // 3
+   */
+   // Argument 0 is a 32-bit int on the device. Copying sizeof(cl_int) bytes
+   // straight out of the size_t parameter only works on little-endian hosts,
+   // so convert to a cl_int first.
+   cl_int isize = (cl_int)block_size;
+
+   ezcl_set_kernel_arg(kernel_reduction_count2, 0, sizeof(cl_int), (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_count2, 1, sizeof(cl_mem), (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_reduction_count2, 2, sizeof(cl_mem), (void *)&dev_result);
+   ezcl_set_kernel_arg(kernel_reduction_count2, 3, local_work_size*sizeof(cl_int2), NULL);
+
+   // Global size equals local size: exactly one work-group runs.
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count2, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+
+void Mesh::gpu_rezone_count(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result)
+{
+   // Launch kernel_reduction_count as a single work-group of local_work_size
+   // items over dev_redscratch, writing to dev_result.
+   //   block_size      - element count passed to the kernel as argument 0 (isize)
+   //   local_work_size - work-group size; also sizes the cl_int local tile
+   //   dev_redscratch  - device scratch buffer passed as argument 1
+   //   dev_result      - device result buffer passed as argument 2
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   /*
+   __kernel void finish_reduction_count_cl(
+                    const int   isize,      // 0
+           __global       int  *redscratch, // 1
+           __global       int  *result,     // 2
+           __local        int  *tile)       // 3
+   */
+   // Argument 0 is a 32-bit int on the device. Copying sizeof(cl_int) bytes
+   // straight out of the size_t parameter only works on little-endian hosts,
+   // so convert to a cl_int first.
+   cl_int isize = (cl_int)block_size;
+
+   ezcl_set_kernel_arg(kernel_reduction_count, 0, sizeof(cl_int), (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_count, 1, sizeof(cl_mem), (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_reduction_count, 2, sizeof(cl_mem), (void *)&dev_result);
+   ezcl_set_kernel_arg(kernel_reduction_count, 3, local_work_size*sizeof(cl_int), NULL);
+
+   // Global size equals local size: exactly one work-group runs.
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_count, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+
+void Mesh::gpu_rezone_scan(size_t block_size, size_t local_work_size, cl_mem dev_ioffset, cl_mem &dev_result)
+{
+   // Launch kernel_reduction_scan2 as a single work-group of local_work_size
+   // items over dev_ioffset, with dev_result as its result buffer.
+   //   block_size      - element count passed to the kernel as argument 0 (isize)
+   //   local_work_size - work-group size; also sizes the cl_uint2 local tile
+   //   dev_ioffset     - device offset buffer passed as argument 1
+   //   dev_result      - device result buffer passed as argument 2
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   /*
+   kernel_reduction_scan2 is created from "finish_reduction_scan2_cl".
+   Argument layout, as documented for the scan kernel (the element types
+   are not verified here; the local tile is sized in cl_uint2 units):
+                    const int   isize,      // 0
+           __global       int  *ioffset,    // 1
+           __global       int  *result,     // 2
+           __local        int  *tile)       // 3
+   */
+   // Argument 0 is a 32-bit int on the device. Copying sizeof(cl_int) bytes
+   // straight out of the size_t parameter only works on little-endian hosts,
+   // so convert to a cl_int first.
+   cl_int isize = (cl_int)block_size;
+
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 0, sizeof(cl_int), (void *)&isize);
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 2, sizeof(cl_mem), (void *)&dev_result);
+   ezcl_set_kernel_arg(kernel_reduction_scan2, 3, local_work_size*sizeof(cl_uint2), NULL);
+
+   // Global size equals local size: exactly one work-group runs.
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_scan2, 1, NULL, &local_work_size, &local_work_size, NULL);
+}
+#endif
+
+void Mesh::kdtree_setup()
+{
+   // Re-initialize the tree, then add one bounding box per local cell,
+   // spanning [x, x+dx] by [y, y+dy].
+   KDTree_Initialize(&tree);
+
+   TBounds cell_box;
+   uint cell = 0;
+   while (cell < ncells) {
+      cell_box.min.x = x[cell];
+      cell_box.min.y = y[cell];
+      cell_box.max.x = x[cell]+dx[cell];
+      cell_box.max.y = y[cell]+dy[cell];
+      KDTree_AddElement(&tree, &cell_box);
+      ++cell;
+   }
+}
+
+void Mesh::calc_spatial_coordinates(int ibase)
+{
+   // Recompute the minimum corner (x, y) and the extent (dx, dy) of every
+   // local cell from its (i, j, level) indices.
+   //   ibase - index origin subtracted from i and j when have_boundary is set;
+   //           otherwise the per-level lev_ibegin/lev_jbegin offsets are used.
+   // The elapsed time is added to cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   x.resize(ncells);
+   dx.resize(ncells);
+   y.resize(ncells);
+   dy.resize(ncells);
+
+#ifdef _OPENMP
+#pragma omp parallel
+   {
+#endif
+
+   // Each thread asks for its own [lowerBounds, upperBounds) slice of the cells.
+   int lowerBounds, upperBounds;
+   set_bounds(ncells);
+   get_bounds(lowerBounds, upperBounds);
+
+   if (have_boundary) {
+      for (uint ic = lowerBounds; ic < (uint)upperBounds; ic++) {
+         int lev = level[ic];
+         x[ic] = xmin + (lev_deltax[lev] * (i[ic] - ibase));
+         dx[ic] = lev_deltax[lev];
+         y[ic] = ymin + (lev_deltay[lev] * (j[ic] - ibase));
+         dy[ic] = lev_deltay[lev];
+      }
+   } else {
+      for (uint ic = lowerBounds; ic < (uint)upperBounds; ic++) {
+         int lev = level[ic];
+         x[ic] = xmin + (lev_deltax[lev] * (i[ic] - lev_ibegin[lev]));
+         dx[ic] = lev_deltax[lev];
+         y[ic] = ymin + (lev_deltay[lev] * (j[ic] - lev_jbegin[lev]));
+         dy[ic] = lev_deltay[lev];
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp barrier
+   } // end parallel region
+#endif
+
+   // Accumulate the timer once, outside the parallel region: inside it every
+   // thread performed this unsynchronized update of the shared cpu_timers
+   // entry, which is a data race and counts the interval once per thread.
+   cpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Run kernel_calc_spatial_coordinates to fill the device coordinate buffers.
+ *
+ *   dev_x, dev_dx, dev_y, dev_dy  device buffers bound to kernel arguments 5-8
+ *
+ * The kernel also receives ncells, xmin, ymin and the member device buffers
+ * dev_levdx, dev_levdy, dev_level, dev_i and dev_j. The call blocks until the
+ * kernel's event completes, then adds the elapsed wall time (in nanoseconds)
+ * to gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES].
+ *
+ * Nothing is launched for an empty mesh.
+ */
+void Mesh::gpu_calc_spatial_coordinates(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy)
+{
+   // With ncells == 0 the work-group size below would be 0 and the global size
+   // computation would divide by zero; there is nothing to compute anyway.
+   if (ncells == 0) return;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_event calc_spatial_coordinates_event;
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   // Round the global size up to a whole number of work-groups.
+   size_t local_work_size  = MIN(ncells, TILE_SIZE);
+   size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+
+// Only coded for base 0 and have boundary
+//  Need:
+//     xmin
+//     ymin
+//
+//     lev_deltax -- dev_levdx
+//     lev_deltay -- dev_levdy
+//     x
+//     dx
+//     y
+//     dy
+//     level
+//     i
+//     j
+
+   // Argument 0 is passed with sizeof(cl_int); hand over an object that really
+   // is a cl_int so the bytes read do not depend on the width or byte order of
+   // the ncells member.
+   cl_int ncells_arg = (cl_int)ncells;
+
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  0, sizeof(cl_int),    (void *)&ncells_arg);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  1, sizeof(cl_real_t), (void *)&xmin);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  2, sizeof(cl_real_t), (void *)&ymin);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  3, sizeof(cl_mem),    (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  4, sizeof(cl_mem),    (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  5, sizeof(cl_mem),    (void *)&dev_x);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  6, sizeof(cl_mem),    (void *)&dev_dx);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  7, sizeof(cl_mem),    (void *)&dev_y);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  8, sizeof(cl_mem),    (void *)&dev_dy);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates,  9, sizeof(cl_mem),    (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 10, sizeof(cl_mem),    (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_calc_spatial_coordinates, 11, sizeof(cl_mem),    (void *)&dev_j);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_spatial_coordinates, 1, NULL, &global_work_size, &local_work_size, &calc_spatial_coordinates_event);
+
+   // Block until the kernel has finished so the timer covers its execution.
+   ezcl_wait_for_events(1, &calc_spatial_coordinates_event);
+   ezcl_event_release(calc_spatial_coordinates_event);
+
+   gpu_timers[MESH_TIMER_CALC_SPATIAL_COORDINATES] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+/*
+ * Compute the bounding extents of the mesh.
+ *
+ * xmin/ymin (and zmin when ndim > TWO_DIMENSIONAL) become the smallest cell
+ * origin; xmax/ymax (and zmax) become the largest cell origin plus cell
+ * size. The extents start at +/-1.0e30. When built with MPI and running in
+ * parallel, the x and y extents are then reduced over MPI_COMM_WORLD; the z
+ * extents are not.
+ */
+void Mesh::calc_minmax(void)
+{
+   xmin = +1.0e30;
+   ymin = +1.0e30;
+   zmin = +1.0e30;
+
+   xmax = -1.0e30;
+   ymax = -1.0e30;
+   zmax = -1.0e30;
+
+   // One sweep over the cells covers the lower and upper edges in x and y.
+   for (uint cell = 0; cell < ncells; cell++) {
+      if (x[cell] < xmin) xmin = x[cell];
+      if (y[cell] < ymin) ymin = y[cell];
+
+      real_t x_upper = x[cell]+dx[cell];
+      if (x_upper > xmax) xmax = x_upper;
+
+      real_t y_upper = y[cell]+dy[cell];
+      if (y_upper > ymax) ymax = y_upper;
+   }
+
+   // The z arrays are only touched for meshes with more than two dimensions.
+   if (ndim > TWO_DIMENSIONAL) {
+      for (uint cell = 0; cell < ncells; cell++) {
+         if (z[cell] < zmin) zmin = z[cell];
+
+         real_t z_upper = z[cell]+dz[cell];
+         if (z_upper > zmax) zmax = z_upper;
+      }
+   }
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      // Combine the per-rank extents; results land in temporaries first
+      // because send and receive buffers must be distinct here.
+      real_t reduced_xmin, reduced_xmax, reduced_ymin, reduced_ymax;
+      MPI_Allreduce(&xmin, &reduced_xmin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&xmax, &reduced_xmax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      MPI_Allreduce(&ymin, &reduced_ymin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&ymax, &reduced_ymax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      xmin = reduced_xmin;
+      xmax = reduced_xmax;
+      ymin = reduced_ymin;
+      ymax = reduced_ymax;
+   }
+#endif
+
+}
+/*
+ * Compute the extents of the cell centers.
+ *
+ * A cell center is origin + 0.5 * size in each direction. The results are
+ * stored in xcentermin/xcentermax, ycentermin/ycentermax and, when
+ * ndim > TWO_DIMENSIONAL, zcentermin/zcentermax. The extents start at
+ * +/-1.0e30. When built with MPI and running in parallel, the x and y
+ * extents are then reduced over MPI_COMM_WORLD; the z extents are not.
+ */
+void Mesh::calc_centerminmax(void)
+{
+   xcentermin = +1.0e30;
+   ycentermin = +1.0e30;
+   zcentermin = +1.0e30;
+
+   xcentermax = -1.0e30;
+   ycentermax = -1.0e30;
+   zcentermax = -1.0e30;
+
+   // One sweep over the cells handles both the x and the y centers.
+   for (uint cell = 0; cell < ncells; cell++) {
+      real_t x_center = x[cell]+0.5*dx[cell];
+      if (x_center < xcentermin) xcentermin = x_center;
+      if (x_center > xcentermax) xcentermax = x_center;
+
+      real_t y_center = y[cell]+0.5*dy[cell];
+      if (y_center < ycentermin) ycentermin = y_center;
+      if (y_center > ycentermax) ycentermax = y_center;
+   }
+
+   // The z arrays are only touched for meshes with more than two dimensions.
+   if (ndim > TWO_DIMENSIONAL) {
+      for (uint cell = 0; cell < ncells; cell++) {
+         real_t z_center = z[cell]+0.5*dz[cell];
+         if (z_center < zcentermin) zcentermin = z_center;
+         if (z_center > zcentermax) zcentermax = z_center;
+      }
+   }
+
+#ifdef HAVE_MPI
+   if (parallel) {
+      // Combine the per-rank extents; results land in temporaries first
+      // because send and receive buffers must be distinct here.
+      real_t reduced_xmin, reduced_xmax, reduced_ymin, reduced_ymax;
+      MPI_Allreduce(&xcentermin, &reduced_xmin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&xcentermax, &reduced_xmax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      MPI_Allreduce(&ycentermin, &reduced_ymin, 1, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+      MPI_Allreduce(&ycentermax, &reduced_ymax, 1, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+      xcentermin = reduced_xmin;
+      xcentermax = reduced_xmax;
+      ycentermin = reduced_ymin;
+      ycentermax = reduced_ymax;
+   }
+#endif
+
+}
+
+/*
+ * Rebuild the mesh arrays (i, j, level, cell types, optional state arrays and
+ * neighbor arrays) according to the per-cell refinement flags in mpot.
+ *
+ *   icount, jcount  the number of cells added is icount - abs(jcount)
+ *   mpot            per-cell flag, taken by value and modified locally:
+ *                   0 keeps the cell, > 0 refines it, < 0 marks it for
+ *                   coarsening (re-tagged to -2 / -3 below when have_state)
+ *   have_state      when non-zero, cell types are saved and every array in
+ *                   state_memory with element size 8 or 4 is remapped
+ *   state_memory    container of state arrays to remap
+ *
+ * If do_rezone is false the routine only sets index to the identity map.
+ * The OpenMP directives used here (master, barrier, for) are not wrapped in a
+ * parallel construct inside this function.
+ * NOTE(review): presumably the caller runs this inside a parallel region, and
+ * the function-local statics exist so values written by the master thread are
+ * visible to the whole team -- confirm against the callers.
+ * ncells itself is not updated here (the assignment near the end is
+ * commented out).
+ */
+void Mesh::rezone_all(int icount, int jcount, vector<int> mpot, int have_state, MallocPlus &state_memory)
+{
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ // No rezone requested: index becomes the identity map over the current cells.
+ if (! do_rezone) {
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ index.clear();
+ index.resize(ncells);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++){
+ index[ic]=ic;
+ }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
+
+ } else {
+
+// sign for jcount is different in GPU and CPU code -- abs is a quick fix
+ int add_ncells = icount - abs(jcount);
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_counters[MESH_COUNTER_REZONE]++;
+
+ // Static locals: one instance shared by every caller/thread of this function.
+ static vector<int> celltype_save;
+
+ static int new_ncells;
+
+ static int *i_old, *j_old, *level_old;
+
+ static int ifirst;
+ static int ilast;
+ static int jfirst;
+ static int jlast;
+ static int level_first;
+ static int level_last;
+
+ static vector<int> new_ic;
+
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ celltype_save.resize(ncells);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Keep the pre-rezone cell types; they are consulted when remapping state.
+ if (have_state) {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = 0; ic < (int)ncells; ic++){
+ celltype_save[ic] = celltype[ic];
+ }
+ }
+
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ new_ncells = ncells + add_ncells;
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+// int ref_entry_count = 0;
+ // Re-tag coarsening cells: -2 for the lower-left cell, -3 for a non-real
+ // (boundary) cell that is upper-right. Other negative flags are left as is.
+ if (have_state){
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++) {
+// if (mpot[ic] > 0) ref_entry_count++;
+ if (mpot[ic] < 0) {
+ // Normal cell coarsening
+ if (is_lower_left(i[ic],j[ic]) ) mpot[ic] = -2;
+ // Boundary cell case
+ if (celltype[ic] != REAL_CELL && is_upper_right(i[ic],j[ic]) ) mpot[ic] = -3;
+ }
+ }
+ }
+
+ // Initialize new variables
+// int *i_old, *j_old, *level_old;
+
+ int flags = RESTART_DATA;
+#ifdef HAVE_J7
+ if (parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+ // Allocate arrays sized for the new mesh and swap them with i/j/level. The
+ // code below reads the old mesh through *_old and writes the new mesh
+ // through i/j/level.
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ i_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "i_old", flags);
+ j_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "j_old", flags);
+ level_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "level_old", flags);
+
+ mesh_memory.memory_swap(&i, &i_old);
+ mesh_memory.memory_swap(&j, &j_old);
+ mesh_memory.memory_swap(&level, &level_old);
+
+ index.clear();
+ index.resize(new_ncells);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ static vector<int> order; // Vector of refined mesh traversal order; set to -1 to indicate errors.
+ //
+ //vector<int> invorder(4, -1); // Vector mapping location from base index.
+
+ //int ref_entry = 0;
+
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ // Insert new cells into the mesh at the point of refinement.
+ order.resize(4, -1); // Vector of refined mesh traversal order; set to -1 to indicate errors.
+
+ ifirst = 0;
+ ilast = 0;
+ jfirst = 0;
+ jlast = 0;
+ level_first = 0;
+ level_last = 0;
+
+ // In parallel runs, exchange the first/last old cell's i, j and level with
+ // the neighboring ranks (prev = mype-1, next = mype+1; MPI_PROC_NULL at the ends).
+ if (parallel) {
+#ifdef HAVE_MPI
+ MPI_Request req[12];
+ MPI_Status status[12];
+
+ static int prev = MPI_PROC_NULL;
+ static int next = MPI_PROC_NULL;
+
+ if (mype != 0) prev = mype-1;
+ if (mype < numpe - 1) next = mype+1;
+
+ MPI_Isend(&i_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
+ MPI_Irecv(&ifirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
+
+ MPI_Isend(&i_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
+ MPI_Irecv(&ilast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
+
+ MPI_Isend(&j_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
+ MPI_Irecv(&jfirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
+
+ MPI_Isend(&j_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
+ MPI_Irecv(&jlast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
+
+ MPI_Isend(&level_old[ncells-1], 1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
+ MPI_Irecv(&level_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
+
+ MPI_Isend(&level_old[0], 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
+ MPI_Irecv(&level_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
+
+ MPI_Waitall(12, req, status);
+#endif
+ }
+
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Serial variant: nc is a running output index advanced as new cells are written.
+#ifdef REZONE_NO_OPTIMIZATION
+ vector<int> invorder(4, -1); // Vector mapping location from base index.
+ for (int ic = 0, nc = 0; ic < (int)ncells; ic++)
+ {
+ if (mpot[ic] == 0 || mpot[ic] == -1000000)
+ { // No change is needed; copy the old cell straight to the new mesh at this location.
+ index[ic] = nc;
+ i[nc] = i_old[ic];
+ j[nc] = j_old[ic];
+ level[nc] = level_old[ic];
+ nc++;
+ } // Complete no change needed.
+
+ else if (mpot[ic] < 0)
+ { // Coarsening is needed; remove this cell and the other three and replace them with one.
+ index[ic] = nc;
+ if (mpot[ic] <= -2) {
+ //printf(" %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
+ i[nc] = i_old[ic]/2;
+ j[nc] = j_old[ic]/2;
+ level[nc] = level_old[ic] - 1;
+ nc++;
+ }
+ } // Coarsening complete.
+
+ else if (mpot[ic] > 0)
+ { // Refinement is needed; insert four cells where once was one.
+ index[ic] = nc;
+ if (celltype[ic] == REAL_CELL)
+ {
+ set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
+ level_first, level_last, i_old, j_old, level_old);
+
+ // Create the cells in the correct order and orientation.
+ for (int ii = 0; ii < 4; ii++)
+ { level[nc] = level_old[ic] + 1;
+ switch (order[ii])
+ { case SW:
+ // lower left
+ invorder[SW] = ii;
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ nc++;
+ break;
+
+ case SE:
+ // lower right
+ invorder[SE] = ii;
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ nc++;
+ break;
+
+ case NW:
+ // upper left
+ invorder[NW] = ii;
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ nc++;
+ break;
+
+ case NE:
+ // upper right
+ invorder[NE] = ii;
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ nc++;
+ break; } } // Complete cell refinement.
+ } // Complete real cell refinement.
+
+ // Boundary cells are split into two cells rather than four.
+ else if (celltype[ic] == LEFT_BOUNDARY) {
+ // lower
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // upper
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == RIGHT_BOUNDARY) {
+ // lower
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // upper
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == BOTTOM_BOUNDARY) {
+ // left
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // right
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == TOP_BOUNDARY) {
+ // right
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // left
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ } // Complete refinement needed.
+ } // Complete addition of new cells to the mesh.
+
+ mesh_memory.memory_delete(i_old);
+ mesh_memory.memory_delete(j_old);
+ mesh_memory.memory_delete(level_old);
+
+ calc_celltype(new_ncells);
+
+ // Remap every state array: copy for unchanged cells, average four cells
+ // when coarsening, replicate the parent value when refining.
+ if (have_state){
+ flags = RESTART_DATA;
+ MallocPlus state_memory_old = state_memory;
+ malloc_plus_memory_entry *memory_item;
+
+ for (memory_item = state_memory_old.memory_entry_by_name_begin();
+ memory_item != state_memory_old.memory_entry_by_name_end();
+ memory_item = state_memory_old.memory_entry_by_name_next() ) {
+ //printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+ if (memory_item->mem_elsize == 8) {
+ double *state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
+ "state_temp_double", flags);
+
+ double *mem_ptr_double = (double *)memory_item->mem_ptr;
+
+ //ref_entry = 0;
+ for (int ic=0, nc=0; ic<(int)ncells; ic++) {
+
+ if (mpot[ic] == 0) {
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+ } else if (mpot[ic] < 0){
+ if (mpot[ic] == -2) {
+ int nr = nrht[ic];
+ int nt = ntop[ic];
+ int nrt = nrht[nt];
+ state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
+ mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
+ nc++;
+ }
+ if (mpot[ic] == -3) {
+ int nl = nlft[ic];
+ int nb = nbot[ic];
+ int nlb = nlft[nb];
+ state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
+ mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
+ nc++;
+ }
+ } else if (mpot[ic] > 0){
+ // lower left
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ // lower right
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ if (celltype_save[ic] == REAL_CELL){
+ // upper left
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ // upper right
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+ }
+ }
+ }
+
+ state_memory.memory_replace(mem_ptr_double, state_temp_double);
+ } else if (memory_item->mem_elsize == 4) {
+ float *state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
+ "state_temp_float", flags);
+
+ float *mem_ptr_float = (float *)memory_item->mem_ptr;
+
+ for (int ic=0, nc=0; ic<(int)ncells; ic++) {
+
+ if (mpot[ic] == 0) {
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+ } else if (mpot[ic] < 0){
+ if (mpot[ic] == -2) {
+ int nr = nrht[ic];
+ int nt = ntop[ic];
+ int nrt = nrht[nt];
+ state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
+ mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
+ nc++;
+ }
+ if (mpot[ic] == -3) {
+ int nl = nlft[ic];
+ int nb = nbot[ic];
+ int nlb = nlft[nb];
+ state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
+ mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
+ nc++;
+ }
+ } else if (mpot[ic] > 0){
+ // lower left
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ // lower right
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ if (celltype_save[ic] == REAL_CELL){
+ // upper left
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ // upper right
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+ }
+ }
+ }
+
+ state_memory.memory_replace(mem_ptr_float, state_temp_float);
+ }
+ }
+ }
+#else
+ // Data parallel optimizations for thread parallel -- slows down serial
+ // code by about 25%
+ static vector<int> add_count;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ add_count.resize(ncells);
+ new_ic.resize(ncells+1);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ // add_count[ic] = number of new-mesh cells produced by old cell ic:
+ // 1 if kept or flagged -2, 0 for other negative flags, 2 or 4 if refined.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = 0; ic < (int)ncells; ic++){
+ if (mpot[ic] == 0) {
+ add_count[ic] = 1;
+ } else if (mpot[ic] < 0) {
+ if (mpot[ic] == -2){
+ add_count[ic] = 1;
+ } else {
+ add_count[ic] = 0;
+ }
+ } else if (mpot[ic] > 0) {
+ if (celltype[ic] != REAL_CELL) {
+ add_count[ic] = 2;
+ } else {
+ add_count[ic] = 4;
+ }
+ }
+ }
+
+ // new_ic[ic] is used below as the first output slot for old cell ic.
+ // NOTE(review): presumably scan() is an exclusive prefix sum of add_count -- confirm.
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+ scan (&add_count[0], &new_ic[0], ncells);
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = 0; ic < (int)ncells; ic++) {
+ vector<int> invorder(4, -1); // Vector mapping location from base index.
+ int nc = new_ic[ic];
+ if (mpot[ic] == 0)
+ { // No change is needed; copy the old cell straight to the new mesh at this location.
+ index[ic] = nc;
+ i[nc] = i_old[ic];
+ j[nc] = j_old[ic];
+ level[nc] = level_old[ic];
+ } // Complete no change needed.
+
+ else if (mpot[ic] < 0)
+ { // Coarsening is needed; remove this cell and the other three and replace them with one.
+ index[ic] = nc;
+ if (mpot[ic] <= -2) {
+ //printf(" %d: DEBUG -- coarsening cell %d nc %d\n",mype,ic,nc);
+ i[nc] = i_old[ic]/2;
+ j[nc] = j_old[ic]/2;
+ level[nc] = level_old[ic] - 1;
+ }
+ } // Coarsening complete.
+
+ else if (mpot[ic] > 0)
+ { // Refinement is needed; insert four cells where once was one.
+ index[ic] = nc;
+ if (celltype[ic] == REAL_CELL)
+ {
+ // Per-iteration array; it hides the static vector named order declared above.
+ int order[4];
+ set_refinement_order(&order[0], ic, ifirst, ilast, jfirst, jlast,
+ level_first, level_last, i_old, j_old, level_old);
+
+ // Create the cells in the correct order and orientation.
+ for (int ii = 0; ii < 4; ii++) {
+ level[nc] = level_old[ic] + 1;
+ switch (order[ii]) {
+ case SW:
+ // lower left
+ invorder[SW] = ii;
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ nc++;
+ break;
+
+ case SE:
+ // lower right
+ invorder[SE] = ii;
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ nc++;
+ break;
+
+ case NW:
+ // upper left
+ invorder[NW] = ii;
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ nc++;
+ break;
+
+ case NE:
+ // upper right
+ invorder[NE] = ii;
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ nc++;
+ break;
+ }
+ } // Complete cell refinement.
+ } // Complete real cell refinement.
+
+ // Boundary cells are split into two cells rather than four.
+ else if (celltype[ic] == LEFT_BOUNDARY) {
+ // lower
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // upper
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == RIGHT_BOUNDARY) {
+ // lower
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // upper
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == BOTTOM_BOUNDARY) {
+ // left
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // right
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2 + 1;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ else if (celltype[ic] == TOP_BOUNDARY) {
+ // right
+ i[nc] = i_old[ic]*2 + 1;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+
+ // left
+ i[nc] = i_old[ic]*2;
+ j[nc] = j_old[ic]*2;
+ level[nc] = level_old[ic] + 1;
+ nc++;
+ }
+ } // Complete refinement needed.
+ } // Complete addition of new cells to the mesh.
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ mesh_memory.memory_delete(i_old);
+ mesh_memory.memory_delete(j_old);
+ mesh_memory.memory_delete(level_old);
+#ifdef _OPENMP
+ } // end master region
+#endif
+
+ calc_celltype_threaded(new_ncells);
+
+ // Remap every state array: copy for unchanged cells, average four cells
+ // when coarsening, replicate the parent value when refining.
+ if (have_state){
+
+ static MallocPlus state_memory_old;
+ static malloc_plus_memory_entry *memory_begin;
+ static malloc_plus_memory_entry *memory_end;
+ static malloc_plus_memory_entry *memory_next;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ state_memory_old = state_memory;
+
+ memory_begin = state_memory_old.memory_entry_by_name_begin();
+ memory_end = state_memory_old.memory_entry_by_name_end();
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ // The master thread advances the iterator (memory_next) at the bottom of
+ // each pass; all threads then pick it up after the barrier.
+ for (malloc_plus_memory_entry *memory_item = memory_begin;
+ memory_item != memory_end;
+ memory_item = memory_next ) {
+ //ref_entry = 0;
+ //printf("DEBUG -- memory_item->mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+ if (memory_item->mem_elsize == 8) {
+
+ static double *state_temp_double, *mem_ptr_double;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ state_temp_double = (double *)state_memory.memory_malloc(new_ncells, sizeof(double),
+ "state_temp_double", flags);
+ mem_ptr_double = (double *)memory_item->mem_ptr;
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ //ref_entry = 0;
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic=0; ic<(int)ncells; ic++) {
+
+ int nc = new_ic[ic];
+ if (mpot[ic] == 0) {
+ state_temp_double[nc] = mem_ptr_double[ic];
+ } else if (mpot[ic] < 0){
+ if (mpot[ic] == -2) {
+ int nr = nrht[ic];
+ int nt = ntop[ic];
+ int nrt = nrht[nt];
+ state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nr] +
+ mem_ptr_double[nt] + mem_ptr_double[nrt])*0.25;
+ }
+ if (mpot[ic] == -3) {
+ int nl = nlft[ic];
+ int nb = nbot[ic];
+ int nlb = nlft[nb];
+ state_temp_double[nc] = (mem_ptr_double[ic] + mem_ptr_double[nl] +
+ mem_ptr_double[nb] + mem_ptr_double[nlb])*0.25;
+ }
+ } else if (mpot[ic] > 0){
+ // lower left
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ // lower right
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ if (celltype_save[ic] == REAL_CELL){
+ // upper left
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+
+ // upper right
+ state_temp_double[nc] = mem_ptr_double[ic];
+ nc++;
+ }
+ }
+ } // end cell loop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ state_memory.memory_replace(mem_ptr_double, state_temp_double);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ } else if (memory_item->mem_elsize == 4) {
+
+ static float *state_temp_float, *mem_ptr_float;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ state_temp_float = (float *)state_memory.memory_malloc(new_ncells, sizeof(float),
+ "state_temp_float", flags);
+ mem_ptr_float = (float *)memory_item->mem_ptr;
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic=0; ic<(int)ncells; ic++) {
+
+ int nc = new_ic[ic];
+ if (mpot[ic] == 0) {
+ state_temp_float[nc] = mem_ptr_float[ic];
+ } else if (mpot[ic] < 0){
+ if (mpot[ic] == -2) {
+ int nr = nrht[ic];
+ int nt = ntop[ic];
+ int nrt = nrht[nt];
+ state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nr] +
+ mem_ptr_float[nt] + mem_ptr_float[nrt])*0.25;
+ }
+ if (mpot[ic] == -3) {
+ int nl = nlft[ic];
+ int nb = nbot[ic];
+ int nlb = nlft[nb];
+ state_temp_float[nc] = (mem_ptr_float[ic] + mem_ptr_float[nl] +
+ mem_ptr_float[nb] + mem_ptr_float[nlb])*0.25;
+ }
+ } else if (mpot[ic] > 0){
+ // lower left
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ // lower right
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ if (celltype_save[ic] == REAL_CELL){
+ // upper left
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+
+ // upper right
+ state_temp_float[nc] = mem_ptr_float[ic];
+ nc++;
+ }
+ }
+ } // end cell loop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ state_memory.memory_replace(mem_ptr_float, state_temp_float);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ } // mem elem size 4 bytes
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ memory_next = state_memory_old.memory_entry_by_name_next();
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ } // memory item iteration
+
+ } // if have state
+ // End of data parallel optimizations
+#endif
+
+ // Neighbor arrays: either rebuild them for the new mesh (neighbor_remap)
+ // or delete them.
+ if (neighbor_remap) {
+ int flags = 0;
+ static int *nlft_old, *nrht_old, *nbot_old, *ntop_old;
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ nlft_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nlft_old", flags);
+ nrht_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nrht_old", flags);
+ nbot_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "nbot_old", flags);
+ ntop_old = (int *)mesh_memory.memory_malloc(new_ncells, sizeof(int), "ntop_old", flags);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ flags = RESTART_DATA;
+
+ // Fill the freshly allocated arrays with -1 before they are swapped in
+ // as the new neighbor arrays.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = 0; ic < new_ncells; ic++){
+ nlft_old[ic] = -1;
+ nrht_old[ic] = -1;
+ nbot_old[ic] = -1;
+ ntop_old[ic] = -1;
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ mesh_memory.memory_swap(&nlft, &nlft_old);
+ mesh_memory.memory_swap(&nrht, &nrht_old);
+ mesh_memory.memory_swap(&nbot, &nbot_old);
+ mesh_memory.memory_swap(&ntop, &ntop_old);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ // Carry a neighbor link over only when both the cell and that neighbor
+ // are unchanged (mpot == 0); all other links are set to -1.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = 0; ic < (int)ncells; ic++){
+ int nc = index[ic];
+
+ if (mpot[ic] == 0){
+ if (nlft_old[ic] < (int)ncells && nlft_old[ic] >= 0){
+ nlft[nc] = (mpot[nlft_old[ic]] == 0) ? index[nlft_old[ic]] : -1;
+ }
+ if (nrht_old[ic] < (int)ncells && nrht_old[ic] >= 0){
+ nrht[nc] = (mpot[nrht_old[ic]] == 0) ? index[nrht_old[ic]] : -1;
+ }
+ if (nbot_old[ic] < (int)ncells && nbot_old[ic] >= 0){
+ nbot[nc] = (mpot[nbot_old[ic]] == 0) ? index[nbot_old[ic]] : -1;
+ }
+ if (ntop_old[ic] < (int)ncells && ntop_old[ic] >= 0){
+ ntop[nc] = (mpot[ntop_old[ic]] == 0) ? index[ntop_old[ic]] : -1;
+ }
+ } else if (mpot[ic] <= -2) {
+ nlft[nc] = -1;
+ nrht[nc] = -1;
+ nbot[nc] = -1;
+ ntop[nc] = -1;
+ } else if (mpot[ic] > 0){
+ nlft[nc] = -1;
+ nlft[nc+1] = -1;
+ nrht[nc] = -1;
+ nrht[nc+1] = -1;
+ nbot[nc] = -1;
+ nbot[nc+1] = -1;
+ ntop[nc] = -1;
+ ntop[nc+1] = -1;
+ // celltype here is indexed by the new-mesh index nc, not by ic.
+ if (celltype[nc] == REAL_CELL){
+ nlft[nc+2] = -1;
+ nlft[nc+3] = -1;
+ nrht[nc+2] = -1;
+ nrht[nc+3] = -1;
+ nbot[nc+2] = -1;
+ nbot[nc+3] = -1;
+ ntop[nc+2] = -1;
+ ntop[nc+3] = -1;
+ }
+ }
+ // For a refined cell, the second new cell (nc+1) is made its own neighbor
+ // on the side matching its boundary type.
+ if (mpot[ic] > 0){
+ nc++;
+ switch(celltype[nc]){
+ case LEFT_BOUNDARY:
+ nlft[nc] = nc;
+ break;
+ case RIGHT_BOUNDARY:
+ nrht[nc] = nc;
+ break;
+ case BOTTOM_BOUNDARY:
+ nbot[nc] = nc;
+ break;
+ case TOP_BOUNDARY:
+ ntop[nc] = nc;
+ break;
+ }
+ }
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ nlft_old = (int *)mesh_memory.memory_delete(nlft_old);
+ nrht_old = (int *)mesh_memory.memory_delete(nrht_old);
+ nbot_old = (int *)mesh_memory.memory_delete(nbot_old);
+ ntop_old = (int *)mesh_memory.memory_delete(ntop_old);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ } else {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ nlft = (int *)mesh_memory.memory_delete(nlft);
+ nrht = (int *)mesh_memory.memory_delete(nrht);
+ nbot = (int *)mesh_memory.memory_delete(nbot);
+ ntop = (int *)mesh_memory.memory_delete(ntop);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ //ncells = nc;
+
+ // In parallel runs, gather every rank's new cell count and rebuild the
+ // displacement table, this rank's offset and the global cell count.
+#ifdef HAVE_MPI
+ if (parallel) {
+ MPI_Allgather(&new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ noffset=ndispl[mype];
+ ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
+ }
+#endif
+
+ cpu_timers[MESH_TIMER_REZONE_ALL] += cpu_timer_stop(tstart_cpu);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ } // if do_rezone
+
+}
+
+#ifdef HAVE_OPENCL
+void Mesh::gpu_rezone_all(int icount, int jcount, cl_mem &dev_mpot, MallocPlus &gpu_state_memory)
+{
+ if (! gpu_do_rezone) return;
+
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ gpu_counters[MESH_COUNTER_REZONE]++;
+
+ cl_command_queue command_queue = ezcl_get_command_queue();
+
+ assert(dev_mpot);
+ assert(dev_level);
+ assert(dev_i);
+ assert(dev_j);
+ assert(dev_celltype);
+ assert(dev_ioffset);
+ assert(dev_levdx);
+ assert(dev_levdy);
+
+ int add_ncells = icount - jcount;
+
+// int global_icount = icount;
+// int global_jcount = jcount;
+
+ size_t old_ncells = ncells;
+ size_t new_ncells = ncells + add_ncells;
+
+#ifdef HAVE_MPI
+ //int global_add_ncells = add_ncells;
+
+// if (parallel) {
+// int count[2], global_count[2];
+// count[0] = icount;
+// count[1] = jcount;
+// MPI_Allreduce(&count, &global_count, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+// global_icount = global_count[0];
+// global_jcount = global_count[1];
+// //global_add_ncells = global_icount + global_jcount;
+// }
+#endif
+
+ int ifirst = 0;
+ int ilast = 0;
+ int jfirst = 0;
+ int jlast = 0;
+ int level_first = 0;
+ int level_last = 0;
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ int i_tmp_first, i_tmp_last;
+ int j_tmp_first, j_tmp_last;
+ int level_tmp_first, level_tmp_last;
+
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, 1*sizeof(cl_int), &i_tmp_first, NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, 1*sizeof(cl_int), &j_tmp_first, NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, 1*sizeof(cl_int), &level_tmp_first, NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &i_tmp_last, NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &j_tmp_last, NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, (old_ncells-1)*sizeof(cl_int), 1*sizeof(cl_int), &level_tmp_last, NULL);
+
+ MPI_Request req[12];
+ MPI_Status status[12];
+
+ static int prev = MPI_PROC_NULL;
+ static int next = MPI_PROC_NULL;
+
+ if (mype != 0) prev = mype-1;
+ if (mype < numpe - 1) next = mype+1;
+
+ MPI_Isend(&i_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+0);
+ MPI_Irecv(&ifirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+1);
+
+ MPI_Isend(&i_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+2);
+ MPI_Irecv(&ilast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+3);
+
+ MPI_Isend(&j_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+4);
+ MPI_Irecv(&jfirst, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+5);
+
+ MPI_Isend(&j_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+6);
+ MPI_Irecv(&jlast, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+7);
+
+ MPI_Isend(&level_tmp_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+8);
+ MPI_Irecv(&level_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+9);
+
+ MPI_Isend(&level_tmp_first, 1,MPI_INT,prev,1,MPI_COMM_WORLD,req+10);
+ MPI_Irecv(&level_last, 1,MPI_INT,next,1,MPI_COMM_WORLD,req+11);
+
+ MPI_Waitall(12, req, status);
+ }
+#endif
+
+/*
+ if (new_ncells != old_ncells){
+ ncells = new_ncells;
+ }
+*/
+
+ size_t mem_request = (int)((float)new_ncells*mem_factor);
+ cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ cl_mem dev_ijadd;
+
+ vector<int>ijadd(6);
+ if (numpe > 1) {
+ ijadd[0] = ifirst;
+ ijadd[1] = ilast;
+ ijadd[2] = jfirst;
+ ijadd[3] = jlast;
+ ijadd[4] = level_first;
+ ijadd[5] = level_last;
+ }
+
+ size_t six = 6;
+ dev_ijadd = ezcl_malloc(NULL, const_cast<char *>("dev_ijadd"), &six, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ ezcl_enqueue_write_buffer(command_queue, dev_ijadd, CL_TRUE, 0, 6*sizeof(cl_int), (void*)&ijadd[0], NULL);
+
+ cl_mem dev_indexoffset = ezcl_malloc(NULL, const_cast<char *>("dev_indexoffset"), &old_ncells, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+
+ int stencil = 0;
+ if (localStencil) stencil = 1;
+
+ size_t local_work_size = 128;
+ size_t global_work_size = ((old_ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+ ezcl_set_kernel_arg(kernel_rezone_all, 0, sizeof(cl_int), (void *)&old_ncells);
+ ezcl_set_kernel_arg(kernel_rezone_all, 1, sizeof(cl_int), (void *)&stencil);
+ ezcl_set_kernel_arg(kernel_rezone_all, 2, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_rezone_all, 3, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_rezone_all, 4, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_rezone_all, 5, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_rezone_all, 6, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_rezone_all, 7, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_rezone_all, 8, sizeof(cl_mem), (void *)&dev_level_new);
+ ezcl_set_kernel_arg(kernel_rezone_all, 9, sizeof(cl_mem), (void *)&dev_i_new);
+ ezcl_set_kernel_arg(kernel_rezone_all, 10, sizeof(cl_mem), (void *)&dev_j_new);
+ ezcl_set_kernel_arg(kernel_rezone_all, 11, sizeof(cl_mem), (void *)&dev_celltype_new);
+ ezcl_set_kernel_arg(kernel_rezone_all, 12, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_rezone_all, 13, sizeof(cl_mem), (void *)&dev_indexoffset);
+ ezcl_set_kernel_arg(kernel_rezone_all, 14, sizeof(cl_mem), (void *)&dev_levdx);
+ ezcl_set_kernel_arg(kernel_rezone_all, 15, sizeof(cl_mem), (void *)&dev_levdy);
+ ezcl_set_kernel_arg(kernel_rezone_all, 16, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_rezone_all, 17, sizeof(cl_mem), (void *)&dev_ijadd);
+ ezcl_set_kernel_arg(kernel_rezone_all, 18, local_work_size * sizeof(cl_uint), NULL);
+ //ezcl_set_kernel_arg(kernel_rezone_all, 19, local_work_size * sizeof(cl_real4_t), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_all, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ MallocPlus gpu_state_memory_old = gpu_state_memory;
+ malloc_plus_memory_entry *memory_item;
+
+ for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
+ memory_item != gpu_state_memory_old.memory_entry_by_name_end();
+ memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
+ //printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+ cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
+
+ if (memory_item->mem_elsize == 8){
+#ifndef MINIMUM_PRECISION
+ cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_double), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
+
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 0, sizeof(cl_int), (void *)&old_ncells);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 1, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 2, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 3, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 4, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 5, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 6, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 7, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 8, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_rezone_one_double, 9, sizeof(cl_mem), (void *)&dev_indexoffset);
+ ezcl_set_kernel_arg(kernel_rezone_one_double,10, sizeof(cl_mem), (void *)&dev_state_mem_ptr);
+ ezcl_set_kernel_arg(kernel_rezone_one_double,11, sizeof(cl_mem), (void *)&dev_state_var_new);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_double, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+#else
+ printf("ERROR -- can't have double type for state variable\n");
+ exit(1);
+#endif
+ } else if (memory_item->mem_elsize == 4){
+ cl_mem dev_state_var_new = (cl_mem)gpu_state_memory.memory_malloc(max(old_ncells,new_ncells), sizeof(cl_float), const_cast<char *>("dev_state_var_new"), DEVICE_REGULAR_MEMORY);
+
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 0, sizeof(cl_int), (void *)&old_ncells);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 1, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 2, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 3, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 4, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 5, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 6, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 7, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 8, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_rezone_one_float, 9, sizeof(cl_mem), (void *)&dev_indexoffset);
+ ezcl_set_kernel_arg(kernel_rezone_one_float,10, sizeof(cl_mem), (void *)&dev_state_mem_ptr);
+ ezcl_set_kernel_arg(kernel_rezone_one_float,11, sizeof(cl_mem), (void *)&dev_state_var_new);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_one_float, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+ }
+ }
+
+ if (neighbor_remap & ! parallel) {
+ size_t mem_request = (int)((float)new_ncells*mem_factor);
+ cl_mem dev_nlft_new = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_nrht_new = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_nbot_new = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_ntop_new = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_set_kernel_arg(kernel_neighbor_init, 0, sizeof(cl_int), (void *)&new_ncells);
+ ezcl_set_kernel_arg(kernel_neighbor_init, 1, sizeof(cl_mem), (void *)&dev_nlft_new);
+ ezcl_set_kernel_arg(kernel_neighbor_init, 2, sizeof(cl_mem), (void *)&dev_nrht_new);
+ ezcl_set_kernel_arg(kernel_neighbor_init, 3, sizeof(cl_mem), (void *)&dev_nbot_new);
+ ezcl_set_kernel_arg(kernel_neighbor_init, 4, sizeof(cl_mem), (void *)&dev_ntop_new);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_neighbor_init, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 0, sizeof(cl_int), (void *)&old_ncells);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 1, sizeof(cl_mem), (void *)&dev_mpot);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 2, sizeof(cl_mem), (void *)&dev_indexoffset);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 3, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 4, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 5, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 6, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 7, sizeof(cl_mem), (void *)&dev_celltype_new);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 8, sizeof(cl_mem), (void *)&dev_nlft_new);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 9, sizeof(cl_mem), (void *)&dev_nrht_new);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 10, sizeof(cl_mem), (void *)&dev_nbot_new);
+ ezcl_set_kernel_arg(kernel_rezone_neighbors, 11, sizeof(cl_mem), (void *)&dev_ntop_new);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_rezone_neighbors, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_device_memory_swap(&dev_nlft, &dev_nlft_new);
+ ezcl_device_memory_swap(&dev_nrht, &dev_nrht_new);
+ ezcl_device_memory_swap(&dev_nbot, &dev_nbot_new);
+ ezcl_device_memory_swap(&dev_ntop, &dev_ntop_new);
+
+ ezcl_device_memory_delete(dev_nlft_new);
+ ezcl_device_memory_delete(dev_nrht_new);
+ ezcl_device_memory_delete(dev_nbot_new);
+ ezcl_device_memory_delete(dev_ntop_new);
+ } else {
+ ezcl_device_memory_delete(dev_nlft);
+ ezcl_device_memory_delete(dev_nrht);
+ ezcl_device_memory_delete(dev_nbot);
+ ezcl_device_memory_delete(dev_ntop);
+ dev_nlft = NULL;
+ dev_nrht = NULL;
+ dev_nbot = NULL;
+ dev_ntop = NULL;
+ }
+
+ ezcl_device_memory_delete(dev_indexoffset);
+
+ if (new_ncells != old_ncells){
+ resize_old_device_memory(new_ncells);
+ }
+
+ ezcl_device_memory_swap(&dev_celltype, &dev_celltype_new);
+ ezcl_device_memory_swap(&dev_level, &dev_level_new);
+ ezcl_device_memory_swap(&dev_i, &dev_i_new);
+ ezcl_device_memory_swap(&dev_j, &dev_j_new);
+
+ ezcl_device_memory_delete(dev_mpot);
+ ezcl_device_memory_delete(dev_ijadd);
+ ezcl_device_memory_delete(dev_ioffset);
+
+ ezcl_device_memory_delete(dev_i_new);
+ ezcl_device_memory_delete(dev_j_new);
+ ezcl_device_memory_delete(dev_celltype_new);
+ ezcl_device_memory_delete(dev_level_new);
+
+#ifdef HAVE_MPI
+ if (parallel) {
+ int new_ncells = ncells + add_ncells;
+ MPI_Allgather(&new_ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ noffset=ndispl[mype];
+ ncells_global = ndispl[numpe-1]+nsizes[numpe-1];
+ }
+#endif
+
+ gpu_timers[MESH_TIMER_REZONE_ALL] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+/*
+ * Rebuild the face-neighbor index arrays (nlft, nrht, nbot, ntop) for the
+ * first ncells cells of the mesh.
+ *
+ * Parameters:
+ *   ncells -- number of cells to process; the four neighbor arrays are
+ *             (re)allocated through mesh_memory when they are missing or
+ *             hold fewer than ncells entries.
+ *
+ * Behavior:
+ *   - When do_rezone is false only the MESH_TIMER_CALC_NEIGHBORS timer is
+ *     updated.
+ *   - calc_neighbor_type == HASH_TABLE: cells that still need a lookup are
+ *     written into a compact hash keyed by their (i,j) position scaled to
+ *     level levmx; every unresolved neighbor is then read back, trying the
+ *     finer, same-size, boundary-adjusted and coarser key in that order.
+ *   - calc_neighbor_type == KDTREE: each direction is resolved with a
+ *     single-point KDTree_QueryBoxIntersect probe; a cell keeps itself as
+ *     neighbor unless the probe returns exactly one hit.
+ *   - ncells_ghost is set to ncells at the end of the do_rezone branch.
+ *
+ * OpenMP: allocation, hash teardown and the whole KDTREE path run in
+ * "omp master" blocks, the hash loops are work-shared with "omp for".
+ * NOTE(review): no parallel directive is opened here, so the routine
+ * presumably expects to be entered from an enclosing parallel region --
+ * confirm against the callers.
+ */
+void Mesh::calc_neighbors(int ncells)
+{
+   // Times the whole routine; accumulated into MESH_TIMER_CALC_NEIGHBORS below.
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   if (do_rezone) {
+
+      int flags = INDEX_ARRAY_MEMORY;
+
+#if defined (HAVE_J7)
+      if (parallel) flags |= LOAD_BALANCE_MEMORY;
+#endif
+
+      // Capacity of the neighbor arrays as seen on entry. It is static, so
+      // the value stored inside the master block is the one tested by the
+      // code that follows the barrier.
+      static int nlft_size = 0;
+
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      cpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+      // Refresh the cached capacity on every call. Because nlft_size is
+      // static, a NULL nlft (arrays released since the previous call) must
+      // reset it to zero; otherwise the size left over from an earlier call
+      // could make the test below skip the allocation and the loops further
+      // down would dereference NULL arrays.
+      if (nlft != NULL){
+         nlft_size = mesh_memory.get_memory_size(nlft);
+      } else {
+         nlft_size = 0;
+      }
+
+      // Arrays missing or too small: release any old storage and allocate
+      // all four with ncells entries.
+      if (nlft_size < ncells){
+         if (nlft != NULL){
+            nlft = (int *)mesh_memory.memory_delete(nlft);
+            nrht = (int *)mesh_memory.memory_delete(nrht);
+            nbot = (int *)mesh_memory.memory_delete(nbot);
+            ntop = (int *)mesh_memory.memory_delete(ntop);
+         }
+
+         nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
+         nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
+         nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
+         ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
+      }
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      // Freshly allocated arrays: mark every entry in the range returned by
+      // get_bounds as unresolved (-1). When the arrays were large enough the
+      // existing neighbor values are kept.
+      // NOTE(review): get_bounds presumably yields the calling thread's
+      // share of the cells -- confirm.
+      if (nlft_size < ncells){
+         int lowerBounds, upperBounds;
+         get_bounds(lowerBounds, upperBounds);
+
+         for(int ic=lowerBounds; ic<upperBounds; ic++){
+            nlft[ic] = -1;
+            nrht[ic] = -1;
+            nbot[ic] = -1;
+            ntop[ic] = -1;
+         }
+      }
+
+      if (calc_neighbor_type == HASH_TABLE) {
+
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         // Extent of the hash key space: (imax+1) x (jmax+1) scaled by IPOW2(levmx).
+         int jmaxsize = (jmax+1)*IPOW2(levmx);
+         int imaxsize = (imax+1)*IPOW2(levmx);
+
+         int *hash;
+
+#ifdef _OPENMP
+         hash = compact_hash_init_openmp(ncells, imaxsize, jmaxsize, 0);
+#else
+         hash = compact_hash_init(ncells, imaxsize, jmaxsize, 0);
+#endif
+
+         // Pass 1: write into the hash every cell that has an unresolved
+         // neighbor itself, or whose finer neighbor has an unresolved
+         // secondary neighbor (top of left/right, right of bottom/top).
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for(int ic=0; ic<ncells; ic++){
+            int lev = level[ic];
+
+            bool need_hash = (nlft[ic] == -1 || nrht[ic] == -1 || nbot[ic] == -1 || ntop[ic] == -1) ? true : false;
+
+            // All four neighbors are known here, so they can be used as indices.
+            if (! need_hash){
+               if ( (level[nlft[ic]] > lev && ntop[nlft[ic]] == -1) ||
+                    (level[nrht[ic]] > lev && ntop[nrht[ic]] == -1) ||
+                    (level[nbot[ic]] > lev && nrht[nbot[ic]] == -1) ||
+                    (level[ntop[ic]] > lev && nrht[ntop[ic]] == -1) ) need_hash = true;
+            }
+
+            if (need_hash) {
+               // Key is the cell's (i,j) scaled to the levmx index space.
+               int levmult = IPOW2(levmx-lev);
+               int ii = i[ic]*levmult;
+               int jj = j[ic]*levmult;
+
+               write_hash(ic,jj*imaxsize+ii,hash);
+            }
+         }
+
+         // Only the accumulation is restricted to the master thread; the
+         // timer restart that follows is executed by every thread.
+         if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+            cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         //fprintf(fp,"DEBUG ncells is %lu\n",ncells);
+         // Pass 2: resolve every neighbor that is still negative.
+#ifdef _OPENMP
+#pragma omp for
+#endif
+         for (int ic=0; ic<(int)ncells; ic++){
+            int ii = i[ic];
+            int jj = j[ic];
+            int lev = level[ic];
+            int levmult = IPOW2(levmx-lev);
+            // Keys of this cell and of its same-size neighbors, clamped to
+            // the hash extent.
+            int iicur = ii*levmult;
+            int iilft = max( (ii-1)*levmult, 0 );
+            int iirht = min( (ii+1)*levmult, imaxsize-1);
+            int jjcur = jj*levmult;
+            int jjbot = max( (jj-1)*levmult, 0 );
+            int jjtop = min( (jj+1)*levmult, jmaxsize-1);
+
+            // Start from the stored values; only negative ones are looked up.
+            int nlftval = nlft[ic];
+            int nrhtval = nrht[ic];
+            int nbotval = nbot[ic];
+            int ntopval = ntop[ic];
+
+            // Taking care of boundary cells
+            // Force each boundary cell to point to itself on its boundary direction
+            if (nlftval < 0 && iicur < 1*IPOW2(levmx) ) nlftval = ic;
+            if (nbotval < 0 && jjcur < 1*IPOW2(levmx) ) nbotval = ic;
+            if (nrhtval < 0 && iicur > imax*IPOW2(levmx)-1) nrhtval = ic;
+            if (ntopval < 0 && jjcur > jmax*IPOW2(levmx)-1) ntopval = ic;
+            // Boundary cells next to corner boundary need special checks
+            if (nlftval < 0 && iicur == 1*IPOW2(levmx) && (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nlftval = ic;
+            if (nbotval < 0 && jjcur == 1*IPOW2(levmx) && (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) nbotval = ic;
+            if (nrhtval < 0 && iirht == imax*IPOW2(levmx) && (jjcur < 1*IPOW2(levmx) || jjcur >= jmax*IPOW2(levmx) ) ) nrhtval = ic;
+            if (ntopval < 0 && jjtop == jmax*IPOW2(levmx) && (iicur < 1*IPOW2(levmx) || iicur >= imax*IPOW2(levmx) ) ) ntopval = ic;
+
+            // need to check for finer neighbor first
+            // Right and top neighbor don't change for finer, so drop through to same size
+            // Left and bottom need to be half of same size index for finer test
+            if (lev != levmx) {
+               int iilftfiner = iicur-(iicur-iilft)/2;
+               //int iirhtfiner = (iicur+iirht)/2;
+               int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+               //int jjtopfiner = (jjcur+jjtop)/2;
+               if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilftfiner, hash);
+               if (nbotval < 0) nbotval = read_hash(jjbotfiner*imaxsize+iicur, hash);
+            }
+
+            // same size neighbor
+            if (nlftval < 0) nlftval = read_hash(jjcur*imaxsize+iilft, hash);
+            if (nrhtval < 0) nrhtval = read_hash(jjcur*imaxsize+iirht, hash);
+            if (nbotval < 0) nbotval = read_hash(jjbot*imaxsize+iicur, hash);
+            if (ntopval < 0) ntopval = read_hash(jjtop*imaxsize+iicur, hash);
+
+            // Now we need to take care of special case where bottom and left boundary need adjustment since
+            // expected cell doesn't exist on these boundaries if it is finer than current cell
+            if (lev != levmx) {
+               if (jjcur < 1*IPOW2(levmx)) {
+                  if (nrhtval < 0) {
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     nrhtval = read_hash(jjtopfiner*imaxsize+iirht, hash);
+                  }
+                  if (nlftval < 0) {
+                     int iilftfiner = iicur-(iicur-iilft)/2;
+                     int jjtopfiner = (jjcur+jjtop)/2;
+                     nlftval = read_hash(jjtopfiner*imaxsize+iilftfiner, hash);
+                  }
+               }
+
+               if (iicur < 1*IPOW2(levmx)) {
+                  if (ntopval < 0) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     ntopval = read_hash(jjtop*imaxsize+iirhtfiner, hash);
+                  }
+                  if (nbotval < 0) {
+                     int iirhtfiner = (iicur+iirht)/2;
+                     int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+                     nbotval = read_hash(jjbotfiner*imaxsize+iirhtfiner, hash);
+                  }
+               }
+            }
+
+            // coarser neighbor
+            // The transverse index is rounded down to an even value with
+            // (x/2)*2; left and bottom also step one more cell width away.
+            if (lev != 0){
+               if (nlftval < 0) {
+                  iilft -= iicur-iilft;
+                  int jjlft = (jj/2)*2*levmult;
+                  nlftval = read_hash(jjlft*imaxsize+iilft, hash);
+               }
+               if (nrhtval < 0) {
+                  int jjrht = (jj/2)*2*levmult;
+                  nrhtval = read_hash(jjrht*imaxsize+iirht, hash);
+               }
+               if (nbotval < 0) {
+                  jjbot -= jjcur-jjbot;
+                  int iibot = (ii/2)*2*levmult;
+                  nbotval = read_hash(jjbot*imaxsize+iibot, hash);
+               }
+               if (ntopval < 0) {
+                  int iitop = (ii/2)*2*levmult;
+                  ntopval = read_hash(jjtop*imaxsize+iitop, hash);
+               }
+            }
+
+            nlft[ic] = nlftval;
+            nrht[ic] = nrhtval;
+            nbot[ic] = nbotval;
+            ntop[ic] = ntopval;
+
+            //printf("neighbors[%d] = %d %d %d %d\n",ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+         }
+
+         // Collision reports, hash release and the query timer are handled
+         // by the master thread once all threads have reached the barrier.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         write_hash_collision_report();
+         read_hash_collision_report();
+
+         compact_hash_delete(hash);
+
+         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
+#ifdef _OPENMP
+         } // master block
+#endif
+
+      } else if (calc_neighbor_type == KDTREE) {
+
+         struct timeval tstart_lev2;
+         if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+         // The whole k-d tree path runs on the master thread only.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+         {
+#endif
+         TBounds box;
+         // NOTE(review): the result buffer holds IPOW2(levmx*levmx) entries;
+         // confirm this bounds the number of indices
+         // KDTree_QueryBoxIntersect can write.
+         vector<int> index_list(IPOW2(levmx*levmx) );
+
+         int num;
+
+         ibase = 0;
+         calc_spatial_coordinates(ibase);
+
+         kdtree_setup();
+
+         if (TIMING_LEVEL >= 2) {
+            cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
+            cpu_timer_start(&tstart_lev2);
+         }
+
+         // For each direction the query box is degenerate (min == max), i.e.
+         // a single probe point. The cell defaults to being its own neighbor
+         // and is replaced only when the probe returns exactly one index.
+         for (int ic=0; ic<ncells; ic++) {
+
+            //left
+            nlft[ic] = ic;
+            box.min.x = x[ic]-0.25*dx[ic];
+            box.max.x = x[ic]-0.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nlft[ic]=index_list[0];
+
+            //right
+            nrht[ic] = ic;
+            box.min.x = x[ic]+1.25*dx[ic];
+            box.max.x = x[ic]+1.25*dx[ic];
+            box.min.y = y[ic]+0.25*dy[ic];
+            box.max.y = y[ic]+0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nrht[ic]=index_list[0];
+
+            //bot
+            nbot[ic] = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]-0.25*dy[ic];
+            box.max.y = y[ic]-0.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) nbot[ic]=index_list[0];
+
+            //top
+            ntop[ic] = ic;
+            box.min.x = x[ic]+0.25*dx[ic];
+            box.max.x = x[ic]+0.25*dx[ic];
+            box.min.y = y[ic]+1.25*dy[ic];
+            box.max.y = y[ic]+1.25*dy[ic];
+            KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+            if (num == 1) ntop[ic]=index_list[0];
+         } // End main loop over cells.
+
+         KDTree_Destroy(&tree);
+
+         if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+         }
+#pragma omp barrier
+#endif
+      } // calc_neighbor_type
+
+      // No ghost cells are added by this routine, so the ghost count equals ncells.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+      ncells_ghost = ncells;
+
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
+}
+
+void Mesh::calc_neighbors_local(void)
+{
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ if (do_rezone) {
+
+ int flags = INDEX_ARRAY_MEMORY;
+
+#if defined (HAVE_J7)
+ if (parallel) flags |= LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ cpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+ if (mesh_memory.get_memory_size(nlft) < ncells){
+ if (nlft != NULL) nlft = (int *)mesh_memory.memory_delete(nlft);
+ if (nrht != NULL) nrht = (int *)mesh_memory.memory_delete(nrht);
+ if (nbot != NULL) nbot = (int *)mesh_memory.memory_delete(nbot);
+ if (ntop != NULL) ntop = (int *)mesh_memory.memory_delete(ntop);
+ nlft = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nlft", flags);
+ nrht = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nrht", flags);
+ nbot = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "nbot", flags);
+ ntop = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "ntop", flags);
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ int lowerBound, upperBound;
+ set_bounds(ncells);
+ get_bounds(lowerBound, upperBound);
+ for (int ic = lowerBound; ic < upperBound; ic++){
+ nlft[ic] = -98;
+ nrht[ic] = -98;
+ nbot[ic] = -98;
+ ntop[ic] = -98;
+ }
+
+ if (calc_neighbor_type == HASH_TABLE) {
+
+ struct timeval tstart_lev2;
+ if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+ ncells_ghost = ncells;
+
+ // Find maximum i column and j row for this processor
+ static int jmintile, imintile, jmaxtile, imaxtile;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ jmintile = (jmax+1)*IPOW2(levmx);
+ imintile = (imax+1)*IPOW2(levmx);
+ jmaxtile = 0;
+ imaxtile = 0;
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ int my_jmintile = jmintile;
+ int my_imintile = imintile;
+ int my_jmaxtile = 0;
+ int my_imaxtile = 0;
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(uint ic=0; ic<ncells; ic++){
+ int lev = level[ic];
+// if (lev < 0 || lev > levmx) printf("DEBUG -- cell %d lev %d\n",ic,level[ic]);
+ if ( j[ic] *IPOW2(levmx-lev) < my_jmintile) my_jmintile = j[ic] *IPOW2(levmx-lev) ;
+ if ((j[ic]+1)*IPOW2(levmx-lev)-1 > my_jmaxtile) my_jmaxtile = (j[ic]+1)*IPOW2(levmx-lev)-1;
+ if ( i[ic] *IPOW2(levmx-lev) < my_imintile) my_imintile = i[ic] *IPOW2(levmx-lev) ;
+ if ((i[ic]+1)*IPOW2(levmx-lev)-1 > my_imaxtile) my_imaxtile = (i[ic]+1)*IPOW2(levmx-lev)-1;
+ }
+#ifdef _OPENMP
+#pragma omp critical
+ {
+#endif
+ if (my_jmintile < jmintile) jmintile = my_jmintile;
+ if (my_imintile < imintile) imintile = my_imintile;
+ if (my_jmaxtile > jmaxtile) jmaxtile = my_jmaxtile;
+ if (my_imaxtile > imaxtile) imaxtile = my_imaxtile;
+#ifdef _OPENMP
+ } // end critical region
+#pragma omp barrier
+#endif
+
+ //if (DEBUG) fprintf(fp,"%d: Tile Sizes are imin %d imax %d jmin %d jmax %d\n",mype,imintile,imaxtile,jmintile,jmaxtile);
+
+ // Expand size by 2*coarse_cells for ghost cells
+ int jminsize = max(jmintile-2*IPOW2(levmx),0);
+ int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
+ int iminsize = max(imintile-2*IPOW2(levmx),0);
+ int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
+ //if (DEBUG) fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+
+ //fprintf(fp,"DEBUG -- ncells %lu\n",ncells);
+
+ static int *hash;
+
+#ifdef _OPENMP
+ hash = compact_hash_init_openmp(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
+#else
+ hash = compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, 0);
+#endif
+
+ //printf("%d: DEBUG -- noffset %d cells %d\n",mype,noffset,ncells);
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+
+ static int imaxcalc, jmaxcalc;
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(uint ic=0; ic<ncells; ic++){
+ int cellnumber = ic+noffset;
+ int lev = level[ic];
+ int levmult = IPOW2(levmx-lev);
+ int ii = i[ic]*levmult-iminsize;
+ int jj = j[ic]*levmult-jminsize;
+
+ write_hash(cellnumber, jj*(imaxsize-iminsize)+ii, hash);
+ } // end for loop
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_HASH_SETUP] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ // Set neighbors to global cell numbers from hash
+ jmaxcalc = (jmax+1)*IPOW2(levmx);
+ imaxcalc = (imax+1)*IPOW2(levmx);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++){
+ int ii = i[ic];
+ int jj = j[ic];
+ int lev = level[ic];
+ int levmult = IPOW2(levmx-lev);
+
+ int iicur = ii*levmult-iminsize;
+ int iilft = max( (ii-1)*levmult, 0 )-iminsize;
+ int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+ int jjcur = jj*levmult-jminsize;
+ int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
+ int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+ int nlftval = -1;
+ int nrhtval = -1;
+ int nbotval = -1;
+ int ntopval = -1;
+
+ // Taking care of boundary cells
+ // Force each boundary cell to point to itself on its boundary direction
+ if (iicur < 1*IPOW2(levmx) -iminsize) nlftval = ic+noffset;
+ if (jjcur < 1*IPOW2(levmx) -jminsize) nbotval = ic+noffset;
+ if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = ic+noffset;
+ if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = ic+noffset;
+ // Boundary cells next to corner boundary need special checks
+ if (iicur == 1*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = ic+noffset;
+ if (jjcur == 1*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = ic+noffset;
+ if (iirht == imax*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = ic+noffset;
+ if (jjtop == jmax*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = ic+noffset;
+
+ // need to check for finer neighbor first
+ // Right and top neighbor don't change for finer, so drop through to same size
+ // Left and bottom need to be half of same size index for finer test
+ if (lev != levmx) {
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ if (nlftval < 0) nlftval = read_hash(jjcur *(imaxsize-iminsize)+iilftfiner, hash);
+ if (nbotval < 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+ }
+
+ // same size neighbor
+ if (nlftval < 0) {
+ int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+ if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev) nlftval = nlfttry;
+ }
+ if (nrhtval < 0) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+ if (nbotval < 0) {
+ int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+ if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev) nbotval = nbottry;
+ }
+ if (ntopval < 0) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+
+ // Now we need to take care of special case where bottom and left boundary need adjustment since
+ // expected cell doesn't exist on these boundaries if it is finer than current cell
+ if (lev != levmx) {
+ if (jjcur < 1*IPOW2(levmx)) {
+ if (nrhtval < 0) {
+ int jjtopfiner = (jjcur+jjtop)/2;
+ nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+ }
+ if (nlftval < 0) {
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ int jjtopfiner = (jjcur+jjtop)/2;
+ nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+ }
+ }
+
+ if (iicur < 1*IPOW2(levmx)) {
+ if (ntopval < 0) {
+ int iirhtfiner = (iicur+iirht)/2;
+ ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ if (nbotval < 0) {
+ int iirhtfiner = (iicur+iirht)/2;
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ }
+ }
+
+ // coarser neighbor
+ if (lev != 0){
+ if (nlftval < 0) {
+ iilft -= iicur-iilft;
+ int jjlft = (jj/2)*2*levmult-jminsize;
+ int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+ if (nlfttry >= 0 && nlfttry < (int)ncells && level[nlfttry] == lev-1) nlftval = nlfttry;
+ }
+ if (nrhtval < 0) {
+ int jjrht = (jj/2)*2*levmult-jminsize;
+ int nrhttry = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
+ if (nrhttry >= 0 && nrhttry < (int)ncells && level[nrhttry] == lev-1) nrhtval = nrhttry;
+ }
+ if (nbotval < 0) {
+ jjbot -= jjcur-jjbot;
+ int iibot = (ii/2)*2*levmult-iminsize;
+ int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+ if (nbottry >= 0 && nbottry < (int)ncells && level[nbottry] == lev-1) nbotval = nbottry;
+ }
+ if (ntopval < 0) {
+ int iitop = (ii/2)*2*levmult-iminsize;
+ int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
+ if (ntoptry >= 0 && ntoptry < (int)ncells && level[ntoptry] == lev-1) ntopval = ntoptry;
+ }
+ }
+
+ nlft[ic] = nlftval;
+ nrht[ic] = nrhtval;
+ nbot[ic] = nbotval;
+ ntop[ic] = ntopval;
+
+ //fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ print_local();
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH 0 numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nlft numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nlft[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nrht numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nrht[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nbot numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nbot[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n ntop numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash)-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",ntop[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_HASH_QUERY] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ static int num_comm_partners;
+
+ static vector<int> iminsize_global;
+ static vector<int> imaxsize_global;
+ static vector<int> jminsize_global;
+ static vector<int> jmaxsize_global;
+ static vector<int> comm_partner;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ iminsize_global.resize(numpe);
+ imaxsize_global.resize(numpe);
+ jminsize_global.resize(numpe);
+ jmaxsize_global.resize(numpe);
+ comm_partner.resize(numpe,-1);
+
+ MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+ num_comm_partners = 0;
+ for (int ip = 0; ip < numpe; ip++){
+ if (ip == mype) continue;
+ if (iminsize_global[ip] > imaxtile) continue;
+ if (imaxsize_global[ip] < imintile) continue;
+ if (jminsize_global[ip] > jmaxtile) continue;
+ if (jmaxsize_global[ip] < jmintile) continue;
+ comm_partner[num_comm_partners] = ip;
+ num_comm_partners++;
+ //if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ static vector<int> border_cell;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ border_cell.resize(ncells);
+
+#ifdef BOUNDS_CHECK
+ for (uint ic=0; ic<ncells; ic++){
+ int nl = nlft[ic];
+ if (nl != -1){
+ nl -= noffset;
+ if (nl<0 || nl>= (int)ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+ }
+ int nr = nrht[ic];
+ if (nr != -1){
+ nr -= noffset;
+ if (nr<0 || nr>= (int)ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+ }
+ int nb = nbot[ic];
+ if (nb != -1){
+ nb -= noffset;
+ if (nb<0 || nb>= (int)ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+ }
+ int nt = ntop[ic];
+ if (nt != -1){
+ nt -= noffset;
+ if (nt<0 || nt>= (int)ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+ }
+ }
+#endif
+
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ static vector<int> border_cell_out;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ border_cell_out.resize(ncells);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++){
+ int iborder_cell = 0;
+
+ // left neighbor is undefined -- or -- if left is at finer level check left top for undefined
+ if (nlft[ic] == -1 || (level[nlft[ic]-noffset] > level[ic] && ntop[nlft[ic]-noffset] == -1) ){
+ iborder_cell |= 0x0001;
+ }
+ if (nrht[ic] == -1 || (level[nrht[ic]-noffset] > level[ic] && ntop[nrht[ic]-noffset] == -1) ){
+ iborder_cell |= 0x0002;
+ }
+ if (nbot[ic] == -1 || (level[nbot[ic]-noffset] > level[ic] && nrht[nbot[ic]-noffset] == -1) ) {
+ iborder_cell |= 0x0004;
+ }
+ if (ntop[ic] == -1 || (level[ntop[ic]-noffset] > level[ic] && nrht[ntop[ic]-noffset] == -1) ) {
+ iborder_cell |= 0x0008;
+ }
+
+ border_cell[ic] = iborder_cell;
+ }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells; ic++){
+ int iborder_cell = border_cell[ic];
+
+ if (iborder_cell == 0) {
+
+ int nl = nlft[ic]-noffset;
+ if (nl >= 0 && nl < (int)ncells) {
+ if ((border_cell[nl] & 0x0001) == 0x0001) {
+ iborder_cell |= 0x0016;
+ } else if (level[nl] > level[ic]){
+ int ntl = ntop[nl]-noffset;
+ if (ntl >= 0 && ntl < (int)ncells && (border_cell[ntl] & 0x0001) == 0x0001) {
+ iborder_cell |= 0x0016;
+ }
+ }
+ }
+ int nr = nrht[ic]-noffset;
+ if (nr >= 0 && nr < (int)ncells) {
+ if ((border_cell[nrht[ic]-noffset] & 0x0002) == 0x0002) {
+ iborder_cell |= 0x0032;
+ } else if (level[nr] > level[ic]){
+ int ntr = ntop[nr]-noffset;
+ if (ntr >= 0 && ntr < (int)ncells && (border_cell[ntr] & 0x0002) == 0x0002) {
+ iborder_cell |= 0x0032;
+ }
+ }
+ }
+ int nb = nbot[ic]-noffset;
+ if (nb >= 0 && nb < (int)ncells) {
+ if ((border_cell[nb] & 0x0004) == 0x0004) {
+ iborder_cell |= 0x0064;
+ } else if (level[nb] > level[ic]){
+ int nrb = nrht[nb]-noffset;
+ if (nrb >= 0 && nrb < (int)ncells && (border_cell[nrb] & 0x0004) == 0x0004) {
+ iborder_cell |= 0x0064;
+ }
+ }
+ }
+ int nt = ntop[ic]-noffset;
+ if (nt >= 0 && nt < (int)ncells) {
+ if ((border_cell[nt] & 0x0008) == 0x0008) {
+ iborder_cell |= 0x0128;
+ } else if (level[nt] > level[ic]){
+ int nrt = nrht[nt]-noffset;
+ if (nrt >= 0 && nrt < (int)ncells && (border_cell[nrt] & 0x0008) == 0x0008) {
+ iborder_cell |= 0x0128;
+ }
+ }
+ }
+ }
+
+ border_cell_out[ic] = iborder_cell;
+ }
+// indent offset
+
+ vector<int> border_cell_num;
+
+ static int nbsize_local;
+
+ static vector<int> border_cell_i;
+ static vector<int> border_cell_j;
+ static vector<int> border_cell_level;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for (int ic=0; ic<(int)ncells; ic++){
+ if (border_cell_out[ic] > 0) border_cell_num.push_back(ic+noffset);
+ }
+ //printf("%d: border cell size is %d\n",mype,border_cell_num.size());
+
+ nbsize_local = border_cell_num.size();
+
+ border_cell_i.resize(nbsize_local);
+ border_cell_j.resize(nbsize_local);
+ border_cell_level.resize(nbsize_local);
+
+ for (int ic = 0; ic <nbsize_local; ic++){
+ int cell_num = border_cell_num[ic]-noffset;
+ border_cell_i[ic] = i[cell_num];
+ border_cell_j[ic] = j[cell_num];
+ border_cell_level[ic] = level[cell_num];
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"%d: Border cell size is %d\n",mype,nbsize_local);
+ for (int ib = 0; ib <nbsize_local; ib++){
+ fprintf(fp,"%d: Border cell %d is %d i %d j %d level %d\n",mype,ib,border_cell_num[ib],
+ border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_FIND_BOUNDARY] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ // Allocate push database
+
+ static int **send_database;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ send_database = (int**)malloc(num_comm_partners*sizeof(int *));
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Compute the overlap between processor bounding boxes and set up push database
+
+ static vector<int> send_buffer_count;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ send_buffer_count.resize(num_comm_partners);
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ int icount = 0;
+ for (int ib = 0; ib <nbsize_local; ib++){
+ int lev = border_cell_level[ib];
+ int levmult = IPOW2(levmx-lev);
+ if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] &&
+ border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] &&
+ border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] &&
+ border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
+ // border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+ send_database[ip][icount] = ib;
+ icount++;
+ }
+ }
+ send_buffer_count[ip]=icount;
+ }
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and
+ // send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
+ // It will return receive_count_total for use in allocations
+
+ static int receive_count_total;
+ int i_push_handle = 0;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ i_push_handle = 0;
+ L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
+ send_database, &receive_count_total, &i_push_handle);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
+ for (int ic = 0; ic < send_buffer_count[ip]; ic++){
+ int ib = send_database[ip][ic];
+ fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
+ border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+ }
+ }
+#ifdef _OPENMP
+ }
+#endif
+ }
+
+ // Can now free the send database. The other arrays are vectors and need no explicit
+ // free (those declared static keep their storage across calls)
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ free(send_database[ip]);
+ }
+ free(send_database);
+#ifdef _OPENMP
+ }
+#endif
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_PUSH_SETUP] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ // Push the data needed to the adjacent processors
+ static int *border_cell_num_local;
+ static int *border_cell_i_local;
+ static int *border_cell_j_local;
+ static int *border_cell_level_local;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
+ border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
+ border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
+ border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
+
+ L7_Push_Update(&border_cell_num[0], border_cell_num_local, i_push_handle);
+ L7_Push_Update(&border_cell_i[0], border_cell_i_local, i_push_handle);
+ L7_Push_Update(&border_cell_j[0], border_cell_j_local, i_push_handle);
+ L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
+
+ L7_Push_Free(&i_push_handle);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ nbsize_local = receive_count_total;
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for (int ic = 0; ic < nbsize_local; ic++) {
+ fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
+ border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
+ }
+#ifdef _OPENMP
+ }
+#endif
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_PUSH_BOUNDARY] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_LOCAL_LIST] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering before layer 1\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash));
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+#ifdef _OPENMP
+ }
+#endif
+ }
+
+ vector<int> border_cell_needed_local;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ border_cell_needed_local.resize(nbsize_local, 0);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Layer 1
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for (int ic =0; ic<nbsize_local; ic++){
+ int jj = border_cell_j_local[ic];
+ int ii = border_cell_i_local[ic];
+ int lev = border_cell_level_local[ic];
+ int levmult = IPOW2(levmx-lev);
+
+ int iicur = ii*levmult-iminsize;
+ int iilft = max( (ii-1)*levmult, 0 )-iminsize;
+ int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+ int jjcur = jj*levmult-jminsize;
+ int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
+ int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+ //fprintf(fp,"DEBUG layer ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
+
+ int iborder = 0;
+
+ // Test for cell to left
+ if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 && (jjcur+jjtop)/2 < jmaxsize-jminsize){
+ int nlftval = -1;
+ // Check for finer cell left and bottom side
+ if (lev != levmx){ // finer neighbor
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+ // Also check for finer cell left and top side
+ if (nlftval < 0) {
+ int jjtopfiner = (jjcur+jjtop)/2;
+ nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+ }
+ }
+
+ if (nlftval < 0 && iilft >= 0) { // same size
+ int nlfttry = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+ // we have to test for same level or it could be a finer cell one cell away that it is matching
+ if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev) {
+ nlftval = nlfttry;
+ }
+ }
+
+ if (lev != 0 && nlftval < 0 && iilft-(iicur-iilft) >= 0){ // coarser neighbor
+ iilft -= iicur-iilft;
+ int jjlft = (jj/2)*2*levmult-jminsize;
+ int nlfttry = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+ // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+ if (nlfttry-noffset >= 0 && nlfttry-noffset < (int)ncells && level[nlfttry-noffset] == lev-1) {
+ nlftval = nlfttry;
+ }
+ }
+ if (nlftval >= 0) iborder |= 0x0001;
+ }
+
+ // Test for cell to right
+ if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
+ int nrhtval = -1;
+ // right neighbor -- finer, same size and coarser
+ nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+ // right neighbor -- finer right top test
+ if (nrhtval < 0 && lev != levmx){
+ int jjtopfiner = (jjcur+jjtop)/2;
+ nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+ }
+ if (nrhtval < 0 && lev != 0) { // test for coarser, but not directly right
+ int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
+ if (jjrhtcoarser != jjcur) {
+ int nrhttry = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
+ if (nrhttry-noffset >= 0 && nrhttry-noffset < (int)ncells && level[nrhttry-noffset] == lev-1) {
+ nrhtval = nrhttry;
+ }
+ }
+ }
+ if (nrhtval > 0) iborder |= 0x0002;
+ }
+
+ // Test for cell to bottom
+ if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
+ int nbotval = -1;
+ // Check for finer cell below and left side
+ if (lev != levmx){ // finer neighbor
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+ // Also check for finer cell below and right side
+ if (nbotval < 0) {
+ int iirhtfiner = (iicur+iirht)/2;
+ nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ }
+
+ if (nbotval < 0 && jjbot >= 0) { // same size
+ int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+ // we have to test for same level or it could be a finer cell one cell away that it is matching
+ if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev) {
+ nbotval = nbottry;
+ }
+ }
+
+ if (lev != 0 && nbotval < 0 && jjbot-(jjcur-jjbot) >= 0){ // coarser neighbor
+ jjbot -= jjcur-jjbot;
+ int iibot = (ii/2)*2*levmult-iminsize;
+ int nbottry = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+ // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+ if (nbottry-noffset >= 0 && nbottry-noffset < (int)ncells && level[nbottry-noffset] == lev-1) {
+ nbotval = nbottry;
+ }
+ }
+ if (nbotval >= 0) iborder |= 0x0004;
+ }
+
+ // Test for cell to top
+ if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
+ int ntopval = -1;
+ // top neighbor -- finer, same size and coarser
+ ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+ // top neighbor -- finer top right test
+ if (ntopval < 0 && lev != levmx){
+ int iirhtfiner = (iicur+iirht)/2;
+ ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ if (ntopval < 0 && lev != 0) { // test for coarser, but not directly above
+ int iitopcoarser = (ii/2)*2*levmult-iminsize;
+ if (iitopcoarser != iicur) {
+ int ntoptry = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
+ if (ntoptry-noffset >= 0 && ntoptry-noffset < (int)ncells && level[ntoptry-noffset] == lev-1) {
+ ntopval = ntoptry;
+ }
+ }
+ }
+ if (ntopval > 0) iborder |= 0x0008;
+ }
+
+ if (iborder) border_cell_needed_local[ic] = iborder;
+ }
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] == 0) continue;
+ fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ }
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ }
+
+ // Walk through cell array and set hash to border local index plus ncells+noffset for next pass
+ //fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] == 0) continue;
+ //fprintf(fp,"%d: index %d cell %d i %d j %d\n",mype,ic,border_cell_num_local[ic],border_cell_i_local[ic],border_cell_j_local[ic]);
+ int lev = border_cell_level_local[ic];
+ int levmult = IPOW2(levmx-lev);
+ int ii = border_cell_i_local[ic]*levmult-iminsize;
+ int jj = border_cell_j_local[ic]*levmult-jminsize;
+
+ write_hash(ncells+noffset+ic, jj*(imaxsize-iminsize)+ii, hash);
+ }
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_LAYER1] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ print_local();
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering for 1 layer\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+ }
+
+ // Layer 2
+#ifdef _OPENMP
+#pragma omp master
+ {
+#endif
+ for (int ic =0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] > 0) continue;
+ int jj = border_cell_j_local[ic];
+ int ii = border_cell_i_local[ic];
+ int lev = border_cell_level_local[ic];
+ int levmult = IPOW2(levmx-lev);
+
+ int iicur = ii*levmult-iminsize;
+ int iilft = max( (ii-1)*levmult, 0 )-iminsize;
+ int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+ int jjcur = jj*levmult-jminsize;
+ int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
+ int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+ //fprintf(fp," DEBUG layer2 ic %d num %d i %d j %d lev %d\n",ic,border_cell_num_local[ic],ii,jj,lev);
+
+ int iborder = 0;
+
+ // Test for cell to left
+ if (iicur-(iicur-iilft)/2 >= 0 && iicur-(iicur-iilft)/2 < imaxsize-iminsize && jjcur >= 0 && (jjcur+jjtop)/2 < jmaxsize-jminsize){
+ // Check for finer cell left and bottom side
+ if (lev != levmx){ // finer neighbor
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ int nl = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+ if (nl >= (int)(ncells+noffset) && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+ iborder = 0x0001;
+ } else {
+ // Also check for finer cell left and top side
+ int jjtopfiner = (jjcur+jjtop)/2;
+ int nlt = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+ if ( nlt >= (int)(ncells+noffset) && (border_cell_needed_local[nlt-ncells-noffset] & 0x0001) == 0x0001) {
+ iborder = 0x0001;
+ }
+ }
+ }
+ if ( (iborder & 0x0001) == 0 && iilft >= 0) { //same size
+ int nl = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+ int levcheck = -1;
+ if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
+ levcheck = level[nl-noffset];
+ } else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nl-ncells-noffset];
+ }
+ if (nl >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+ iborder = 0x0001;
+ } else if (lev != 0 && iilft-(iicur-iilft) >= 0){ // coarser neighbor
+ iilft -= iicur-iilft;
+ int jjlft = (jj/2)*2*levmult-jminsize;
+ nl = read_hash(jjlft*(imaxsize-iminsize)+iilft, hash);
+ levcheck = -1;
+ if (nl-noffset >= 0 && nl-noffset < (int)ncells) {
+ levcheck = level[nl-noffset];
+ } else if (nl >= 0 && (int)(nl-ncells-noffset) >= 0 && (int)(nl-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nl-ncells-noffset];
+ }
+ // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+ if (nl >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nl-ncells-noffset] & 0x0001) == 0x0001) {
+ iborder = 0x0001;
+ }
+ }
+ }
+ }
+
+ // Test for cell to right
+ if (iirht < imaxsize-iminsize && iirht >= 0 && jjcur >= 0 && jjtop < jmaxsize-jminsize) {
+ // right neighbor -- finer, same size and coarser
+ int nr = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+ if (nr >= (int)(ncells+noffset) && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
+ iborder = 0x0002;
+ } else if (lev != levmx){
+ // right neighbor -- finer right top test
+ int jjtopfiner = (jjcur+jjtop)/2;
+ int nrt = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+ if (nrt >= (int)(ncells+noffset) && (border_cell_needed_local[nrt-ncells-noffset] & 0x0002) == 0x0002) {
+ iborder = 0x0002;
+ }
+ }
+ if ( (iborder & 0x0002) == 0 && lev != 0) { // test for coarser, but not directly right
+ int jjrhtcoarser = (jj/2)*2*levmult-jminsize;
+ if (jjrhtcoarser != jjcur) {
+ int nr = read_hash(jjrhtcoarser*(imaxsize-iminsize)+iirht, hash);
+ int levcheck = -1;
+ if (nr-noffset >= 0 && nr-noffset < (int)ncells) {
+ levcheck = level[nr-noffset];
+ } else if (nr >= 0 && (int)(nr-ncells-noffset) >= 0 && (int)(nr-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nr-ncells-noffset];
+ }
+ if (nr >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nr-ncells-noffset] & 0x0002) == 0x0002) {
+ iborder = 0x0002;
+ }
+ }
+ }
+ }
+
+ // Test for cell to bottom
+ if (iicur >= 0 && (iicur+iirht)/2 < imaxsize-iminsize && jjcur-(jjcur-jjbot)/2 >= 0 && jjcur-(jjcur-jjbot)/2 < jmaxsize-jminsize){
+ // Check for finer cell below and left side
+ if (lev != levmx){ // finer neighbor
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ int nb = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+ if (nb >= (int)(ncells+noffset) && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+ iborder = 0x0004;
+ } else {
+ // Also check for finer cell below and right side
+ int iirhtfiner = (iicur+iirht)/2;
+ int nbr = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+ if (nbr >= (int)(ncells+noffset) && (border_cell_needed_local[nbr-ncells-noffset] & 0x0004) == 0x0004) {
+ iborder = 0x0004;
+ }
+ }
+ }
+ if ( (iborder & 0x0004) == 0 && jjbot >= 0) { //same size
+ int nb = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+ int levcheck = -1;
+ if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+ levcheck = level[nb-noffset];
+ } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nb-ncells-noffset];
+ }
+ if (nb >= (int)(ncells+noffset) && levcheck == lev && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+ iborder = 0x0004;
+ } else if (lev != 0 && jjbot-(jjcur-jjbot) >= 0){ // coarser neighbor
+ jjbot -= jjcur-jjbot;
+ int iibot = (ii/2)*2*levmult-iminsize;
+ nb = read_hash(jjbot*(imaxsize-iminsize)+iibot, hash);
+ levcheck = -1;
+ if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+ levcheck = level[nb-noffset];
+ } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nb-ncells-noffset];
+ }
+ // we have to test for coarser level or it could be a same size cell one or two cells away that it is matching
+ if (nb >= (int)(ncells+noffset) && levcheck == lev-1 && (border_cell_needed_local[nb-ncells-noffset] & 0x0004) == 0x0004) {
+ iborder = 0x0004;
+ }
+ }
+ }
+ }
+
+ // Test for cell to top
+ if (iirht < imaxsize-iminsize && iicur >= 0 && jjtop >= 0 && jjtop < jmaxsize-jminsize) {
+ // top neighbor -- finer, same size and coarser
+ int nt = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+ if (nt >= (int)(ncells+noffset) && (border_cell_needed_local[nt-ncells-noffset] & 0x0008) == 0x0008) {
+ iborder = 0x0008;
+ } else if (lev != levmx){
+ int iirhtfiner = (iicur+iirht)/2;
+ int ntr = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+ if ( ntr >= (int)(ncells+noffset) && (border_cell_needed_local[ntr-ncells-noffset] & 0x0008) == 0x0008) {
+ iborder = 0x0008;
+ }
+ }
+ if ( (iborder & 0x0008) == 0 && lev != 0) { // test for coarser, but not directly above
+ int iitopcoarser = (ii/2)*2*levmult-iminsize;
+ if (iitopcoarser != iicur) {
+ int nb = read_hash(jjtop*(imaxsize-iminsize)+iitopcoarser, hash);
+ int levcheck = -1;
+ if (nb-noffset >= 0 && nb-noffset < (int)ncells) {
+ levcheck = level[nb-noffset];
+ } else if (nb >= 0 && (int)(nb-ncells-noffset) >= 0 && (int)(nb-ncells-noffset) < nbsize_local) {
+ levcheck = border_cell_level_local[nb-ncells-noffset];
+ }
+ if (nb-noffset >= (int)(ncells-noffset) && levcheck == lev-1 && (border_cell_needed_local[nb-ncells-noffset] & 0x0008) == 0x0008) {
+ iborder = 0x0008;
+ }
+ }
+ }
+ }
+
+ if (iborder) border_cell_needed_local[ic] = iborder |= 0x0016;
+ }
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ vector<int> indices_needed;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ if (DEBUG) {
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] < 0x0016) fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ }
+ }
+
+ int inew = 0;
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] <= 0) continue;
+ indices_needed.push_back(border_cell_num_local[ic]);
+
+ border_cell_num_local[inew] = border_cell_num_local[ic];
+ border_cell_i_local[inew] = border_cell_i_local[ic];
+ border_cell_j_local[inew] = border_cell_j_local[ic];
+ border_cell_level_local[inew] = border_cell_level_local[ic];
+ // border_cell_num_local is not used after -- could be commented out?
+ // border_cell_needed_local[inew] = 1;
+
+ inew++;
+ }
+ nbsize_local = inew;
+
+ free(border_cell_num_local);
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ // Walk through the compacted border cell list and set hash to the negated local ghost index, -(ncells+ic)
+ //fprintf(fp,"%d: DEBUG new hash jminsize %d jmaxsize %d iminsize %d imaxsize %d\n",mype,jminsize,jmaxsize,iminsize,imaxsize);
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(int ic=0; ic<nbsize_local; ic++){
+ int lev = border_cell_level_local[ic];
+ int levmult = IPOW2(levmx-lev);
+
+ int ii = border_cell_i_local[ic]*levmult-iminsize;
+ int jj = border_cell_j_local[ic]*levmult-jminsize;
+
+ write_hash(-(ncells+ic), jj*(imaxsize-iminsize)+ii, hash);
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_LAYER2] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ print_local();
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering for 2 layer\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+#ifdef _OPENMP
+ } // end master region
+#endif
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_LAYER_LIST] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ int nghost = nbsize_local;
+ ncells_ghost = ncells + nghost;
+
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ celltype = (int *)mesh_memory.memory_realloc(ncells_ghost, celltype);
+ i = (int *)mesh_memory.memory_realloc(ncells_ghost, i);
+ j = (int *)mesh_memory.memory_realloc(ncells_ghost, j);
+ level = (int *)mesh_memory.memory_realloc(ncells_ghost, level);
+ nlft = (int *)mesh_memory.memory_realloc(ncells_ghost, nlft);
+ nrht = (int *)mesh_memory.memory_realloc(ncells_ghost, nrht);
+ nbot = (int *)mesh_memory.memory_realloc(ncells_ghost, nbot);
+ ntop = (int *)mesh_memory.memory_realloc(ncells_ghost, ntop);
+ memory_reset_ptrs();
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int ic = ncells; ic < (int)ncells_ghost; ic++){
+ nlft[ic] = -1;
+ nrht[ic] = -1;
+ nbot[ic] = -1;
+ ntop[ic] = -1;
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_COPY_MESH_DATA] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for(int ic=0; ic<nbsize_local; ic++){
+ int ii = border_cell_i_local[ic];
+ int jj = border_cell_j_local[ic];
+ int lev = border_cell_level_local[ic];
+ if (ii < lev_ibegin[lev]) celltype[ncells+ic] = LEFT_BOUNDARY;
+ if (ii > lev_iend[lev]) celltype[ncells+ic] = RIGHT_BOUNDARY;
+ if (jj < lev_jbegin[lev]) celltype[ncells+ic] = BOTTOM_BOUNDARY;
+ if (jj > lev_jend[lev]) celltype[ncells+ic] = TOP_BOUNDARY;
+ i[ncells+ic] = ii;
+ j[ncells+ic] = jj;
+ level[ncells+ic] = lev;
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ free(border_cell_i_local);
+ free(border_cell_j_local);
+ free(border_cell_level_local);
+#ifdef _OPENMP
+ } // end master region
+#endif
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_FILL_MESH_GHOST] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"After copying i,j, level to ghost cells\n");
+ print_local();
+#ifdef _OPENMP
+ } // end master region
+#endif
+ }
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells_ghost; ic++){
+ int ii = i[ic];
+ int jj = j[ic];
+ int lev = level[ic];
+ int levmult = IPOW2(levmx-lev);
+
+ int iicur = ii*levmult-iminsize;
+ int iilft = max( (ii-1)*levmult, 0 )-iminsize;
+ int iirht = min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+ int jjcur = jj*levmult-jminsize;
+ int jjbot = max( (jj-1)*levmult, 0 )-jminsize;
+ int jjtop = min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+
+ //fprintf(fp,"DEBUG neigh ic %d nlft %d ii %d levmult %d iminsize %d icheck %d\n",ic,nlft[ic],ii,levmult,iminsize,(max( ii *levmult-1, 0))-iminsize);
+
+ int nlftval = nlft[ic];
+ int nrhtval = nrht[ic];
+ int nbotval = nbot[ic];
+ int ntopval = ntop[ic];
+
+ if (nlftval == -1){
+ // Taking care of boundary cells
+ // Force each boundary cell to point to itself on its boundary direction
+ if (iicur < 1*IPOW2(levmx) -iminsize) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // Boundary cells next to corner boundary need special checks
+ if (iicur == 1*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // need to check for finer neighbor first
+ // Right and top neighbor don't change for finer, so drop through to same size
+ // Left and bottom need to be half of same size index for finer test
+ if (lev != levmx) {
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ if (nlftval == -1 && iilftfiner >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilftfiner, hash);
+ }
+
+ // same size neighbor
+ if (nlftval == -1 && iilft >= 0) nlftval = read_hash(jjcur*(imaxsize-iminsize)+iilft, hash);
+
+ // Now we need to take care of special case where bottom and left boundary need adjustment since
+ // expected cell doesn't exist on these boundaries if it is finer than current cell
+ if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
+ if (nlftval == -1) {
+ int iilftfiner = iicur-(iicur-iilft)/2;
+ int jjtopfiner = (jjcur+jjtop)/2;
+ if (jjtopfiner < jmaxsize-jminsize && iilftfiner >= 0) nlftval = read_hash(jjtopfiner*(imaxsize-iminsize)+iilftfiner, hash);
+ }
+ }
+
+ // coarser neighbor
+ if (lev != 0){
+ if (nlftval == -1) {
+ int iilftcoarser = iilft - (iicur-iilft);
+ int jjlft = (jj/2)*2*levmult-jminsize;
+ if (iilftcoarser >=0) nlftval = read_hash(jjlft*(imaxsize-iminsize)+iilftcoarser, hash);
+ }
+ }
+
+ if (nlftval != -1) nlft[ic] = nlftval;
+ }
+
+ if (nrhtval == -1) {
+ // Taking care of boundary cells
+ // Force each boundary cell to point to itself on its boundary direction
+ if (iicur > imax*IPOW2(levmx)-1-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // Boundary cells next to corner boundary need special checks
+ if (iirht == imax*IPOW2(levmx)-iminsize && (jjcur < 1*IPOW2(levmx)-jminsize || jjcur >= jmax*IPOW2(levmx)-jminsize ) ) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // same size neighbor
+ if (nrhtval == -1 && iirht < imaxsize-iminsize) nrhtval = read_hash(jjcur*(imaxsize-iminsize)+iirht, hash);
+
+ // Now we need to take care of special case where bottom and left boundary need adjustment since
+ // expected cell doesn't exist on these boundaries if it is finer than current cell
+ if (jjcur < 1*IPOW2(levmx) && lev != levmx) {
+ if (nrhtval == -1) {
+ int jjtopfiner = (jjcur+jjtop)/2;
+ if (jjtopfiner < jmaxsize-jminsize && iirht < imaxsize-iminsize) nrhtval = read_hash(jjtopfiner*(imaxsize-iminsize)+iirht, hash);
+ }
+ }
+
+ // coarser neighbor
+ if (lev != 0){
+ if (nrhtval == -1) {
+ int jjrht = (jj/2)*2*levmult-jminsize;
+ if (iirht < imaxsize-iminsize) nrhtval = read_hash(jjrht*(imaxsize-iminsize)+iirht, hash);
+ }
+ }
+ if (nrhtval != -1) nrht[ic] = nrhtval;
+ }
+
+ if (nbotval == -1) {
+ // Taking care of boundary cells
+ // Force each boundary cell to point to itself on its boundary direction
+ if (jjcur < 1*IPOW2(levmx) -jminsize) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+ // Boundary cells next to corner boundary need special checks
+ if (jjcur == 1*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) nbotval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // need to check for finer neighbor first
+ // Right and top neighbor don't change for finer, so drop through to same size
+ // Left and bottom need to be half of same size index for finer test
+ if (lev != levmx) {
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ if (nbotval == -1 && jjbotfiner >= 0) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iicur, hash);
+ }
+
+ // same size neighbor
+ if (nbotval == -1 && jjbot >=0) nbotval = read_hash(jjbot*(imaxsize-iminsize)+iicur, hash);
+
+ // Now we need to take care of special case where bottom and left boundary need adjustment since
+ // expected cell doesn't exist on these boundaries if it is finer than current cell
+ if (iicur < 1*IPOW2(levmx) && lev != levmx) {
+ if (nbotval == -1) {
+ int iirhtfiner = (iicur+iirht)/2;
+ int jjbotfiner = jjcur-(jjcur-jjbot)/2;
+ if (jjbotfiner >= 0 && iirhtfiner < imaxsize-iminsize) nbotval = read_hash(jjbotfiner*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ }
+
+ // coarser neighbor
+ if (lev != 0){
+ if (nbotval == -1) {
+ int jjbotcoarser = jjbot - (jjcur-jjbot);
+ int iibot = (ii/2)*2*levmult-iminsize;
+ if (jjbotcoarser >= 0 && iibot >= 0) nbotval = read_hash(jjbotcoarser*(imaxsize-iminsize)+iibot, hash);
+ }
+ }
+ if (nbotval != -1) nbot[ic] = nbotval;
+ }
+
+ if (ntopval == -1) {
+ // Taking care of boundary cells
+ // Force each boundary cell to point to itself on its boundary direction
+ if (jjcur > jmax*IPOW2(levmx)-1-jminsize) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+ // Boundary cells next to corner boundary need special checks
+ if (jjtop == jmax*IPOW2(levmx)-jminsize && (iicur < 1*IPOW2(levmx)-iminsize || iicur >= imax*IPOW2(levmx)-iminsize ) ) ntopval = read_hash(jjcur*(imaxsize-iminsize)+iicur, hash);
+
+ // same size neighbor
+ if (ntopval == -1 && jjtop < jmaxsize-jminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iicur, hash);
+
+ if (iicur < 1*IPOW2(levmx)) {
+ if (ntopval == -1) {
+ int iirhtfiner = (iicur+iirht)/2;
+ if (jjtop < jmaxsize-jminsize && iirhtfiner < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iirhtfiner, hash);
+ }
+ }
+
+ // coarser neighbor
+ if (lev != 0){
+ if (ntopval == -1) {
+ int iitop = (ii/2)*2*levmult-iminsize;
+ if (jjtop < jmaxsize-jminsize && iitop < imaxsize-iminsize) ntopval = read_hash(jjtop*(imaxsize-iminsize)+iitop, hash);
+ }
+ }
+ if (ntopval != -1) ntop[ic] = ntopval;
+ }
+
+ //fprintf(fp,"%d: neighbors[%d] = %d %d %d %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"After setting neighbors through ghost cells\n");
+ print_local();
+#ifdef _OPENMP
+ } // end master region
+#endif
+ }
+
+/*
+ // Set neighbors to global cell numbers from hash
+ for (uint ic=0; ic<ncells; ic++){
+ ii = i[ic];
+ jj = j[ic];
+ lev = level[ic];
+ levmult = IPOW2(levmx-lev);
+ //fprintf(fp,"%d:Neighbors input for ic %d ii %d jj %d levmult %d lev %d\n",mype,ic, ii, jj, levmult,lev);
+ //fprintf(fp,"%d:Neighbors befor ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ if (nlft[ic] == -1) nlft[ic] = hash[( jj *levmult )-jminsize][(max( ii *levmult-1, 0 ))-iminsize];
+ if (celltype[ic] == BOTTOM_BOUNDARY && nlft[ic] == -1){
+ if (nlft[ic] == -1) nlft[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
+ }
+ if (nrht[ic] == -1) nrht[ic] = hash[( jj *levmult )-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
+ if (celltype[ic] == BOTTOM_BOUNDARY && nrht[ic] == -1){
+ if (nrht[ic] == -1) nrht[ic] = hash[(jj+1)*levmult-jminsize][(min( (ii+1)*levmult, imaxcalc-1))-iminsize];
+ //if (ic == 3 && mype == 0) printf("DEBUG line %d -- ic %d celltype %d nrht %d\n",__line__,ic,celltype[ic],nrht[ic]);
+ //printf("DEBUG line %d -- ic %d celltype %d nrht %d jj %d ii %d\n",__line__,ic,celltype[ic],nrht[ic],(jj+1)*levmult-jminsize,(min( (ii+1)*levmult, imaxcalc-1))-iminsize);
+ }
+ if (nbot[ic] == -1) nbot[ic] = hash[(max( jj *levmult-1, 0) )-jminsize][( ii *levmult )-iminsize];
+ if (celltype[ic] == LEFT_BOUNDARY && nbot[ic] == -1){
+ if (nbot[ic] == -1) nbot[ic] = hash[(max( jj *levmult-1, 0) )-jminsize][( ii *levmult+1 )-iminsize];
+ }
+ if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult, jmaxcalc-1))-jminsize][( ii *levmult )-iminsize];
+ if (celltype[ic] == LEFT_BOUNDARY && ntop[ic] == -1){
+ if (ntop[ic] == -1) ntop[ic] = hash[(min( (jj+1)*levmult, jmaxcalc-1))-jminsize][( ii *levmult+1 )-iminsize];
+ }
+ //fprintf(fp,"%d:Neighbors after ic %d nlft %d nrht %d nbot %d ntop %d\n",mype,ic,nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ }
+*/
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"After setting corner neighbors\n");
+ print_local();
+#ifdef _OPENMP
+ } // end master region
+#endif
+ }
+
+ // Adjusting neighbors to local indices
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (uint ic=0; ic<ncells_ghost; ic++){
+ //fprintf(fp,"%d: ic %d nlft %d noffset %d ncells %ld\n",mype,ic,nlft[ic],noffset,ncells);
+ if (nlft[ic] <= -(int)ncells && nlft[ic] > -(int)ncells_ghost){
+ nlft[ic] = abs(nlft[ic]);
+ } else if (nlft[ic] >= noffset && nlft[ic] < (int)(noffset+ncells)) {
+ nlft[ic] -= noffset;
+ }
+ if (nrht[ic] <= -(int)ncells && nrht[ic] > -(int)ncells_ghost){
+ nrht[ic] = abs(nrht[ic]);
+ } else if (nrht[ic] >= noffset && nrht[ic] < (int)(noffset+ncells)) {
+ nrht[ic] -= noffset;
+ }
+ if (nbot[ic] <= -(int)ncells && nbot[ic] > -(int)ncells_ghost){
+ nbot[ic] = abs(nbot[ic]);
+ } else if (nbot[ic] >= noffset && nbot[ic] < (int)(noffset+ncells)) {
+ nbot[ic] -= noffset;
+ }
+ if (ntop[ic] <= -(int)ncells && ntop[ic] > -(int)ncells_ghost){
+ ntop[ic] = abs(ntop[ic]);
+ } else if (ntop[ic] >= noffset && ntop[ic] < (int)(noffset+ncells)) {
+ ntop[ic] -= noffset;
+ }
+ }
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ fprintf(fp,"After adjusting neighbors to local indices\n");
+ print_local();
+#ifdef _OPENMP
+ } // end master region
+#endif
+ }
+
+ if (TIMING_LEVEL >= 2) {
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_NEIGH_ADJUST] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
+ offtile_local_count++;
+ offtile_ratio_local /= offtile_local_count;
+ //printf("%d ncells size is %ld ncells_ghost size is %ld nghost %d\n",mype,ncells,ncells_ghost,nghost);
+ //fprintf(fp,"%d ncells_ghost size is %ld nghost %d\n",mype,ncells_ghost,nghost);
+
+ if (cell_handle) L7_Free(&cell_handle);
+ cell_handle=0;
+
+ if (DEBUG) {
+ fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
+ for (int ig = 0; ig<nghost; ig++){
+ fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ig,indices_needed[ig]);
+ }
+ }
+ L7_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
+
+ if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_SETUP_COMM] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+ } // end master region
+#endif
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ print_local();
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nlft numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+ if ( (hashval >= 0 && hashval < (int)ncells) ) {
+ fprintf(fp,"%5d",nlft[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nrht numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if ( ii >= iminsize && ii < imaxsize ) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+ if ( hashval >= 0 && hashval < (int)ncells ) {
+ fprintf(fp,"%5d",nrht[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nbot numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if ( ii >= iminsize && ii < imaxsize ) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+ if ( hashval >= 0 && hashval < (int)ncells ) {
+ fprintf(fp,"%5d",nbot[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n ntop numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if ( ii >= iminsize && ii < imaxsize ) {
+ int hashval = read_hash((jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), hash) -noffset;
+ if ( hashval >= 0 && hashval < (int)ncells ) {
+ fprintf(fp,"%5d",ntop[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+#ifdef _OPENMP
+ } // end master region
+#endif
+ } // end DEBUG
+
+ if (DEBUG) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ print_local();
+
+ for (uint ic=0; ic<ncells; ic++){
+ fprintf(fp,"%d: before update ic %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+ mype,ic,i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ }
+ int ig=0;
+ for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
+ fprintf(fp,"%d: after update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+ mype,ic,indices_needed[ig],i[ic],j[ic],level[ic],nlft[ic],nrht[ic],nbot[ic],ntop[ic]);
+ }
+#ifdef _OPENMP
+ } // end master region
+#endif
+ } // end DEBUG
+
+ } // if numpe > 1
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ write_hash_collision_report();
+ read_hash_collision_report();
+ compact_hash_delete(hash);
+
+#ifdef BOUNDS_CHECK
+ {
+ for (uint ic=0; ic<ncells; ic++){
+ int nl = nlft[ic];
+ if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+ if (level[nl] > level[ic]){
+ int ntl = ntop[nl];
+ if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
+ }
+ int nr = nrht[ic];
+ if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+ if (level[nr] > level[ic]){
+ int ntr = ntop[nr];
+ if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
+ }
+ int nb = nbot[ic];
+ if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+ if (level[nb] > level[ic]){
+ int nrb = nrht[nb];
+ if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
+ }
+ int nt = ntop[ic];
+ if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+ if (level[nt] > level[ic]){
+ int nrt = nrht[nt];
+ if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
+ }
+ }
+ }
+#endif
+
+#ifdef _OPENMP
+ } // end master region
+#pragma omp barrier
+#endif
+
+ } else if (calc_neighbor_type == KDTREE) {
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ struct timeval tstart_lev2;
+ if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+ TBounds box;
+ vector<int> index_list(IPOW2(levmx*levmx) );
+
+ int num;
+
+ ibase = 0;
+ calc_spatial_coordinates(ibase);
+
+ kdtree_setup();
+
+ if (TIMING_LEVEL >= 2) {
+ cpu_timers[MESH_TIMER_KDTREE_SETUP] += cpu_timer_stop(tstart_lev2);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ for (uint ic=0; ic<ncells; ic++) {
+
+ //left
+ nlft[ic] = ic;
+ box.min.x = x[ic]-0.25*dx[ic];
+ box.max.x = x[ic]-0.25*dx[ic];
+ box.min.y = y[ic]+0.25*dy[ic];
+ box.max.y = y[ic]+0.25*dy[ic];
+ KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+ if (num == 1) nlft[ic]=index_list[0];
+
+ //right
+ nrht[ic] = ic;
+ box.min.x = x[ic]+1.25*dx[ic];
+ box.max.x = x[ic]+1.25*dx[ic];
+ box.min.y = y[ic]+0.25*dy[ic];
+ box.max.y = y[ic]+0.25*dy[ic];
+ KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+ if (num == 1) nrht[ic]=index_list[0];
+
+ //bot
+ nbot[ic] = ic;
+ box.min.x = x[ic]+0.25*dx[ic];
+ box.max.x = x[ic]+0.25*dx[ic];
+ box.min.y = y[ic]-0.25*dy[ic];
+ box.max.y = y[ic]-0.25*dy[ic];
+ KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+ if (num == 1) nbot[ic]=index_list[0];
+
+ //top
+ ntop[ic] = ic;
+ box.min.x = x[ic]+0.25*dx[ic];
+ box.max.x = x[ic]+0.25*dx[ic];
+ box.min.y = y[ic]+1.25*dy[ic];
+ box.max.y = y[ic]+1.25*dy[ic];
+ KDTree_QueryBoxIntersect(&tree, &num, &(index_list[0]), &box);
+ if (num == 1) ntop[ic]=index_list[0];
+ } // End main loop over cells.
+
+ KDTree_Destroy(&tree);
+
+ if (TIMING_LEVEL >= 2) cpu_timers[MESH_TIMER_KDTREE_QUERY] += cpu_timer_stop(tstart_lev2);
+
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+ } // calc_neighbor_type
+
+ }
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[MESH_TIMER_CALC_NEIGHBORS] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+// Host-side driver for the OpenCL neighbor calculation.
+//
+// Returns immediately unless gpu_do_rezone is set. Otherwise it
+//   1. (re)allocates dev_nlft/dev_nrht/dev_nbot/dev_ntop and launches
+//      kernel_neighbor_init when dev_nlft is NULL or reports fewer than
+//      ncells elements,
+//   2. creates a device hash with gpu_compact_hash_init using the extents
+//      (imax+1)*IPOW2(levmx) by (jmax+1)*IPOW2(levmx),
+//   3. launches kernel_hash_setup and waits for it to finish,
+//   4. launches kernel_calc_neighbors and waits for it to finish,
+//   5. deletes the hash and adds the elapsed times (scaled by 1.0e9) to gpu_timers.
+void Mesh::gpu_calc_neighbors(void)
+{
+   if (! gpu_do_rezone) return;
+
+   // One (size, address) pair per kernel argument; the position of an entry
+   // in a table is the argument index handed to ezcl_set_kernel_arg.
+   struct KernelArg {
+      size_t size;
+      void  *value;
+   };
+
+   // timer_total covers the whole routine; timer_phase is restarted between phases.
+   struct timeval timer_total;
+   cpu_timer_start(&timer_total);
+
+   struct timeval timer_phase;
+   cpu_timer_start(&timer_phase);
+
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+   // The device buffers read by the kernels below must already exist.
+   assert(dev_levtable);
+   assert(dev_level);
+   assert(dev_i);
+   assert(dev_j);
+
+   size_t nelem_request = (int)((float)ncells*mem_factor);
+
+   // Global size is ncells rounded up to a whole number of work groups.
+   size_t group_size  = MIN(ncells, TILE_SIZE);
+   size_t launch_size = ((ncells + group_size - 1) /group_size) * group_size;
+
+   // The element count is only queried when dev_nlft is non-NULL.
+   bool need_neighbor_arrays = (dev_nlft == NULL);
+   if (! need_neighbor_arrays) {
+      need_neighbor_arrays = (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells);
+   }
+
+   if (need_neighbor_arrays) {
+      dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+      dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &nelem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+      KernelArg init_args[] = {
+         { sizeof(cl_int), (void *)&ncells   },   //  0: cell count
+         { sizeof(cl_mem), (void *)&dev_nlft },   //  1: left neighbor buffer
+         { sizeof(cl_mem), (void *)&dev_nrht },   //  2: right neighbor buffer
+         { sizeof(cl_mem), (void *)&dev_nbot },   //  3: bottom neighbor buffer
+         { sizeof(cl_mem), (void *)&dev_ntop }    //  4: top neighbor buffer
+      };
+      int num_init_args = (int)(sizeof(init_args)/sizeof(init_args[0]));
+      for (int iarg = 0; iarg < num_init_args; iarg++) {
+         ezcl_set_kernel_arg(kernel_neighbor_init, iarg, init_args[iarg].size, init_args[iarg].value);
+      }
+      ezcl_enqueue_ndrange_kernel(queue, kernel_neighbor_init, 1, NULL, &launch_size, &group_size, NULL);
+   }
+
+   int hash_isize = (imax+1)*IPOW2(levmx);
+   int hash_jsize = (jmax+1)*IPOW2(levmx);
+
+   // The hash method stays METHOD_UNSET unless choose_hash_method overrides it.
+   int hash_method;
+   if (choose_hash_method == METHOD_UNSET) {
+      hash_method = METHOD_UNSET;
+   } else {
+      hash_method = choose_hash_method;
+   }
+
+   ulong  hash_table_size = 0;
+   size_t hash_nelem;
+   uint   report_level = 1;
+   cl_mem hash_header_buf = NULL;
+   cl_mem hash_buf = gpu_compact_hash_init(ncells, hash_isize, hash_jsize, hash_method, report_level,
+                                           &hash_table_size, &hash_nelem, &hash_header_buf);
+
+   // ---- Phase 1: hash setup kernel ----
+   KernelArg setup_args[] = {
+      { sizeof(cl_int), (void *)&ncells          },   //  0: cell count
+      { sizeof(cl_int), (void *)&levmx           },   //  1: levmx
+      { sizeof(cl_int), (void *)&hash_isize      },   //  2: hash extent in i
+      { sizeof(cl_mem), (void *)&dev_levtable    },   //  3: level table buffer
+      { sizeof(cl_mem), (void *)&dev_level       },   //  4: level buffer
+      { sizeof(cl_mem), (void *)&dev_i           },   //  5: i index buffer
+      { sizeof(cl_mem), (void *)&dev_j           },   //  6: j index buffer
+      { sizeof(cl_mem), (void *)&dev_nlft        },   //  7: left neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_nrht        },   //  8: right neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_nbot        },   //  9: bottom neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_ntop        },   // 10: top neighbor buffer
+      { sizeof(cl_mem), (void *)&hash_header_buf },   // 11: hash header buffer
+      { sizeof(cl_mem), (void *)&hash_buf        }    // 12: hash buffer
+   };
+   int num_setup_args = (int)(sizeof(setup_args)/sizeof(setup_args[0]));
+   for (int iarg = 0; iarg < num_setup_args; iarg++) {
+      ezcl_set_kernel_arg(kernel_hash_setup, iarg, setup_args[iarg].size, setup_args[iarg].value);
+   }
+
+   cl_event setup_done;
+   ezcl_enqueue_ndrange_kernel(queue, kernel_hash_setup, 1, NULL, &launch_size, &group_size, &setup_done);
+
+   ezcl_wait_for_events(1, &setup_done);
+   ezcl_event_release(setup_done);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(timer_phase)*1.0e9);
+      cpu_timer_start(&timer_phase);
+   }
+
+   // ---- Phase 2: neighbor query kernel ----
+   KernelArg query_args[] = {
+      { sizeof(cl_int), (void *)&ncells          },   //  0: cell count
+      { sizeof(cl_int), (void *)&levmx           },   //  1: levmx
+      { sizeof(cl_int), (void *)&imax            },   //  2: imax
+      { sizeof(cl_int), (void *)&jmax            },   //  3: jmax
+      { sizeof(cl_int), (void *)&hash_isize      },   //  4: hash extent in i
+      { sizeof(cl_int), (void *)&hash_jsize      },   //  5: hash extent in j
+      { sizeof(cl_mem), (void *)&dev_levtable    },   //  6: level table buffer
+      { sizeof(cl_mem), (void *)&dev_level       },   //  7: level buffer
+      { sizeof(cl_mem), (void *)&dev_i           },   //  8: i index buffer
+      { sizeof(cl_mem), (void *)&dev_j           },   //  9: j index buffer
+      { sizeof(cl_mem), (void *)&dev_nlft        },   // 10: left neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_nrht        },   // 11: right neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_nbot        },   // 12: bottom neighbor buffer
+      { sizeof(cl_mem), (void *)&dev_ntop        },   // 13: top neighbor buffer
+      { sizeof(cl_mem), (void *)&hash_header_buf },   // 14: hash header buffer
+      { sizeof(cl_mem), (void *)&hash_buf        }    // 15: hash buffer
+   };
+   int num_query_args = (int)(sizeof(query_args)/sizeof(query_args[0]));
+   for (int iarg = 0; iarg < num_query_args; iarg++) {
+      ezcl_set_kernel_arg(kernel_calc_neighbors, iarg, query_args[iarg].size, query_args[iarg].value);
+   }
+
+   cl_event query_done;
+   ezcl_enqueue_ndrange_kernel(queue, kernel_calc_neighbors, 1, NULL, &launch_size, &group_size, &query_done);
+
+   ezcl_wait_for_events(1, &query_done);
+   ezcl_event_release(query_done);
+
+   // The hash is only needed for the two kernels above.
+   gpu_compact_hash_delete(hash_buf, hash_header_buf);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(timer_phase)*1.0e9);
+   }
+
+   gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(timer_total) * 1.0e9);
+}
+
+
+void Mesh::gpu_calc_neighbors_local(void)
+{
+ if (! gpu_do_rezone) return;
+
+ ulong gpu_hash_table_size = 0;
+
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ struct timeval tstart_lev2;
+ if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+ cl_command_queue command_queue = ezcl_get_command_queue();
+
+ gpu_counters[MESH_COUNTER_CALC_NEIGH]++;
+
+ ncells_ghost = ncells;
+
+ assert(dev_levtable);
+ assert(dev_level);
+ assert(dev_i);
+ assert(dev_j);
+
+ size_t one = 1;
+ cl_mem dev_check = ezcl_malloc(NULL, const_cast<char *>("dev_check"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ size_t mem_request = (int)((float)ncells*mem_factor);
+ dev_nlft = ezcl_malloc(NULL, const_cast<char *>("dev_nlft"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_nrht = ezcl_malloc(NULL, const_cast<char *>("dev_nrht"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_nbot = ezcl_malloc(NULL, const_cast<char *>("dev_nbot"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_ntop = ezcl_malloc(NULL, const_cast<char *>("dev_ntop"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ size_t local_work_size = 64;
+ size_t global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+ size_t block_size = global_work_size/local_work_size;
+
+ //printf("DEBUG file %s line %d lws = %d gws %d bs %d ncells %d\n",__FILE__,__LINE__,
+ // local_work_size, global_work_size, block_size, ncells);
+ cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_sizes = ezcl_malloc(NULL, const_cast<char *>("dev_sizes"), &one, sizeof(cl_int4), CL_MEM_READ_WRITE, 0);
+
+#ifdef BOUNDS_CHECK
+ if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_level) < (int)ncells ){
+ printf("%d: Warning ncells %ld size dev_i %d dev_j %d dev_level %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
+ }
+#endif
+
+ /*
+ __kernel void calc_hash_size_cl(
+ const int ncells, // 0
+ const int levmx, // 1
+ __global int *levtable, // 2
+ __global int *level, // 3
+ __global int *i, // 4
+ __global int *j, // 5
+ __global int4 *redscratch, // 6
+ __global int4 *sizes, // 7
+ __local int4 *tile) // 8
+ */
+
+ ezcl_set_kernel_arg(kernel_hash_size, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_hash_size, 1, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_hash_size, 2, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_hash_size, 3, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_hash_size, 4, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_hash_size, 5, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_hash_size, 6, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_hash_size, 7, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_hash_size, 8, local_work_size*sizeof(cl_int4), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_size, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ if (block_size > 1) {
+ /*
+ __kernel void finish_reduction_minmax4_cl(
+ const int isize, // 0
+ __global int4 *redscratch, // 1
+ __global int4 *sizes, // 2
+ __local int4 *tile) // 3
+ */
+ ezcl_set_kernel_arg(kernel_finish_hash_size, 0, sizeof(cl_int), (void *)&block_size);
+ ezcl_set_kernel_arg(kernel_finish_hash_size, 1, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_finish_hash_size, 2, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_finish_hash_size, 3, local_work_size*sizeof(cl_int4), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_hash_size, 1, NULL, &local_work_size, &local_work_size, NULL);
+ }
+
+ ezcl_device_memory_delete(dev_redscratch);
+
+ cl_int sizes[4];
+ ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
+
+ int imintile = sizes[0];
+ int imaxtile = sizes[1];
+ int jmintile = sizes[2];
+ int jmaxtile = sizes[3];
+
+ // Expand size by 2*coarse_cells for ghost cells
+ // TODO: May want to get fancier here and calc based on cell level
+ int jminsize = max(jmintile-2*IPOW2(levmx),0);
+ int jmaxsize = min(jmaxtile+2*IPOW2(levmx),(jmax+1)*IPOW2(levmx));
+ int iminsize = max(imintile-2*IPOW2(levmx),0);
+ int imaxsize = min(imaxtile+2*IPOW2(levmx),(imax+1)*IPOW2(levmx));
+ //fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize,imaxsize,jminsize,jmaxsize);
+
+ //ezcl_enqueue_write_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes, NULL);
+
+ int gpu_hash_method = METHOD_UNSET;
+// allow input.c to control hash types and methods
+ if (choose_hash_method != METHOD_UNSET) gpu_hash_method = choose_hash_method;
+//=========
+
+ size_t hashsize;
+
+ uint hash_report_level = 1;
+ cl_mem dev_hash_header = NULL;
+ cl_mem dev_hash = gpu_compact_hash_init(ncells, imaxsize-iminsize, jmaxsize-jminsize, gpu_hash_method, hash_report_level, &gpu_hash_table_size, &hashsize, &dev_hash_header);
+
+ int csize = corners_i.size();
+#ifdef BOUNDS_CHECK
+ for (int ic=0; ic<csize; ic++){
+ if (corners_i[ic] >= iminsize) continue;
+ if (corners_j[ic] >= jminsize) continue;
+ if (corners_i[ic] < imaxsize) continue;
+ if (corners_j[ic] < jmaxsize) continue;
+ if ( (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) < 0 ||
+ (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize) > (int)hashsize){
+ printf("%d: Warning corners i %d j %d hash %d\n",mype,corners_i[ic],corners_j[ic],
+ (corners_j[ic]-jminsize)*(imaxsize-iminsize)+(corners_i[ic]-iminsize));
+ }
+ }
+#endif
+
+ size_t corners_local_work_size = MIN(csize, TILE_SIZE);
+ size_t corners_global_work_size = ((csize+corners_local_work_size - 1) /corners_local_work_size) * corners_local_work_size;
+
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 0, sizeof(cl_int), (void *)&csize);
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 1, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 2, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 3, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 4, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_hash_adjust_sizes, 5, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_adjust_sizes, 1, NULL, &corners_global_work_size, &corners_local_work_size, NULL);
+
+ if (DEBUG){
+ vector<int> sizes_tmp(4);
+ ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
+ int iminsize_tmp = sizes_tmp[0];
+ int imaxsize_tmp = sizes_tmp[1];
+ int jminsize_tmp = sizes_tmp[2];
+ int jmaxsize_tmp = sizes_tmp[3];
+ fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
+ }
+
+ local_work_size = 128;
+ global_work_size = ((ncells + local_work_size - 1) /local_work_size) * local_work_size;
+
+#ifdef BOUNDS_CHECK
+ {
+ vector<int> i_tmp(ncells);
+ vector<int> j_tmp(ncells);
+ vector<int> level_tmp(ncells);
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+ for (int ic=0; ic<(int)ncells; ic++){
+ int lev = level_tmp[ic];
+ for ( int jj = j_tmp[ic]*IPOW2(levmx-lev)-jminsize; jj < (j_tmp[ic]+1)*IPOW2(levmx-lev)-jminsize; jj++) {
+ for (int ii = i_tmp[ic]*IPOW2(levmx-lev)-iminsize; ii < (i_tmp[ic]+1)*IPOW2(levmx-lev)-iminsize; ii++) {
+ if (jj < 0 || jj >= (jmaxsize-jminsize) || ii < 0 || ii >= (imaxsize-iminsize) ) {
+ printf("%d: Warning ncell %d writes to hash out-of-bounds at line %d ii %d jj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,ic,__LINE__,ii,jj,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+ }
+ }
+ }
+ }
+#endif
+
+ //printf("%d: lws %d gws %d \n",mype,local_work_size,global_work_size);
+ cl_event hash_setup_local_event;
+
+ /*
+ const int isize, // 0
+ const int levmx, // 1
+ const int imax, // 2
+ const int jmax, // 3
+ const int noffset, // 4
+ __global int *sizes, // 5
+ __global int *levtable, // 6
+ __global int *level, // 7
+ __global int *i, // 8
+ __global int *j, // 9
+ __global const ulong *hash_header, // 10
+ __global int *hash) // 11
+ */
+
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 1, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 2, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 3, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 4, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 7, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 8, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 9, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 10, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_hash_setup_local, 11, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_hash_setup_local, 1, NULL, &global_work_size, &local_work_size, &hash_setup_local_event);
+
+ ezcl_wait_for_events(1, &hash_setup_local_event);
+ ezcl_event_release(hash_setup_local_event);
+
+ if (DEBUG){
+ vector<int> sizes_tmp(4);
+ ezcl_enqueue_read_buffer(command_queue, dev_sizes, CL_TRUE, 0, 1*sizeof(cl_int4), &sizes_tmp[0], NULL);
+ int iminsize_tmp = sizes_tmp[0];
+ int imaxsize_tmp = sizes_tmp[1];
+ int jminsize_tmp = sizes_tmp[2];
+ int jmaxsize_tmp = sizes_tmp[3];
+ fprintf(fp,"%d: Sizes are imin %d imax %d jmin %d jmax %d\n",mype,iminsize_tmp,imaxsize_tmp,jminsize_tmp,jmaxsize_tmp);
+ }
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_HASH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+#ifdef BOUNDS_CHECK
+ {
+ if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_i) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_j) < (int)ncells ||
+ ezcl_get_device_mem_nelements(dev_level) < (int)ncells ) {
+ printf("%d: Warning -- sizes for dev_neigh too small ncells %ld neigh %d %d %d %d %d %d %d\n",mype,ncells,ezcl_get_device_mem_nelements(dev_nlft),ezcl_get_device_mem_nelements(dev_nrht),ezcl_get_device_mem_nelements(dev_nbot),ezcl_get_device_mem_nelements(dev_ntop),ezcl_get_device_mem_nelements(dev_i),ezcl_get_device_mem_nelements(dev_j),ezcl_get_device_mem_nelements(dev_level));
+ }
+ vector<int> level_tmp(ncells);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+ int iflag = 0;
+ for (int ic=0; ic<ncells; ic++){
+ if (levmx-level_tmp[ic] < 0 || levmx-level_tmp[ic] > levmx) {
+ printf("%d: Warning level value bad ic %d level %d ncells %d\n",mype,ic,level_tmp[ic],ncells);
+ iflag++;
+ }
+ }
+ if (ezcl_get_device_mem_nelements(dev_levtable) < levmx+1) printf("%d Warning levtable too small levmx is %d devtable size is %d\n",mype,levmx,ezcl_get_device_mem_nelements(dev_levtable));
+#ifdef HAVE_MPI
+ if (iflag > 20) {fflush(stdout); L7_Terminate(); exit(0);}
+#endif
+ }
+#endif
+
+#ifdef BOUNDS_CHECK
+ {
+ int jmaxcalc = (jmax+1)*IPOW2(levmx);
+ int imaxcalc = (imax+1)*IPOW2(levmx);
+ vector<int> i_tmp(ncells);
+ vector<int> j_tmp(ncells);
+ vector<int> level_tmp(ncells);
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells*sizeof(cl_int), &i_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells*sizeof(cl_int), &j_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells*sizeof(cl_int), &level_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+ for (int ic=0; ic<(int)ncells; ic++){
+ int ii = i_tmp[ic];
+ int jj = j_tmp[ic];
+ int lev = level_tmp[ic];
+ int levmult = IPOW2(levmx-lev);
+ int jjj=jj *levmult-jminsize;
+ int iii=max( ii *levmult-1, 0 )-iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ jjj=jj *levmult-jminsize;
+ iii=min( (ii+1)*levmult, imaxcalc-1)-iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ jjj=max( jj *levmult-1, 0) -jminsize;
+ iii=ii *levmult -iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ jjj=min( (jj+1)*levmult, jmaxcalc-1)-jminsize;
+ iii=ii *levmult -iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ int nlftval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((max( ii *levmult-1, 0 ))-iminsize)];
+ int nrhtval = hash_tmp[(( jj *levmult )-jminsize)*(imaxsize-iminsize)+((min( (ii+1)*levmult, imaxcalc-1))-iminsize)];
+ int nbotval = hash_tmp[((max( jj *levmult-1, 0) )-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
+ int ntopval = hash_tmp[((min( (jj+1)*levmult, jmaxcalc-1))-jminsize)*(imaxsize-iminsize)+(( ii *levmult )-iminsize)];
+
+ if (nlftval == INT_MIN){
+ jjj = jj*levmult-jminsize;
+ iii = ii*levmult-iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+ if (nrhtval == INT_MIN){
+ jjj = jj*levmult-jminsize;
+ iii = ii*levmult-iminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+ if (nbotval == INT_MIN) {
+ iii = ii*levmult-iminsize;
+ jjj = jj*levmult-jminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+ if (ntopval == INT_MIN) {
+ iii = ii*levmult-iminsize;
+ jjj = jj*levmult-jminsize;
+ if (jjj < 0 || jjj >= (jmaxsize-jminsize) || iii < 0 || iii >= (imaxsize-iminsize) ) printf("%d: Warning at line %d iii %d jjj %d iminsize %d imaxsize %d jminsize %d jmaxsize %d\n",mype,__LINE__,iii,jjj,iminsize,imaxsize,jminsize,jmaxsize);
+ }
+ }
+ }
+#endif
+
+ cl_event calc_neighbors_local_event;
+
+ /*
+ const int isize, // 0
+ const int levmx, // 1
+ const int imaxsize, // 2
+ const int jmaxsize, // 3
+ const int noffset, // 4
+ __global int *sizes, // 5
+ __global int *levtable, // 6
+ __global int *level, // 7
+ __global int *i, // 8
+ __global int *j, // 9
+ __global int *nlft, // 10
+ __global int *nrht, // 11
+ __global int *nbot, // 12
+ __global int *ntop, // 13
+ __global const ulong *hash_header, // 14
+ __global int *hash) // 15
+ */
+
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 1, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 2, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 3, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 4, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 8, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 9, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 10, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 11, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 12, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 13, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 14, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_calc_neighbors_local, 15, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_neighbors_local, 1, NULL, &global_work_size, &local_work_size, &calc_neighbors_local_event);
+
+ ezcl_wait_for_events(1, &calc_neighbors_local_event);
+ ezcl_event_release(calc_neighbors_local_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_HASH_QUERY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+ print_dev_local();
+
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+ cl_mem dev_hash_header_check = gpu_get_hash_header();
+ vector<ulong> hash_header_check(hash_header_size);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+ int gpu_hash_method = (int)hash_header_check[0];
+ ulong gpu_hash_table_size = hash_header_check[1];
+ ulong gpu_AA = hash_header_check[2];
+ ulong gpu_BB = hash_header_check[3];
+
+ vector<int> nlft_tmp(ncells_ghost);
+ vector<int> nrht_tmp(ncells_ghost);
+ vector<int> nbot_tmp(ncells_ghost);
+ vector<int> ntop_tmp(ncells_ghost);
+ ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH 0 numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nlft numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nlft_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nrht numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nrht_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nbot numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",nbot_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n ntop numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0])-noffset;
+ if (hashval >= 0 && hashval < (int)ncells) {
+ fprintf(fp,"%5d",ntop_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+ }
+
+#ifdef HAVE_MPI
+ if (numpe > 1) {
+ vector<int> iminsize_global(numpe);
+ vector<int> imaxsize_global(numpe);
+ vector<int> jminsize_global(numpe);
+ vector<int> jmaxsize_global(numpe);
+ vector<int> comm_partner(numpe,-1);
+
+ MPI_Allgather(&iminsize, 1, MPI_INT, &iminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&imaxsize, 1, MPI_INT, &imaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&jminsize, 1, MPI_INT, &jminsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+ MPI_Allgather(&jmaxsize, 1, MPI_INT, &jmaxsize_global[0], 1, MPI_INT, MPI_COMM_WORLD);
+
+ int num_comm_partners = 0;
+ for (int ip = 0; ip < numpe; ip++){
+ if (ip == mype) continue;
+ if (iminsize_global[ip] > imaxtile) continue;
+ if (imaxsize_global[ip] < imintile) continue;
+ if (jminsize_global[ip] > jmaxtile) continue;
+ if (jmaxsize_global[ip] < jmintile) continue;
+ comm_partner[num_comm_partners] = ip;
+ num_comm_partners++;
+ //if (DEBUG) fprintf(fp,"%d: overlap with processor %d bounding box is %d %d %d %d\n",mype,ip,iminsize_global[ip],imaxsize_global[ip],jminsize_global[ip],jmaxsize_global[ip]);
+ }
+
+#ifdef BOUNDS_CHECK
+ {
+ vector<int> nlft_tmp(ncells_ghost);
+ vector<int> nrht_tmp(ncells_ghost);
+ vector<int> nbot_tmp(ncells_ghost);
+ vector<int> ntop_tmp(ncells_ghost);
+ ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells*sizeof(cl_int), &nlft_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells*sizeof(cl_int), &nrht_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells*sizeof(cl_int), &nbot_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells*sizeof(cl_int), &ntop_tmp[0], NULL);
+ for (uint ic=0; ic<ncells; ic++){
+ int nl = nlft_tmp[ic];
+ if (nl != -1){
+ nl -= noffset;
+ if (nl<0 || nl>= ncells) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+ }
+ int nr = nrht_tmp[ic];
+ if (nr != -1){
+ nr -= noffset;
+ if (nr<0 || nr>= ncells) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+ }
+ int nb = nbot_tmp[ic];
+ if (nb != -1){
+ nb -= noffset;
+ if (nb<0 || nb>= ncells) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+ }
+ int nt = ntop_tmp[ic];
+ if (nt != -1){
+ nt -= noffset;
+ if (nt<0 || nt>= ncells) printf("%d: Warning at line %d cell %d ntop %d\n",mype,__LINE__,ic,nt);
+ }
+ }
+ }
+#endif
+
+ cl_mem dev_border_cell = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell1"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 1, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 2, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 3, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 4, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 5, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 6, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_calc_border_cells, 7, sizeof(cl_mem), (void *)&dev_border_cell);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ cl_mem dev_border_cell_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell2"), &ncells, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ size_t one = 1;
+ cl_mem dev_nbsize = ezcl_malloc(NULL, const_cast<char *>("dev_nbsize"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_uint), CL_MEM_READ_WRITE, 0);
+
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 1, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 2, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 3, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 4, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 5, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 6, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 7, sizeof(cl_mem), (void *)&dev_border_cell);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 8, sizeof(cl_mem), (void *)&dev_border_cell_new);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 9, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 10, sizeof(cl_mem), (void *)&dev_nbsize);
+ ezcl_set_kernel_arg(kernel_calc_border_cells2, 11, local_work_size*sizeof(cl_int), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_border_cells2, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_device_memory_swap(&dev_border_cell, &dev_border_cell_new);
+ ezcl_device_memory_delete(dev_border_cell_new);
+
+ int group_size = (int)(global_work_size/local_work_size);
+
+ ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
+ ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbsize);
+ ezcl_set_kernel_arg(kernel_finish_scan, 3, local_work_size*sizeof(cl_int), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &local_work_size, &local_work_size, NULL);
+
+ int nbsize_local;
+ ezcl_enqueue_read_buffer(command_queue, dev_nbsize, CL_TRUE, 0, 1*sizeof(cl_int), &nbsize_local, NULL);
+ ezcl_device_memory_delete(dev_nbsize);
+
+ //printf("%d: border cell size is %d global is %ld\n",mype,nbsize_local,nbsize_global);
+
+ vector<int> border_cell_num(nbsize_local);
+ vector<int> border_cell_i(nbsize_local);
+ vector<int> border_cell_j(nbsize_local);
+ vector<int> border_cell_level(nbsize_local);
+
+ // allocate new border memory
+ size_t nbsize_long = nbsize_local;
+ cl_mem dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_set_kernel_arg(kernel_get_border_data, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_get_border_data, 1, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_get_border_data, 2, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_get_border_data, 3, sizeof(cl_mem), (void *)&dev_border_cell);
+ ezcl_set_kernel_arg(kernel_get_border_data, 4, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_get_border_data, 5, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_get_border_data, 6, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_get_border_data, 7, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_get_border_data, 8, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_get_border_data, 9, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_get_border_data, 10, sizeof(cl_mem), (void *)&dev_border_cell_num);
+ ezcl_set_kernel_arg(kernel_get_border_data, 11, local_work_size*sizeof(cl_uint), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_device_memory_delete(dev_ioffset);
+ ezcl_device_memory_delete(dev_border_cell);
+
+ // read gpu border cell data
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_level, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_level[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_num, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_num[0], NULL);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_FIND_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ // Allocate push database
+
+ int **send_database = (int**)malloc(num_comm_partners*sizeof(int *));
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ send_database[ip] = (int *)malloc(nbsize_local*sizeof(int));
+ }
+
+ // Compute the overlap between processor bounding boxes and set up push database
+
+ vector<int> send_buffer_count(num_comm_partners);
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ int icount = 0;
+ for (int ib = 0; ib <nbsize_local; ib++){
+ int lev = border_cell_level[ib];
+ int levmult = IPOW2(levmx-lev);
+ if (border_cell_i[ib]*levmult >= iminsize_global[comm_partner[ip]] &&
+ border_cell_i[ib]*levmult <= imaxsize_global[comm_partner[ip]] &&
+ border_cell_j[ib]*levmult >= jminsize_global[comm_partner[ip]] &&
+ border_cell_j[ib]*levmult <= jmaxsize_global[comm_partner[ip]] ) {
+ send_database[ip][icount] = ib;
+ icount++;
+ }
+ }
+ send_buffer_count[ip]=icount;
+ }
+
+ // Initialize L7_Push_Setup with num_comm_partners, comm_partner, send_database and
+ // send_buffer_count. L7_Push_Setup will copy data and determine recv_buffer_counts.
+ // It will return receive_count_total for use in allocations
+
+ int receive_count_total;
+ int i_push_handle = 0;
+ L7_Push_Setup(num_comm_partners, &comm_partner[0], &send_buffer_count[0],
+ send_database, &receive_count_total, &i_push_handle);
+
+ if (DEBUG) {
+ fprintf(fp,"DEBUG num_comm_partners %d\n",num_comm_partners);
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ fprintf(fp,"DEBUG comm partner is %d data count is %d\n",comm_partner[ip],send_buffer_count[ip]);
+ for (int ic = 0; ic < send_buffer_count[ip]; ic++){
+ int ib = send_database[ip][ic];
+ fprintf(fp,"DEBUG \t index %d cell number %d i %d j %d level %d\n",ib,border_cell_num[ib],
+ border_cell_i[ib],border_cell_j[ib],border_cell_level[ib]);
+ }
+ }
+ }
+
+ // Can now free the send database. Other arrays are vectors and will automatically
+ // deallocate
+
+ for (int ip = 0; ip < num_comm_partners; ip++){
+ free(send_database[ip]);
+ }
+ free(send_database);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_PUSH_SETUP] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+ // Push the data needed to the adjacent processors
+
+ int *border_cell_num_local = (int *)malloc(receive_count_total*sizeof(int));
+ int *border_cell_i_local = (int *)malloc(receive_count_total*sizeof(int));
+ int *border_cell_j_local = (int *)malloc(receive_count_total*sizeof(int));
+ int *border_cell_level_local = (int *)malloc(receive_count_total*sizeof(int));
+ L7_Push_Update(&border_cell_num[0], border_cell_num_local, i_push_handle);
+ L7_Push_Update(&border_cell_i[0], border_cell_i_local, i_push_handle);
+ L7_Push_Update(&border_cell_j[0], border_cell_j_local, i_push_handle);
+ L7_Push_Update(&border_cell_level[0], border_cell_level_local, i_push_handle);
+
+ L7_Push_Free(&i_push_handle);
+
+ ezcl_device_memory_delete(dev_border_cell_i);
+ ezcl_device_memory_delete(dev_border_cell_j);
+ ezcl_device_memory_delete(dev_border_cell_level);
+ ezcl_device_memory_delete(dev_border_cell_num);
+
+ nbsize_local = receive_count_total;
+
+ if (DEBUG) {
+ for (int ic = 0; ic < nbsize_local; ic++) {
+ fprintf(fp,"%d: Local Border cell %d is %d i %d j %d level %d\n",mype,ic,border_cell_num_local[ic],
+ border_cell_i_local[ic],border_cell_j_local[ic],border_cell_level_local[ic]);
+ }
+ }
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_PUSH_BOUNDARY] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ nbsize_long = nbsize_local;
+
+ dev_border_cell_num = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_num"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_border_cell_i = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_border_cell_j = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_border_cell_level = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_needed = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_needed_out = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_needed_out"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_enqueue_write_buffer(command_queue, dev_border_cell_num, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_num_local[0], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_border_cell_i, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_i_local[0], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_border_cell_j, CL_FALSE, 0, nbsize_local*sizeof(cl_int), &border_cell_j_local[0], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_border_cell_level, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_level_local[0], NULL);
+
+ //ezcl_enqueue_write_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
+
+ free(border_cell_i_local);
+ free(border_cell_j_local);
+ free(border_cell_level_local);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_LOCAL_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+ cl_mem dev_hash_header_check = gpu_get_hash_header();
+ vector<ulong> hash_header_check(hash_header_size);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+ int gpu_hash_method = (int)hash_header_check[0];
+ ulong gpu_hash_table_size = hash_header_check[1];
+ ulong gpu_AA = hash_header_check[2];
+ ulong gpu_BB = hash_header_check[3];
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering before layer 1\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+ }
+
+ size_t nb_local_work_size = 128;
+ size_t nb_global_work_size = ((nbsize_local + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
+
+ ezcl_set_kernel_arg(kernel_calc_layer1, 0, sizeof(cl_int), (void *)&nbsize_local);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 2, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 3, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 4, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 5, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 6, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 7, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 8, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 13, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_calc_layer1, 14, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
+
+ if (DEBUG){
+ vector<int> border_cell_needed_local(nbsize_local);
+
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
+
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] == 0) continue;
+ fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ }
+ }
+
+ cl_event calc_layer1_sethash_event;
+
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 0, sizeof(cl_int), (void *)&nbsize_local);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 2, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 3, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 4, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 5, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 9, sizeof(cl_mem), (void *)&dev_border_cell_needed);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 10, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_calc_layer1_sethash, 11, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer1_sethash, 1, NULL, &nb_global_work_size, &nb_local_work_size, &calc_layer1_sethash_event);
+
+ ezcl_wait_for_events(1, &calc_layer1_sethash_event);
+ ezcl_event_release(calc_layer1_sethash_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_LAYER1] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG) {
+ print_dev_local();
+
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+ cl_mem dev_hash_header_check = gpu_get_hash_header();
+ vector<ulong> hash_header_check(hash_header_size);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+ int gpu_hash_method = (int)hash_header_check[0];
+ ulong gpu_hash_table_size = hash_header_check[1];
+ ulong gpu_AA = hash_header_check[2];
+ ulong gpu_BB = hash_header_check[3];
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering for 1 layer\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+ }
+
+ group_size = (int)(nb_global_work_size/nb_local_work_size);
+
+ cl_mem dev_nbpacked = ezcl_malloc(NULL, const_cast<char *>("dev_nbpacked"), &one, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ size_t group_size_long = group_size;
+ dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &group_size_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_set_kernel_arg(kernel_calc_layer2, 0, sizeof(cl_int), (void *)&nbsize_local);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 2, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 3, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 4, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 5, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 6, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 7, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 8, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 9, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 10, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 11, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 12, sizeof(cl_mem), (void *)&dev_border_cell_needed);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 13, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 14, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 15, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 16, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 17, sizeof(cl_mem), (void *)&dev_nbpacked);
+ ezcl_set_kernel_arg(kernel_calc_layer2, 18, nb_local_work_size*sizeof(cl_mem), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2, 1, NULL, &nb_global_work_size, &nb_local_work_size, NULL);
+
+ if (DEBUG){
+ vector<int> border_cell_needed_local(nbsize_local);
+
+ ezcl_enqueue_read_buffer(command_queue, dev_border_cell_needed_out, CL_TRUE, 0, nbsize_local*sizeof(cl_int), &border_cell_needed_local[0], NULL);
+ for(int ic=0; ic<nbsize_local; ic++){
+ if (border_cell_needed_local[ic] <= 0) continue;
+ if (border_cell_needed_local[ic] < 0x0016) fprintf(fp,"%d: First set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ if (border_cell_needed_local[ic] >= 0x0016) fprintf(fp,"%d: Second set of needed cells ic %3d cell %3d type %3d\n",mype,ic,border_cell_num_local[ic],border_cell_needed_local[ic]);
+ }
+ }
+
+ free(border_cell_num_local);
+
+ ezcl_device_memory_delete(dev_border_cell_needed);
+
+ ezcl_set_kernel_arg(kernel_finish_scan, 0, sizeof(cl_int), (void *)&group_size);
+ ezcl_set_kernel_arg(kernel_finish_scan, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_finish_scan, 2, sizeof(cl_mem), (void *)&dev_nbpacked);
+ ezcl_set_kernel_arg(kernel_finish_scan, 3, nb_local_work_size*sizeof(cl_int), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_finish_scan, 1, NULL, &nb_local_work_size, &nb_local_work_size, NULL);
+
+ int nbpacked;
+ ezcl_enqueue_read_buffer(command_queue, dev_nbpacked, CL_TRUE, 0, 1*sizeof(cl_int), &nbpacked, NULL);
+ ezcl_device_memory_delete(dev_nbpacked);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_LAYER2] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ nbsize_long = nbsize_local;
+ cl_mem dev_border_cell_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_i_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_j_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_border_cell_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_border_cell_level_new"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_indices_needed = ezcl_malloc(NULL, const_cast<char *>("dev_indices_needed"), &nbsize_long, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ cl_event get_border_data2_event;
+
+ ezcl_set_kernel_arg(kernel_get_border_data2, 0, sizeof(cl_int), (void *)&nbsize_local);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 2, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 3, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 4, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 5, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 6, sizeof(cl_mem), (void *)&dev_border_cell_num);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 7, sizeof(cl_mem), (void *)&dev_border_cell_i_new);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 8, sizeof(cl_mem), (void *)&dev_border_cell_j_new);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 9, sizeof(cl_mem), (void *)&dev_border_cell_level_new);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 10, sizeof(cl_mem), (void *)&dev_indices_needed);
+ ezcl_set_kernel_arg(kernel_get_border_data2, 11, local_work_size*sizeof(cl_uint), NULL);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_get_border_data2, 1, NULL, &nb_global_work_size, &nb_local_work_size, &get_border_data2_event);
+
+ ezcl_device_memory_delete(dev_border_cell_num);
+
+ ezcl_device_memory_swap(&dev_border_cell_i, &dev_border_cell_i_new);
+ ezcl_device_memory_swap(&dev_border_cell_j, &dev_border_cell_j_new);
+ ezcl_device_memory_swap(&dev_border_cell_level, &dev_border_cell_level_new);
+
+ size_t nbp_local_work_size = 128;
+ size_t nbp_global_work_size = ((nbpacked + nbp_local_work_size - 1) /nbp_local_work_size) * nbp_local_work_size;
+
+ cl_event calc_layer2_sethash_event;
+
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 0, sizeof(cl_int), (void *)&nbpacked);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 2, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 3, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 4, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 5, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 6, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 7, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 8, sizeof(cl_mem), (void *)&dev_levibeg);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 9, sizeof(cl_mem), (void *)&dev_leviend);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 10, sizeof(cl_mem), (void *)&dev_levjbeg);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 11, sizeof(cl_mem), (void *)&dev_levjend);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 12, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 13, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 14, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 15, sizeof(cl_mem), (void *)&dev_indices_needed);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 16, sizeof(cl_mem), (void *)&dev_border_cell_needed_out);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 17, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_calc_layer2_sethash, 18, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_layer2_sethash, 1, NULL, &nbp_global_work_size, &nbp_local_work_size, &calc_layer2_sethash_event);
+
+ ezcl_wait_for_events(1, &calc_layer2_sethash_event);
+ ezcl_event_release(calc_layer2_sethash_event);
+
+ ezcl_device_memory_delete(dev_ioffset);
+
+ ezcl_wait_for_events(1, &get_border_data2_event);
+ ezcl_event_release(get_border_data2_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_LAYER_LIST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ vector<int> indices_needed(nbpacked);
+
+ // read gpu border cell data
+ ezcl_enqueue_read_buffer(command_queue, dev_indices_needed, CL_TRUE, 0, nbpacked*sizeof(cl_int), &indices_needed[0], NULL);
+
+ ezcl_device_memory_delete(dev_border_cell_i_new);
+ ezcl_device_memory_delete(dev_border_cell_j_new);
+ ezcl_device_memory_delete(dev_border_cell_level_new);
+
+ if (DEBUG) {
+ print_dev_local();
+
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_TRUE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+ cl_mem dev_hash_header_check = gpu_get_hash_header();
+ vector<ulong> hash_header_check(hash_header_size);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+ int gpu_hash_method = (int)hash_header_check[0];
+ ulong gpu_hash_table_size = hash_header_check[1];
+ ulong gpu_AA = hash_header_check[2];
+ ulong gpu_BB = hash_header_check[3];
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering for 2 layer\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+ fflush(fp);
+ }
+
+ ezcl_device_memory_delete(dev_border_cell_needed_out);
+
+ int nghost = nbpacked;
+ ncells_ghost = ncells + nghost;
+
+ //if (mype == 1) printf("%d: DEBUG before expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+ if (ezcl_get_device_mem_capacity(dev_celltype) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_i) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_j) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_level) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_nlft) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_nrht) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_nbot) < ncells_ghost ||
+ ezcl_get_device_mem_capacity(dev_ntop) < ncells_ghost ) {
+
+ //if (mype == 0) printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+ //printf("%d: DEBUG expanding memory ncells %ld ncells_ghost %ld capacity %ld\n",mype,ncells,ncells_ghost,ezcl_get_device_mem_capacity(dev_i));
+ mem_factor = (float)(ncells_ghost/ncells);
+ cl_mem dev_celltype_old = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_i_old = ezcl_malloc(NULL, const_cast<char *>("dev_i_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_j_old = ezcl_malloc(NULL, const_cast<char *>("dev_j_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_level_old = ezcl_malloc(NULL, const_cast<char *>("dev_level_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_nlft_old = ezcl_malloc(NULL, const_cast<char *>("dev_nlft_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_nrht_old = ezcl_malloc(NULL, const_cast<char *>("dev_nrht_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_nbot_old = ezcl_malloc(NULL, const_cast<char *>("dev_nbot_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_ntop_old = ezcl_malloc(NULL, const_cast<char *>("dev_ntop_old"), &ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_device_memory_swap(&dev_celltype_old, &dev_celltype);
+ ezcl_device_memory_swap(&dev_i_old, &dev_i );
+ ezcl_device_memory_swap(&dev_j_old, &dev_j );
+ ezcl_device_memory_swap(&dev_level_old, &dev_level );
+ ezcl_device_memory_swap(&dev_nlft_old, &dev_nlft );
+ ezcl_device_memory_swap(&dev_nrht_old, &dev_nrht );
+ ezcl_device_memory_swap(&dev_nbot_old, &dev_nbot );
+ ezcl_device_memory_swap(&dev_ntop_old, &dev_ntop );
+
+ cl_event copy_mesh_data_event;
+
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 1, sizeof(cl_mem), (void *)&dev_celltype_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 2, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 3, sizeof(cl_mem), (void *)&dev_i_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 4, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 5, sizeof(cl_mem), (void *)&dev_j_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 6, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 7, sizeof(cl_mem), (void *)&dev_level_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 8, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 9, sizeof(cl_mem), (void *)&dev_nlft_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 10, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 11, sizeof(cl_mem), (void *)&dev_nrht_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 12, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 13, sizeof(cl_mem), (void *)&dev_nbot_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 14, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 15, sizeof(cl_mem), (void *)&dev_ntop_old);
+ ezcl_set_kernel_arg(kernel_copy_mesh_data, 16, sizeof(cl_mem), (void *)&dev_ntop);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_mesh_data, 1, NULL, &global_work_size, &local_work_size, &copy_mesh_data_event);
+
+ ezcl_device_memory_delete(dev_celltype_old);
+ ezcl_device_memory_delete(dev_i_old);
+ ezcl_device_memory_delete(dev_j_old);
+ ezcl_device_memory_delete(dev_level_old);
+ ezcl_device_memory_delete(dev_nlft_old);
+ ezcl_device_memory_delete(dev_nrht_old);
+ ezcl_device_memory_delete(dev_nbot_old);
+ ezcl_device_memory_delete(dev_ntop_old);
+
+ ezcl_wait_for_events(1, &copy_mesh_data_event);
+ ezcl_event_release(copy_mesh_data_event);
+ }
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_COPY_MESH_DATA] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ nb_global_work_size = ((nbpacked + nb_local_work_size - 1) /nb_local_work_size) * nb_local_work_size;
+
+#ifdef BOUNDS_CHECK
+ if (ezcl_get_device_mem_nelements(dev_i) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_j) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_level) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_celltype) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+ printf("DEBUG size issue at %d\n",__LINE__);
+ }
+ if (ezcl_get_device_mem_nelements(dev_border_cell_i) < nbpacked ||
+ ezcl_get_device_mem_nelements(dev_border_cell_j) < nbpacked ||
+ ezcl_get_device_mem_nelements(dev_border_cell_level) < nbpacked ){
+ printf("DEBUG size issue at %d\n",__LINE__);
+ }
+#endif
+
+ cl_event fill_mesh_ghost_event;
+
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 0, sizeof(cl_int), (void *)&nbpacked);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 2, sizeof(cl_mem), (void *)&dev_levibeg);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 3, sizeof(cl_mem), (void *)&dev_leviend);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 4, sizeof(cl_mem), (void *)&dev_levjbeg);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 5, sizeof(cl_mem), (void *)&dev_levjend);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 6, sizeof(cl_mem), (void *)&dev_border_cell_i);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 7, sizeof(cl_mem), (void *)&dev_border_cell_j);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 8, sizeof(cl_mem), (void *)&dev_border_cell_level);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 9, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 10, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 11, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 12, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 13, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 14, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 15, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_fill_mesh_ghost, 16, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_mesh_ghost, 1, NULL, &nb_global_work_size, &nb_local_work_size, &fill_mesh_ghost_event);
+
+ ezcl_wait_for_events(1, &fill_mesh_ghost_event);
+ ezcl_event_release(fill_mesh_ghost_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_FILL_MESH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG){
+ fprintf(fp,"After copying i,j, level to ghost cells\n");
+ print_dev_local();
+ }
+
+ ezcl_device_memory_delete(dev_border_cell_i);
+ ezcl_device_memory_delete(dev_border_cell_j);
+ ezcl_device_memory_delete(dev_border_cell_level);
+
+ size_t ghost_local_work_size = 128;
+ size_t ghost_global_work_size = ((ncells_ghost + ghost_local_work_size - 1) /ghost_local_work_size) * ghost_local_work_size;
+
+ cl_event fill_neighbor_ghost_event;
+
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 0, sizeof(cl_int), (void *)&ncells_ghost);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 1, sizeof(cl_int), (void *)&levmx);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 2, sizeof(cl_int), (void *)&imax);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 3, sizeof(cl_int), (void *)&jmax);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 4, sizeof(cl_mem), (void *)&dev_sizes);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 5, sizeof(cl_mem), (void *)&dev_levtable);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 6, sizeof(cl_mem), (void *)&dev_i);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 7, sizeof(cl_mem), (void *)&dev_j);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 8, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 9, sizeof(cl_mem), (void *)&dev_hash_header);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 10, sizeof(cl_mem), (void *)&dev_hash);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 11, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 12, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 13, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_fill_neighbor_ghost, 14, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_fill_neighbor_ghost, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &fill_neighbor_ghost_event);
+
+ ezcl_wait_for_events(1, &fill_neighbor_ghost_event);
+ ezcl_event_release(fill_neighbor_ghost_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_FILL_NEIGH_GHOST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG){
+ fprintf(fp,"After setting neighbors through ghost cells\n");
+ print_dev_local();
+ }
+
+#ifdef BOUNDS_CHECK
+ if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+ printf("%d: Warning sizes for set_corner_neighbor not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
+ }
+#endif
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_SET_CORNER_NEIGH] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ if (DEBUG){
+ fprintf(fp,"After setting corner neighbors\n");
+ print_dev_local();
+ }
+
+#ifdef BOUNDS_CHECK
+ if (ezcl_get_device_mem_nelements(dev_nlft) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nrht) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_nbot) < (int)ncells_ghost ||
+ ezcl_get_device_mem_nelements(dev_ntop) < (int)ncells_ghost ){
+ printf("%d: Warning sizes for adjust neighbors not right ncells ghost %d nlft size %d\n",mype,ncells_ghost,ezcl_get_device_mem_nelements(dev_nlft));
+ }
+ if (ezcl_get_device_mem_nelements(dev_indices_needed) < (int)(ncells_ghost-ncells) ){
+ printf("%d: Warning indices size wrong nghost %d size indices_needed\n",mype,ncells_ghost-ncells,ezcl_get_device_mem_nelements(dev_indices_needed));
+ }
+#endif
+
+ cl_event adjust_neighbors_local_event;
+
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 0, sizeof(cl_int), (void *)&ncells_ghost);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 1, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 2, sizeof(cl_int), (void *)&noffset);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 3, sizeof(cl_mem), (void *)&dev_indices_needed);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 4, sizeof(cl_mem), (void *)&dev_nlft);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 5, sizeof(cl_mem), (void *)&dev_nrht);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 6, sizeof(cl_mem), (void *)&dev_nbot);
+ ezcl_set_kernel_arg(kernel_adjust_neighbors_local, 7, sizeof(cl_mem), (void *)&dev_ntop);
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_adjust_neighbors_local, 1, NULL, &ghost_global_work_size, &ghost_local_work_size, &adjust_neighbors_local_event);
+
+ ezcl_device_memory_delete(dev_indices_needed);
+
+ if (DEBUG){
+ fprintf(fp,"After adjusting neighbors to local indices\n");
+ print_dev_local();
+ }
+
+ ezcl_wait_for_events(1, &adjust_neighbors_local_event);
+ ezcl_event_release(adjust_neighbors_local_event);
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_NEIGH_ADJUST] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ cpu_timer_start(&tstart_lev2);
+ }
+
+ offtile_ratio_local = (offtile_ratio_local*(double)offtile_local_count) + ((double)nghost / (double)ncells);
+ offtile_local_count++;
+ offtile_ratio_local /= offtile_local_count;
+
+ if (cell_handle) L7_Free(&cell_handle);
+ cell_handle=0;
+
+ if (DEBUG){
+ fprintf(fp,"%d: SETUP ncells %ld noffset %d nghost %d\n",mype,ncells,noffset,nghost);
+ for (int ic=0; ic<nghost; ic++){
+ fprintf(fp,"%d: indices needed ic %d index %d\n",mype,ic,indices_needed[ic]);
+ }
+ }
+
+ L7_Dev_Setup(0, noffset, ncells, &indices_needed[0], nghost, &cell_handle);
+
+#ifdef BOUNDS_CHECK
+ {
+ vector<int> nlft_tmp(ncells_ghost);
+ vector<int> nrht_tmp(ncells_ghost);
+ vector<int> nbot_tmp(ncells_ghost);
+ vector<int> ntop_tmp(ncells_ghost);
+ vector<int> level_tmp(ncells_ghost);
+ vector<real_t> H_tmp(ncells_ghost);
+ ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+ for (uint ic=0; ic<ncells; ic++){
+ int nl = nlft_tmp[ic];
+ if (nl<0 || nl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nlft %d\n",mype,__LINE__,ic,nl);
+ if (level_tmp[nl] > level_tmp[ic]){
+ int ntl = ntop_tmp[nl];
+ if (ntl<0 || ntl>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mype,__LINE__,ic,ic+noffset,nl,ntl);
+ }
+ int nr = nrht_tmp[ic];
+ if (nr<0 || nr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht %d\n",mype,__LINE__,ic,nr);
+ if (level_tmp[nr] > level_tmp[ic]){
+ int ntr = ntop_tmp[nr];
+ if (ntr<0 || ntr>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mype,__LINE__,ic,ntr);
+ }
+ int nb = nbot_tmp[ic];
+ if (nb<0 || nb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nbot %d\n",mype,__LINE__,ic,nb);
+ if (level_tmp[nb] > level_tmp[ic]){
+ int nrb = nrht_tmp[nb];
+ if (nrb<0 || nrb>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mype,__LINE__,ic,nrb);
+ }
+ int nt = ntop_tmp[ic];
+ if (nt<0 || nt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d global %d ntop %d ncells %ld ncells_ghost %ld\n",mype,__LINE__,ic,ic+noffset,nt,ncells,ncells_ghost);
+ if (level_tmp[nt] > level_tmp[ic]){
+ int nrt = nrht_tmp[nt];
+ if (nrt<0 || nrt>= (int)ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mype,__LINE__,ic,nrt);
+ }
+ }
+ }
+#endif
+
+ if (TIMING_LEVEL >= 2) {
+ gpu_timers[MESH_TIMER_SETUP_COMM] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+ }
+
+ if (DEBUG) {
+ print_dev_local();
+
+ vector<int> hash_tmp(hashsize);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash, CL_FALSE, 0, hashsize*sizeof(cl_int), &hash_tmp[0], NULL);
+
+ cl_mem dev_hash_header_check = gpu_get_hash_header();
+ vector<ulong> hash_header_check(hash_header_size);
+ ezcl_enqueue_read_buffer(command_queue, dev_hash_header_check, CL_TRUE, 0, hash_header_size*sizeof(cl_ulong), &hash_header_check[0], NULL);
+
+ int gpu_hash_method = (int)hash_header_check[0];
+ ulong gpu_hash_table_size = hash_header_check[1];
+ ulong gpu_AA = hash_header_check[2];
+ ulong gpu_BB = hash_header_check[3];
+
+ vector<int> nlft_tmp(ncells_ghost);
+ vector<int> nrht_tmp(ncells_ghost);
+ vector<int> nbot_tmp(ncells_ghost);
+ vector<int> ntop_tmp(ncells_ghost);
+ ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+ int jmaxglobal = (jmax+1)*IPOW2(levmx);
+ int imaxglobal = (imax+1)*IPOW2(levmx);
+ fprintf(fp,"\n HASH numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ if (ii >= iminsize && ii < imaxsize) {
+ fprintf(fp,"%5d",read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) );
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nlft numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+ if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+ fprintf(fp,"%5d",nlft_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nrht numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+ if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+ fprintf(fp,"%5d",nrht_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n nbot numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+ if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+ fprintf(fp,"%5d",nbot_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+
+ fprintf(fp,"\n ntop numbering\n");
+ for (int jj = jmaxglobal-1; jj>=0; jj--){
+ fprintf(fp,"%2d: %4d:",mype,jj);
+ if (jj >= jminsize && jj < jmaxsize) {
+ for (int ii = 0; ii<imaxglobal; ii++){
+ int hashval = read_dev_hash(gpu_hash_method, gpu_hash_table_size, gpu_AA, gpu_BB, (jj-jminsize)*(imaxsize-iminsize)+(ii-iminsize), &hash_tmp[0]) -noffset;
+ if ( (ii >= iminsize && ii < imaxsize) && (hashval >= 0 && hashval < (int)ncells) ) {
+ fprintf(fp,"%5d",ntop_tmp[hashval]);
+ } else {
+ fprintf(fp," ");
+ }
+ }
+ }
+ fprintf(fp,"\n");
+ }
+ fprintf(fp,"%2d: ",mype);
+ for (int ii = 0; ii<imaxglobal; ii++){
+ fprintf(fp,"%4d:",ii);
+ }
+ fprintf(fp,"\n");
+ }
+
+ if (DEBUG) {
+ print_dev_local();
+
+ vector<int> i_tmp(ncells_ghost);
+ vector<int> j_tmp(ncells_ghost);
+ vector<int> level_tmp(ncells_ghost);
+ vector<int> nlft_tmp(ncells_ghost);
+ vector<int> nrht_tmp(ncells_ghost);
+ vector<int> nbot_tmp(ncells_ghost);
+ vector<int> ntop_tmp(ncells_ghost);
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &i_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &j_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nlft, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nlft_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nrht, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nrht_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_nbot, CL_FALSE, 0, ncells_ghost*sizeof(cl_int), &nbot_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_ntop, CL_TRUE, 0, ncells_ghost*sizeof(cl_int), &ntop_tmp[0], NULL);
+
+ for (uint ic=0; ic<ncells; ic++){
+ fprintf(fp,"%d: before update ic %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+ mype,ic,i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
+ }
+ int ig=0;
+ for (uint ic=ncells; ic<ncells_ghost; ic++, ig++){
+ fprintf(fp,"%d: after update ic %d off %d i %d j %d lev %d nlft %d nrht %d nbot %d ntop %d\n",
+ mype,ic,indices_needed[ig],i_tmp[ic],j_tmp[ic],level_tmp[ic],nlft_tmp[ic],nrht_tmp[ic],nbot_tmp[ic],ntop_tmp[ic]);
+ }
+ }
+ }
+#endif
+
+ ezcl_device_memory_delete(dev_sizes);
+ ezcl_device_memory_delete(dev_check);
+
+ gpu_compact_hash_delete(dev_hash, dev_hash_header);
+
+ gpu_timers[MESH_TIMER_CALC_NEIGHBORS] += (long)(cpu_timer_stop(tstart_cpu) * 1.0e9);
+}
+#endif
+
+// Print to stdout which neighbor-finding method is selected by
+// calc_neighbor_type.
+//
+// HASH_TABLE: when mype == 0 the hash-table banner is printed, and when in
+// addition numpe == 1 final_hash_collision_report() is called.
+// Anything else: every caller prints a size estimate, and when mype == 0 the
+// k-D tree banner is printed as well.
+void Mesh::print_calc_neighbor_type(void)
+{
+   if ( calc_neighbor_type == HASH_TABLE ) {
+      if (mype == 0) printf("Using hash tables to calculate neighbors\n");
+      if (mype == 0 && numpe == 1) final_hash_collision_report();
+   } else {
+      // The product includes sizeof(int), so it has an unsigned size type.
+      // Convert it explicitly so the argument really is the long that %ld
+      // expects instead of relying on size_t and long sharing a representation.
+      long size_estimate = (long)(ncells*(int)log((double)ncells)*sizeof(int));
+      printf("hash table size %ld\n",size_estimate);
+      if (mype == 0) printf("Using k-D tree to calculate neighbors\n");
+   }
+}
+
+// Accessor: returns the current value of calc_neighbor_type.
+int Mesh::get_calc_neighbor_type(void)
+{
+   int neighbor_type = calc_neighbor_type;
+   return neighbor_type;
+}
+
+// Classify the first ncells cells as REAL_CELL or one of the four boundary
+// types, (re)allocating the celltype array first when it is missing or holds
+// fewer than ncells entries.
+//
+// With OpenMP enabled the allocation is done by the master thread between two
+// barriers and the classification loop is a work-shared "omp for"; presumably
+// the caller is already inside a parallel region -- confirm at the call site.
+//
+// ncells: number of cells to classify (shadows the member of the same name).
+void Mesh::calc_celltype_threaded(size_t ncells)
+{
+   int flags = 0;
+#ifdef HAVE_J7
+   if (parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+#endif
+   {
+      // The size is only queried when an allocation already exists
+      bool too_small = (celltype != NULL) && (mesh_memory.get_memory_size(celltype) < ncells);
+      bool missing = (celltype == NULL);
+
+      if (too_small) {
+         celltype = (int *)mesh_memory.memory_delete(celltype);
+      }
+      if (missing || too_small) {
+         celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", flags);
+      }
+   }
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp for
+#endif
+   for (uint cell=0; cell<ncells; ++cell) {
+      // Later tests override earlier ones: top wins over bottom, right, left
+      celltype[cell] = REAL_CELL;
+      if (is_left_boundary(cell)) {
+         celltype[cell] = LEFT_BOUNDARY;
+      }
+      if (is_right_boundary(cell)) {
+         celltype[cell] = RIGHT_BOUNDARY;
+      }
+      if (is_bottom_boundary(cell)) {
+         celltype[cell] = BOTTOM_BOUNDARY;
+      }
+      if (is_top_boundary(cell)) {
+         celltype[cell] = TOP_BOUNDARY;
+      }
+   }
+}
+
+// Classify the first ncells cells as REAL_CELL or one of the four boundary
+// types, (re)allocating the celltype array first when it is missing or holds
+// fewer than ncells entries. With OpenMP enabled the classification loop runs
+// as its own "omp parallel for".
+//
+// ncells: number of cells to classify (shadows the member of the same name).
+void Mesh::calc_celltype(size_t ncells)
+{
+   int flags = 0;
+#ifdef HAVE_J7
+   if (parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+   // The size is only queried when an allocation already exists
+   bool too_small = (celltype != NULL) && (mesh_memory.get_memory_size(celltype) < ncells);
+   bool missing = (celltype == NULL);
+
+   if (too_small) {
+      celltype = (int *)mesh_memory.memory_delete(celltype);
+   }
+   if (missing || too_small) {
+      celltype = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "celltype", flags);
+   }
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+   for (uint cell=0; cell<ncells; ++cell) {
+      // Later tests override earlier ones: top wins over bottom, right, left
+      celltype[cell] = REAL_CELL;
+      if (is_left_boundary(cell)) {
+         celltype[cell] = LEFT_BOUNDARY;
+      }
+      if (is_right_boundary(cell)) {
+         celltype[cell] = RIGHT_BOUNDARY;
+      }
+      if (is_bottom_boundary(cell)) {
+         celltype[cell] = BOTTOM_BOUNDARY;
+      }
+      if (is_top_boundary(cell)) {
+         celltype[cell] = TOP_BOUNDARY;
+      }
+   }
+}
+
+// For every cell, look up the cell found at its mirrored position through the
+// origin (dsym), with x negated (xsym) and with y negated (ysym).
+//
+// The probe point is (x+0.5*dx, y+0.5*dy) with the relevant coordinates
+// negated; it is passed to the k-D tree as a degenerate box (min == max).
+// An entry is overwritten only when the query reports exactly one hit,
+// otherwise the cell stays mapped to itself.
+//
+// dsym, xsym, ysym: output arrays, written at indices 0..ncells-1; they are
+//                   not resized here.
+void Mesh::calc_symmetry(vector<int> &dsym, vector<int> &xsym, vector<int> &ysym)
+{
+   TBounds probe;
+   // NOTE(review): the result buffer holds IPOW2(levmx*levmx) entries; confirm
+   // that this bounds what KDTree_QueryBoxIntersect may write.
+   vector<int> hits( IPOW2(levmx*levmx) );
+   int nhits;
+
+   for (uint cell=0; cell<ncells; cell++) {
+      // Default: every cell is its own mirror image
+      dsym[cell] = cell;
+      xsym[cell] = cell;
+      ysym[cell] = cell;
+
+      // Diagonal symmetry: both coordinates negated
+      probe.min.x = -(x[cell]+0.5*dx[cell]);
+      probe.min.y = -(y[cell]+0.5*dy[cell]);
+      probe.max.x = probe.min.x;
+      probe.max.y = probe.min.y;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &probe);
+      if (nhits == 1) {
+         dsym[cell] = hits[0];
+      }
+
+      // x symmetry: only x negated
+      probe.min.x = -(x[cell]+0.5*dx[cell]);
+      probe.min.y = y[cell]+0.5*dy[cell];
+      probe.max.x = probe.min.x;
+      probe.max.y = probe.min.y;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &probe);
+      if (nhits == 1) {
+         xsym[cell] = hits[0];
+      }
+
+      // y symmetry: only y negated
+      probe.min.x = x[cell]+0.5*dx[cell];
+      probe.min.y = -(y[cell]+0.5*dy[cell]);
+      probe.max.x = probe.min.x;
+      probe.max.y = probe.min.y;
+      KDTree_QueryBoxIntersect(&tree, &nhits, &(hits[0]), &probe);
+      if (nhits == 1) {
+         ysym[cell] = hits[0];
+      }
+   }
+}
+
+#ifdef HAVE_MPI
+// Repartition the cells evenly across processes (CPU path).
+//
+// nsizes[] is recomputed as an even split of ncells_global over numpe entries,
+// with the first ncells_global%numpe entries receiving one extra cell. If no
+// entry changed, only the timer is updated. Otherwise:
+//   - celltype, nlft, nrht, nbot and ntop are removed from mesh_memory,
+//   - ndispl[], ncells and noffset are rebuilt from the new sizes,
+//   - the global indices that fall in the new range but not in the old one are
+//     collected (a "lower" block below the old range and an "upper" block
+//     above it) and handed to L7_Setup,
+//   - every entry of state_memory and mesh_memory is grown to
+//     ncells_old+indices_needed_count, passed to L7_Update (presumably filling
+//     the tail slots with the requested remote values -- the copy loops below
+//     read them from there), and then replaced by a new ncells-long array laid
+//     out as lower block, retained old cells, upper block,
+//   - memory_reset_ptrs() and calc_celltype(ncells) are called.
+//
+// State entries are handled as double (element size 8) or float (element
+// size 4); other element sizes are skipped. Mesh entries are handled as
+// long long (element size 8) or, for any other size, as int.
+//
+// numcells:     number of cells held before balancing
+// weight:       only used for a DEBUG print; may be NULL
+// state_memory: registry of state arrays to redistribute; its entries are
+//               replaced in place
+void Mesh::do_load_balance_local(size_t numcells, float *weight, MallocPlus &state_memory)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // To get rid of compiler warning
+   if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
+
+   int ncells_old = numcells;
+   int noffset_old = ndispl[mype];
+
+// Need to add weight array to load balance if it is not NULL
+// Need to add tolerance to when load balance is done
+
+   int do_load_balance_global = 0;
+   int nsizes_old = 0;
+
+   for (int ip=0; ip<numpe; ip++){
+      nsizes_old = nsizes[ip];
+
+      // Calc new,even partition of data across processors
+      nsizes[ip] = ncells_global/numpe;
+      // Account for leftover cells
+      if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+
+      if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
+   }
+
+   if (do_load_balance_global) {
+      cpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
+
+      mesh_memory.memory_delete(celltype);
+      mesh_memory.memory_delete(nlft);
+      mesh_memory.memory_delete(nrht);
+      mesh_memory.memory_delete(nbot);
+      mesh_memory.memory_delete(ntop);
+
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      ncells = nsizes[mype];
+      noffset=ndispl[mype];
+
+      // Indices of blocks to be added to load balance
+      int lower_block_start = noffset;
+      int lower_block_end = min(noffset_old-1, (int)(noffset+ncells-1));
+      int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
+      int upper_block_end = noffset+ncells-1;
+
+      int lower_block_size = max(lower_block_end-lower_block_start+1,0);
+      if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
+      int upper_block_size = max(upper_block_end-upper_block_start+1,0);
+      int indices_needed_count = lower_block_size + upper_block_size;
+
+      int in = 0;
+
+      vector<int> indices_needed(indices_needed_count);
+      for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+      for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
+         indices_needed[in]=iz;
+      }
+
+      // indices_needed is empty whenever the new range lies entirely inside the
+      // old one. operator[] on an empty vector is undefined, so pass a NULL list
+      // together with the zero count in that case.
+      int *indices_needed_ptr = (indices_needed_count > 0) ? &indices_needed[0] : NULL;
+
+      int load_balance_handle = 0;
+      L7_Setup(0, noffset_old, ncells_old, indices_needed_ptr, indices_needed_count, &load_balance_handle);
+
+      state_memory.memory_realloc_all(ncells_old+indices_needed_count);
+
+      // Iterate over a copy of the registry because the loop body adds and
+      // replaces entries in state_memory itself
+      MallocPlus state_memory_old = state_memory;
+
+      malloc_plus_memory_entry *memory_item;
+
+      for (memory_item = state_memory_old.memory_entry_by_name_begin();
+           memory_item != state_memory_old.memory_entry_by_name_end();
+           memory_item = state_memory_old.memory_entry_by_name_next() ) {
+
+         if (memory_item->mem_elsize == 8) {
+            double *mem_ptr_double = (double *)memory_item->mem_ptr;
+
+            int flags = state_memory.get_memory_flags(mem_ptr_double);
+            double *state_temp_double = (double *) state_memory.memory_malloc(ncells, sizeof(double),
+                                                                               "state_temp_double", flags);
+
+            L7_Update(mem_ptr_double, L7_DOUBLE, load_balance_handle);
+
+            // Lower block: taken from the slots just past the old cells
+            in = 0;
+            if(lower_block_size > 0) {
+               for(; in < MIN(lower_block_size, (int)ncells); in++) {
+                  state_temp_double[in] = mem_ptr_double[ncells_old + in];
+               }
+            }
+
+            // Old cells that stay on this process
+            for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
+               state_temp_double[in] = mem_ptr_double[ic];
+            }
+
+            // Upper block: stored after the lower block in the tail slots
+            if(upper_block_size > 0) {
+               int ic = ncells_old + lower_block_size;
+               for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
+                  state_temp_double[in] = mem_ptr_double[ic+k];
+               }
+            }
+            state_memory.memory_replace(mem_ptr_double, state_temp_double);
+         } else if (memory_item->mem_elsize == 4) {
+            float *mem_ptr_float = (float *)memory_item->mem_ptr;
+
+            int flags = state_memory.get_memory_flags(mem_ptr_float);
+            float *state_temp_float = (float *) state_memory.memory_malloc(ncells, sizeof(float),
+                                                                            "state_temp_float", flags);
+
+            L7_Update(mem_ptr_float, L7_FLOAT, load_balance_handle);
+
+            // Lower block: taken from the slots just past the old cells
+            in = 0;
+            if(lower_block_size > 0) {
+               for(; in < MIN(lower_block_size, (int)ncells); in++) {
+                  state_temp_float[in] = mem_ptr_float[ncells_old + in];
+               }
+            }
+
+            // Old cells that stay on this process
+            for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
+               state_temp_float[in] = mem_ptr_float[ic];
+            }
+
+            // Upper block: stored after the lower block in the tail slots
+            if(upper_block_size > 0) {
+               int ic = ncells_old + lower_block_size;
+               for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
+                  state_temp_float[in] = mem_ptr_float[ic+k];
+               }
+            }
+            state_memory.memory_replace(mem_ptr_float, state_temp_float);
+         }
+      }
+
+      mesh_memory.memory_realloc_all(ncells_old+indices_needed_count);
+
+      // Same copy-then-iterate approach for the mesh arrays
+      MallocPlus mesh_memory_old = mesh_memory;
+
+      for (memory_item = mesh_memory_old.memory_entry_by_name_begin();
+           memory_item != mesh_memory_old.memory_entry_by_name_end();
+           memory_item = mesh_memory_old.memory_entry_by_name_next() ) {
+
+         if (memory_item->mem_elsize == 8) {
+            long long *mem_ptr_long = (long long *)memory_item->mem_ptr;
+
+            int flags = mesh_memory.get_memory_flags(mem_ptr_long);
+            long long *mesh_temp_long = (long long *)mesh_memory.memory_malloc(ncells, sizeof(long long), "mesh_temp_long", flags);
+
+            L7_Update(mem_ptr_long, L7_LONG_LONG_INT, load_balance_handle);
+
+            // Lower block: taken from the slots just past the old cells
+            in = 0;
+            if(lower_block_size > 0) {
+               for(; in < MIN(lower_block_size, (int)ncells); in++) {
+                  mesh_temp_long[in] = mem_ptr_long[ncells_old + in];
+               }
+            }
+
+            // Old cells that stay on this process
+            for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
+               mesh_temp_long[in] = mem_ptr_long[ic];
+            }
+
+            // Upper block: stored after the lower block in the tail slots
+            if(upper_block_size > 0) {
+               int ic = ncells_old + lower_block_size;
+               for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
+                  mesh_temp_long[in] = mem_ptr_long[ic+k];
+               }
+            }
+            mesh_memory.memory_replace(mem_ptr_long, mesh_temp_long);
+
+         } else {
+            // Every other element size is treated as int
+            int *mem_ptr_int = (int *)memory_item->mem_ptr;
+
+            int flags = mesh_memory.get_memory_flags(mem_ptr_int);
+            int *mesh_temp_int = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "mesh_temp_int", flags);
+
+            L7_Update(mem_ptr_int, L7_INT, load_balance_handle);
+
+            // Lower block: taken from the slots just past the old cells
+            in = 0;
+            if(lower_block_size > 0) {
+               for(; in < MIN(lower_block_size, (int)ncells); in++) {
+                  mesh_temp_int[in] = mem_ptr_int[ncells_old + in];
+               }
+            }
+
+            // Old cells that stay on this process
+            for(int ic = MAX((noffset - noffset_old), 0); (ic < ncells_old) && (in < (int)ncells); ic++, in++) {
+               mesh_temp_int[in] = mem_ptr_int[ic];
+            }
+
+            // Upper block: stored after the lower block in the tail slots
+            if(upper_block_size > 0) {
+               int ic = ncells_old + lower_block_size;
+               for(int k = max(noffset-upper_block_start,0); ((k+ic) < (ncells_old+indices_needed_count)) && (in < (int)ncells); k++, in++) {
+                  mesh_temp_int[in] = mem_ptr_int[ic+k];
+               }
+            }
+            mesh_memory.memory_replace(mem_ptr_int, mesh_temp_int);
+
+         }
+      }
+
+      L7_Free(&load_balance_handle);
+      load_balance_handle = 0;
+
+      memory_reset_ptrs();
+
+      calc_celltype(ncells);
+   }
+
+
+   cpu_timers[MESH_TIMER_LOAD_BALANCE] += cpu_timer_stop(tstart_cpu);
+}
+#endif
+
+#ifdef HAVE_OPENCL
+#ifdef HAVE_MPI
+int Mesh::gpu_do_load_balance_local(size_t numcells, float *weight, MallocPlus &gpu_state_memory)
+{
+ int do_load_balance_global = 0;
+
+ if (! gpu_do_rezone) return(do_load_balance_global);
+
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ // To get rid of compiler warning
+ if (DEBUG && weight != NULL) printf("DEBUG weight[0] = %f\n",weight[0]);
+
+ int ncells_old = numcells;
+ int noffset_old = ndispl[mype];
+
+// Need to add weight array to load balance if it is not NULL
+// Need to add tolerance to when load balance is done
+
+ int nsizes_old = 0;
+ for (int ip=0; ip<numpe; ip++){
+ nsizes_old = nsizes[ip];
+ nsizes[ip] = ncells_global/numpe;
+ if (ip < (int)(ncells_global%numpe)) nsizes[ip]++;
+ if (nsizes_old != nsizes[ip]) do_load_balance_global = 1;
+ }
+
+ if(do_load_balance_global) {
+
+ cl_command_queue command_queue = ezcl_get_command_queue();
+
+ gpu_counters[MESH_COUNTER_LOAD_BALANCE]++;
+
+ ndispl[0]=0;
+ for (int ip=1; ip<numpe; ip++){
+ ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+ }
+ ncells = nsizes[mype];
+ noffset=ndispl[mype];
+
+ // Indices of blocks to be added to load balance
+ int lower_block_start = noffset;
+ int lower_block_end = min(noffset_old-1, (int)(noffset+ncells-1));
+ int upper_block_start = max((int)(noffset_old+ncells_old), noffset);
+ int upper_block_end = noffset+ncells-1;
+ //printf("%d: lbs %d lbe %d ubs %d ube %d\n",mype,lower_block_start-noffset_old,lower_block_end-noffset_old,upper_block_start-noffset_old,upper_block_end-noffset_old);
+
+ size_t lower_block_size = max(lower_block_end-lower_block_start+1,0);
+ if(lower_block_end < 0) lower_block_size = 0; // Handles segfault at start of array
+ size_t upper_block_size = max(upper_block_end-upper_block_start+1,0);
+ int indices_needed_count = lower_block_size + upper_block_size;
+
+ size_t middle_block_size = ncells - lower_block_size - upper_block_size;
+ int middle_block_start = max(noffset - noffset_old, 0);
+
+ int lower_segment_size = noffset-noffset_old;
+ int do_whole_segment = 0;
+ if (lower_segment_size > ncells_old) do_whole_segment = 1;
+
+ int upper_segment_size = ( (noffset_old+ncells_old) - (noffset+ncells) );
+ int upper_segment_start = (noffset_old+ncells_old) - upper_segment_size - noffset_old;
+ if (upper_segment_size > ncells_old) do_whole_segment=1;
+
+ int in = 0;
+ vector<int> indices_needed(indices_needed_count);
+ for (int iz = lower_block_start; iz <= lower_block_end; iz++, in++){
+ indices_needed[in]=iz;
+ }
+ for (int iz = upper_block_start; iz <= upper_block_end; iz++, in++){
+ indices_needed[in]=iz;
+ }
+
+ int load_balance_handle = 0;
+ L7_Setup(0, noffset_old, ncells_old, &indices_needed[0], indices_needed_count, &load_balance_handle);
+
+ size_t local_work_size = 128;
+ size_t global_work_size = ((ncells + local_work_size - 1) / local_work_size) * local_work_size;
+
+ // printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
+
+ // Allocate lower block on GPU
+ size_t low_block_size = MAX(1, lower_block_size);
+ cl_mem dev_state_var_lower = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_lower"), &low_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+ // Allocate upper block on GPU
+ size_t up_block_size = MAX(1, upper_block_size);
+ cl_mem dev_state_var_upper = ezcl_malloc(NULL, const_cast<char *>("dev_state_var_upper"), &up_block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+ MallocPlus gpu_state_memory_old = gpu_state_memory;
+ malloc_plus_memory_entry *memory_item;
+
+ for (memory_item = gpu_state_memory_old.memory_entry_by_name_begin();
+ memory_item != gpu_state_memory_old.memory_entry_by_name_end();
+ memory_item = gpu_state_memory_old.memory_entry_by_name_next() ) {
+ //printf("DEBUG -- it.mem_name %s elsize %lu\n",memory_item->mem_name,memory_item->mem_elsize);
+ cl_mem dev_state_mem_ptr = (cl_mem)memory_item->mem_ptr;
+
+ if (memory_item->mem_elsize == 8){
+#ifndef MINIMUM_PRECISION
+ vector<double> state_var_tmp(ncells_old+indices_needed_count,0.0);
+
+ // Read current state values from GPU and write to CPU arrays
+ if (do_whole_segment) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_double), &state_var_tmp[0], NULL);
+ } else {
+ // Read lower block from GPU
+ if (lower_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_double), &state_var_tmp[0], NULL);
+ }
+ // Read upper block from GPU
+ if (upper_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_double), upper_segment_size*sizeof(cl_double), &state_var_tmp[upper_segment_start], NULL);
+ }
+ }
+
+ // Update arrays with L7
+ L7_Update(&state_var_tmp[0], L7_DOUBLE, load_balance_handle);
+
+ // Set lower block on GPU
+ if(lower_block_size > 0) {
+ ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_double), &state_var_tmp[ncells_old], NULL);
+ }
+ // Set upper block on GPU
+ if(upper_block_size > 0) {
+ ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_double), &state_var_tmp[ncells_old+lower_block_size], NULL);
+ }
+
+ // Allocate space on GPU for temp arrays (used in double buffering)
+ cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_double), CL_MEM_READ_WRITE, 0);
+ gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_double), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
+
+ //printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
+
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 0, sizeof(cl_int), &ncells);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 1, sizeof(cl_int), &lower_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 2, sizeof(cl_int), &middle_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 3, sizeof(cl_int), &middle_block_start);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 4, sizeof(cl_mem), &dev_state_mem_ptr);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 5, sizeof(cl_mem), &dev_state_var_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 6, sizeof(cl_mem), &dev_state_var_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_double, 7, sizeof(cl_mem), &dev_state_var_new);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_double, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+#else
+ printf("ERROR -- can't have double type for state variable\n");
+ exit(1);
+#endif
+ } else if (memory_item->mem_elsize == 4){
+ vector<float> state_var_tmp(ncells_old+indices_needed_count,0.0);
+
+ // Read current state values from GPU and write to CPU arrays
+ if (do_whole_segment) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, ncells_old*sizeof(cl_float), &state_var_tmp[0], NULL);
+ } else {
+ // Read lower block from GPU
+ if (lower_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, 0, lower_segment_size*sizeof(cl_float), &state_var_tmp[0], NULL);
+ }
+ // Read upper block from GPU
+ if (upper_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_state_mem_ptr, CL_TRUE, upper_segment_start*sizeof(cl_float), upper_segment_size*sizeof(cl_float), &state_var_tmp[upper_segment_start], NULL);
+ }
+ }
+
+ // Update arrays with L7
+ L7_Update(&state_var_tmp[0], L7_FLOAT, load_balance_handle);
+
+ // Set lower block on GPU
+ if(lower_block_size > 0) {
+ ezcl_enqueue_write_buffer(command_queue, dev_state_var_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_float), &state_var_tmp[ncells_old], NULL);
+ }
+ // Set upper block on GPU
+ if(upper_block_size > 0) {
+ ezcl_enqueue_write_buffer(command_queue, dev_state_var_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_float), &state_var_tmp[ncells_old+lower_block_size], NULL);
+ }
+
+ // Allocate space on GPU for temp arrays (used in double buffering)
+ cl_mem dev_state_var_new = ezcl_malloc(NULL, gpu_state_memory.get_memory_name(dev_state_mem_ptr), &ncells, sizeof(cl_float), CL_MEM_READ_WRITE, 0);
+ gpu_state_memory.memory_add(dev_state_var_new, ncells, sizeof(cl_float), "dev_state_var_new", DEVICE_REGULAR_MEMORY);
+
+ //printf("DEBUG memory for proc %d is %p dev_state_new is %p\n",mype,dev_state_mem_ptr,dev_state_var_new);
+
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 0, sizeof(cl_int), &ncells);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 1, sizeof(cl_int), &lower_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 2, sizeof(cl_int), &middle_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 3, sizeof(cl_int), &middle_block_start);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 4, sizeof(cl_mem), &dev_state_mem_ptr);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 5, sizeof(cl_mem), &dev_state_var_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 6, sizeof(cl_mem), &dev_state_var_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_float, 7, sizeof(cl_mem), &dev_state_var_new);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_float, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ gpu_state_memory.memory_replace(dev_state_mem_ptr, dev_state_var_new);
+ }
+ }
+
+ ezcl_device_memory_delete(dev_state_var_lower);
+ ezcl_device_memory_delete(dev_state_var_upper);
+
+ vector<int> i_tmp(ncells_old+indices_needed_count,0);
+ vector<int> j_tmp(ncells_old+indices_needed_count,0);
+ vector<int> level_tmp(ncells_old+indices_needed_count,0);
+ vector<int> celltype_tmp(ncells_old+indices_needed_count,0);
+
+ if (do_whole_segment) {
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, ncells_old*sizeof(cl_int), &i_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, ncells_old*sizeof(cl_int), &j_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, ncells_old*sizeof(cl_int), &level_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, 0, ncells_old*sizeof(cl_int), &celltype_tmp[0], NULL);
+ } else {
+ if (lower_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &i_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &j_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, 0, lower_segment_size*sizeof(cl_int), &level_tmp[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, 0, lower_segment_size*sizeof(cl_int), &celltype_tmp[0], NULL);
+ }
+ if (upper_segment_size > 0) {
+ ezcl_enqueue_read_buffer(command_queue, dev_i, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &i_tmp[upper_segment_start], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_j, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &j_tmp[upper_segment_start], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_level, CL_FALSE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &level_tmp[upper_segment_start], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_celltype, CL_TRUE, upper_segment_start*sizeof(cl_int), upper_segment_size*sizeof(cl_int), &celltype_tmp[upper_segment_start], NULL);
+ }
+ }
+
+ L7_Update(&i_tmp[0], L7_INT, load_balance_handle);
+ L7_Update(&j_tmp[0], L7_INT, load_balance_handle);
+ L7_Update(&level_tmp[0], L7_INT, load_balance_handle);
+ L7_Update(&celltype_tmp[0], L7_INT, load_balance_handle);
+
+ L7_Free(&load_balance_handle);
+ load_balance_handle = 0;
+
+ // Allocate and set lower block on GPU
+ cl_mem dev_i_lower, dev_j_lower, dev_level_lower, dev_celltype_lower;
+
+ if(lower_block_size > 0) {
+ dev_i_lower = ezcl_malloc(NULL, const_cast<char *>("dev_i_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_j_lower = ezcl_malloc(NULL, const_cast<char *>("dev_j_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_level_lower = ezcl_malloc(NULL, const_cast<char *>("dev_level_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_celltype_lower = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_lower"), &lower_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_enqueue_write_buffer(command_queue, dev_i_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &i_tmp[ncells_old], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_j_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &j_tmp[ncells_old], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_level_lower, CL_FALSE, 0, lower_block_size*sizeof(cl_int), &level_tmp[ncells_old], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_celltype_lower, CL_TRUE, 0, lower_block_size*sizeof(cl_int), &celltype_tmp[ncells_old], NULL);
+ }
+
+ // Allocate and set upper block on GPU
+ cl_mem dev_i_upper, dev_j_upper, dev_level_upper, dev_celltype_upper;
+ if(upper_block_size > 0) {
+ dev_i_upper = ezcl_malloc(NULL, const_cast<char *>("dev_i_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_j_upper = ezcl_malloc(NULL, const_cast<char *>("dev_j_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_level_upper = ezcl_malloc(NULL, const_cast<char *>("dev_level_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ dev_celltype_upper = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_upper"), &upper_block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ ezcl_enqueue_write_buffer(command_queue, dev_i_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &i_tmp[ncells_old+lower_block_size], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_j_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &j_tmp[ncells_old+lower_block_size], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_level_upper, CL_FALSE, 0, upper_block_size*sizeof(cl_int), &level_tmp[ncells_old+lower_block_size], NULL);
+ ezcl_enqueue_write_buffer(command_queue, dev_celltype_upper, CL_TRUE, 0, upper_block_size*sizeof(cl_int), &celltype_tmp[ncells_old+lower_block_size], NULL);
+ }
+
+ local_work_size = 128;
+
+ // printf("MYPE%d: \t ncells = %d \t ncells_old = %d \t ncells_global = %d \n", mype, ncells, ncells_old, ncells_global);
+ // Allocate space on GPU for temp arrays (used in double buffering)
+
+ size_t mem_request = (int)((float)ncells*mem_factor);
+ cl_mem dev_i_new = ezcl_malloc(NULL, const_cast<char *>("dev_i_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_j_new = ezcl_malloc(NULL, const_cast<char *>("dev_j_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_level_new = ezcl_malloc(NULL, const_cast<char *>("dev_level_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+ cl_mem dev_celltype_new = ezcl_malloc(NULL, const_cast<char *>("dev_celltype_new"), &mem_request, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+ // Set kernel arguments and call lower block kernel
+ if(lower_block_size > 0) {
+
+ size_t global_work_size = ((lower_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 0, sizeof(cl_mem), &dev_i_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 1, sizeof(cl_mem), &dev_j_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 2, sizeof(cl_mem), &dev_level_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 3, sizeof(cl_mem), &dev_celltype_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 4, sizeof(cl_mem), &dev_i_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 5, sizeof(cl_mem), &dev_j_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 6, sizeof(cl_mem), &dev_level_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 7, sizeof(cl_mem), &dev_celltype_lower);
+ ezcl_set_kernel_arg(kernel_do_load_balance_lower, 8, sizeof(cl_int), &lower_block_size);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_lower, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_device_memory_delete(dev_i_lower);
+ ezcl_device_memory_delete(dev_j_lower);
+ ezcl_device_memory_delete(dev_level_lower);
+ ezcl_device_memory_delete(dev_celltype_lower);
+ }
+
+ // Set kernel arguments and call middle block kernel
+ if(middle_block_size > 0) {
+
+ size_t global_work_size = ((middle_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 0, sizeof(cl_mem), &dev_i_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 1, sizeof(cl_mem), &dev_j_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 2, sizeof(cl_mem), &dev_level_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 3, sizeof(cl_mem), &dev_celltype_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 4, sizeof(cl_mem), &dev_i);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 5, sizeof(cl_mem), &dev_j);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 6, sizeof(cl_mem), &dev_level);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 7, sizeof(cl_mem), &dev_celltype);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 8, sizeof(cl_int), &lower_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 9, sizeof(cl_int), &middle_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_middle, 10, sizeof(cl_int), &middle_block_start);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_middle, 1, NULL, &global_work_size, &local_work_size, NULL);
+ }
+
+ // Set kernel arguments and call upper block kernel
+ if(upper_block_size > 0) {
+
+ size_t global_work_size = ((upper_block_size + local_work_size - 1) / local_work_size) * local_work_size;
+
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 0, sizeof(cl_mem), &dev_i_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 1, sizeof(cl_mem), &dev_j_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 2, sizeof(cl_mem), &dev_level_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 3, sizeof(cl_mem), &dev_celltype_new);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 4, sizeof(cl_mem), &dev_i_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 5, sizeof(cl_mem), &dev_j_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 6, sizeof(cl_mem), &dev_level_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 7, sizeof(cl_mem), &dev_celltype_upper);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 8, sizeof(cl_int), &lower_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 9, sizeof(cl_int), &middle_block_size);
+ ezcl_set_kernel_arg(kernel_do_load_balance_upper, 10, sizeof(cl_int), &upper_block_size);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_do_load_balance_upper, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ ezcl_device_memory_delete(dev_i_upper);
+ ezcl_device_memory_delete(dev_j_upper);
+ ezcl_device_memory_delete(dev_level_upper);
+ ezcl_device_memory_delete(dev_celltype_upper);
+ }
+
+ ezcl_device_memory_swap(&dev_i_new, &dev_i);
+ ezcl_device_memory_swap(&dev_j_new, &dev_j);
+ ezcl_device_memory_swap(&dev_level_new, &dev_level);
+ ezcl_device_memory_swap(&dev_celltype_new, &dev_celltype);
+
+ ezcl_device_memory_delete(dev_i_new);
+ ezcl_device_memory_delete(dev_j_new);
+ ezcl_device_memory_delete(dev_level_new);
+ ezcl_device_memory_delete(dev_celltype_new);
+
+ gpu_timers[MESH_TIMER_LOAD_BALANCE] += (long int)(cpu_timer_stop(tstart_cpu)*1.0e9);
+ }
+
+ return(do_load_balance_global);
+}
+#endif
+#endif
+
+#ifdef HAVE_OPENCL
+/*
+ * Count boundary cells on the device.
+ *
+ * Launches kernel_count_BCs over the cells, which writes one partial count
+ * per work-group into a scratch buffer, then (when there is more than one
+ * work-group) launches kernel_reduce_sum_int_stage2of2 to fold the partial
+ * counts. The first element of the scratch buffer is read back with a
+ * blocking read and returned.
+ *
+ * Returns 0 without touching the device when have_boundary is set or when
+ * there are no cells. The original computed the work sizes first, which
+ * divided by a zero local_work_size when ncells was 0.
+ *
+ * Kernel time is accumulated into gpu_timers[MESH_TIMER_COUNT_BCS].
+ */
+int Mesh::gpu_count_BCs(void)
+{
+   int bcount = 0;
+
+   // Guard first: with ncells == 0 the work-group size below would be 0 and
+   // the round-up division would be undefined.
+   if (have_boundary || ncells == 0) {
+      return(bcount);
+   }
+
+   cl_event count_BCs_stage1_event, count_BCs_stage2_event;
+
+   size_t local_work_size  = MIN(ncells, TILE_SIZE);
+   // Round the global size up to a whole number of work-groups.
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   // Number of work-groups == number of partial sums produced by stage 1.
+   size_t block_size = global_work_size/local_work_size;
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+   cl_mem dev_ioffset = ezcl_malloc(NULL, const_cast<char *>("dev_ioffset"), &block_size, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+   /*
+   __kernel void count_BCs(
+                    const int    isize,      // 0
+           __global const int   *i,          // 1
+           __global const int   *j,          // 2
+           __global const int   *level,      // 3
+           __global const int   *lev_ibeg,   // 4
+           __global const int   *lev_iend,   // 5
+           __global const int   *lev_jbeg,   // 6
+           __global const int   *lev_jend,   // 7
+           __global       int   *scratch,    // 8
+           __local        int   *tile)       // 9
+   */
+   size_t shared_spd_sum_int = local_work_size * sizeof(cl_int);
+   ezcl_set_kernel_arg(kernel_count_BCs, 0, sizeof(cl_int), (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_count_BCs, 1, sizeof(cl_mem), (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_count_BCs, 2, sizeof(cl_mem), (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_count_BCs, 3, sizeof(cl_mem), (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_count_BCs, 4, sizeof(cl_mem), (void *)&dev_levibeg);
+   ezcl_set_kernel_arg(kernel_count_BCs, 5, sizeof(cl_mem), (void *)&dev_leviend);
+   ezcl_set_kernel_arg(kernel_count_BCs, 6, sizeof(cl_mem), (void *)&dev_levjbeg);
+   ezcl_set_kernel_arg(kernel_count_BCs, 7, sizeof(cl_mem), (void *)&dev_levjend);
+   ezcl_set_kernel_arg(kernel_count_BCs, 8, sizeof(cl_mem), (void *)&dev_ioffset);
+   // Local (work-group) scratch: size only, no host pointer.
+   ezcl_set_kernel_arg(kernel_count_BCs, 9, shared_spd_sum_int, NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_count_BCs, 1, NULL, &global_work_size, &local_work_size, &count_BCs_stage1_event);
+
+   // A second pass is only needed when stage 1 left more than one partial sum.
+   const bool need_stage2 = (block_size > 1);
+   if (need_stage2) {
+      ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 0, sizeof(cl_int), (void *)&block_size);
+      ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 1, sizeof(cl_mem), (void *)&dev_ioffset);
+      ezcl_set_kernel_arg(kernel_reduce_sum_int_stage2of2, 2, shared_spd_sum_int, NULL);
+
+      // Single work-group launch: global size == local size.
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_sum_int_stage2of2, 1, NULL, &local_work_size, &local_work_size, &count_BCs_stage2_event);
+   }
+
+   // Blocking read of the reduced count (element 0 of the scratch buffer).
+   ezcl_enqueue_read_buffer(command_queue, dev_ioffset, CL_TRUE, 0, 1*sizeof(cl_int), &bcount, NULL);
+
+   ezcl_device_memory_delete(dev_ioffset);
+
+   gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage1_event, &count_BCs_stage1_event);
+   if (need_stage2) {
+      gpu_timers[MESH_TIMER_COUNT_BCS] += ezcl_timer_calc(&count_BCs_stage2_event, &count_BCs_stage2_event);
+   }
+
+   return(bcount);
+}
+#endif
+
+/*
+ * Allocate the i, j and level index arrays through mesh_memory.
+ *
+ * ncells -- number of int elements requested for each array (this
+ *           parameter shadows the member of the same name).
+ *
+ * The arrays are registered under the names "i", "j" and "level" with the
+ * RESTART_DATA flag, or with LOAD_BALANCE_MEMORY instead when built with
+ * HAVE_J7 and running in parallel.
+ */
+void Mesh::allocate(size_t ncells)
+{
+   int flags = RESTART_DATA;
+#ifdef HAVE_J7
+   if (parallel) {
+      flags = LOAD_BALANCE_MEMORY;
+   }
+#endif
+
+   i     = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "i",     flags);
+   j     = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "j",     flags);
+   level = (int *)mesh_memory.memory_malloc(ncells, sizeof(int), "level", flags);
+}
+
+
+/*
+ * Grow every array managed by mesh_memory to new_ncells elements when the
+ * current size of the "i" array is smaller; never shrinks.
+ *
+ * NOTE(review): the cached member pointers (i, j, level, ...) are not
+ * refreshed here; presumably callers follow up with memory_reset_ptrs()
+ * -- confirm against call sites.
+ */
+void Mesh::resize(size_t new_ncells)
+{
+   const size_t allocated = mesh_memory.get_memory_size(i);
+   if (new_ncells <= allocated) {
+      return;   // already large enough
+   }
+   mesh_memory.memory_realloc_all(new_ncells);
+}
+
+/*
+ * Re-fetch the member array pointers from mesh_memory by registered name.
+ * Covers the cell index arrays (i, j, level, celltype) and the four
+ * neighbor arrays (nlft, nrht, nbot, ntop).
+ */
+void Mesh::memory_reset_ptrs(void)
+{
+   // Cell index data
+   i        = (int *)mesh_memory.get_memory_ptr("i");
+   j        = (int *)mesh_memory.get_memory_ptr("j");
+   level    = (int *)mesh_memory.get_memory_ptr("level");
+   celltype = (int *)mesh_memory.get_memory_ptr("celltype");
+
+   // Neighbor connectivity
+   nlft     = (int *)mesh_memory.get_memory_ptr("nlft");
+   nrht     = (int *)mesh_memory.get_memory_ptr("nrht");
+   nbot     = (int *)mesh_memory.get_memory_ptr("nbot");
+   ntop     = (int *)mesh_memory.get_memory_ptr("ntop");
+}
+
+/*
+ * Release the device buffers dev_level, dev_i, dev_j and dev_celltype and
+ * allocate fresh ones sized for ncells * mem_factor elements of cl_int.
+ * The previous buffer contents are discarded, not copied.
+ *
+ * ncells -- cell count the new buffers must hold (shadows the member).
+ *
+ * Without HAVE_OPENCL this is a no-op.
+ */
+void Mesh::resize_old_device_memory(size_t ncells)
+{
+#ifdef HAVE_OPENCL
+   ezcl_device_memory_delete(dev_level);
+   ezcl_device_memory_delete(dev_i);
+   ezcl_device_memory_delete(dev_j);
+   ezcl_device_memory_delete(dev_celltype);
+
+   // Compute the padded size in double and keep it as size_t. The original
+   // went through float and int, which cannot represent every cell count
+   // above 2^24 exactly and could therefore request fewer elements than
+   // ncells, and which overflowed int for very large meshes.
+   size_t mem_request = (size_t)((double)ncells*mem_factor);
+
+   dev_level    = ezcl_malloc(NULL, const_cast<char *>("dev_level"),    &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_i        = ezcl_malloc(NULL, const_cast<char *>("dev_i"),        &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_j        = ezcl_malloc(NULL, const_cast<char *>("dev_j"),        &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+   dev_celltype = ezcl_malloc(NULL, const_cast<char *>("dev_celltype"), &mem_request, sizeof(cl_int), CL_MEM_READ_ONLY, 0);
+#else
+   // Parameter is unused in this configuration; silence the warning.
+   (void)ncells;
+#endif
+}
+/*
+ * Print a diagnostic summary of the mesh object to stdout: dimensionality,
+ * parallel decomposition info, cell counts, and for each index/neighbor
+ * array its address, element count and element size. With HAVE_OPENCL the
+ * matching device buffers are reported first.
+ *
+ * Arguments are cast to the exact types the format specifiers expect
+ * (void * for %p, long for %ld); passing size_t or int * directly is a
+ * format/argument mismatch. The emitted text is unchanged.
+ */
+void Mesh::print_object_info(void)
+{
+   printf(" ---- Mesh object info -----\n");
+   printf("Dimensionality : %d\n",ndim);
+   printf("Parallel info : mype %d numpe %d noffset %d parallel %d\n",mype,numpe,noffset,parallel);
+   printf("Sizes : ncells %ld ncells_ghost %ld\n\n",(long)ncells,(long)ncells_ghost);
+#ifdef HAVE_OPENCL
+   int num_elements, elsize;
+
+   // Device copies of the cell index arrays
+   num_elements = ezcl_get_device_mem_nelements(dev_celltype);
+   elsize = ezcl_get_device_mem_elsize(dev_celltype);
+   printf("dev_celltype ptr : %p nelements %d elsize %d\n",(void *)dev_celltype,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_level);
+   elsize = ezcl_get_device_mem_elsize(dev_level);
+   printf("dev_level ptr : %p nelements %d elsize %d\n",(void *)dev_level,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_i);
+   elsize = ezcl_get_device_mem_elsize(dev_i);
+   printf("dev_i ptr : %p nelements %d elsize %d\n",(void *)dev_i,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_j);
+   elsize = ezcl_get_device_mem_elsize(dev_j);
+   printf("dev_j ptr : %p nelements %d elsize %d\n",(void *)dev_j,num_elements,elsize);
+
+   // Device copies of the neighbor arrays
+   num_elements = ezcl_get_device_mem_nelements(dev_nlft);
+   elsize = ezcl_get_device_mem_elsize(dev_nlft);
+   printf("dev_nlft ptr : %p nelements %d elsize %d\n",(void *)dev_nlft,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_nrht);
+   elsize = ezcl_get_device_mem_elsize(dev_nrht);
+   printf("dev_nrht ptr : %p nelements %d elsize %d\n",(void *)dev_nrht,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_nbot);
+   elsize = ezcl_get_device_mem_elsize(dev_nbot);
+   printf("dev_nbot ptr : %p nelements %d elsize %d\n",(void *)dev_nbot,num_elements,elsize);
+   num_elements = ezcl_get_device_mem_nelements(dev_ntop);
+   elsize = ezcl_get_device_mem_elsize(dev_ntop);
+   printf("dev_ntop ptr : %p nelements %d elsize %d\n",(void *)dev_ntop,num_elements,elsize);
+#endif
+   // Host cell index arrays
+   printf("vector celltype ptr : %p nelements %ld elsize %ld\n",(void *)&celltype[0],(long)mesh_memory.get_memory_size(celltype),(long)sizeof(celltype[0]));
+   printf("vector level ptr : %p nelements %ld elsize %ld\n",(void *)&level[0], (long)mesh_memory.get_memory_size(level), (long)sizeof(level[0]));
+   printf("vector i ptr : %p nelements %ld elsize %ld\n",(void *)&i[0], (long)mesh_memory.get_memory_size(i), (long)sizeof(i[0]));
+   printf("vector j ptr : %p nelements %ld elsize %ld\n",(void *)&j[0], (long)mesh_memory.get_memory_size(j), (long)sizeof(j[0]));
+
+   // Host neighbor arrays
+   printf("vector nlft ptr : %p nelements %ld elsize %ld\n",(void *)&nlft[0], (long)mesh_memory.get_memory_size(nlft), (long)sizeof(nlft[0]));
+   printf("vector nrht ptr : %p nelements %ld elsize %ld\n",(void *)&nrht[0], (long)mesh_memory.get_memory_size(nrht), (long)sizeof(nrht[0]));
+   printf("vector nbot ptr : %p nelements %ld elsize %ld\n",(void *)&nbot[0], (long)mesh_memory.get_memory_size(nbot), (long)sizeof(nbot[0]));
+   printf("vector ntop ptr : %p nelements %ld elsize %ld\n",(void *)&ntop[0], (long)mesh_memory.get_memory_size(ntop), (long)sizeof(ntop[0]));
+}
+
+
+#ifdef __NEW_STENCIL__
+/*
+ * Classify where a neighboring cell on the curve lies relative to the
+ * current cell.
+ *
+ * di, dj -- (current - neighbor) index offsets measured at the finest level
+ * crf    -- width of the current cell measured at the finest level
+ * neighbor_is_coarser -- true when the neighbor's level is lower than the
+ *                        current cell's level
+ *
+ * Returns 'L', 'B', 'R' or 'T' for left/bottom/right/top, 'M', 'W', 'F' or
+ * 'E' for SW/NW/SE/NE diagonals, and 'X' when none of the tests match
+ * (both offsets zero).
+ */
+static char stencil_direction(int di, int dj, int crf, bool neighbor_is_coarser)
+{
+   // Left
+   if( (dj == 0 && (di == crf*HALF || di == crf || di == crf*TWO)) || (dj == -crf*HALF && di == crf*HALF) || (dj == crf && di == crf*TWO) ) {
+      return 'L';
+   }
+   // Bottom
+   if( (di == 0 && (dj == crf*HALF || dj == crf || dj == crf*TWO)) || (di == -crf*HALF && dj == crf*HALF) || (di == crf && dj == crf*TWO) ) {
+      return 'B';
+   }
+   // Right
+   if( (di == -crf && (dj == -crf*HALF || dj == 0 || (dj == crf && neighbor_is_coarser))) ) {
+      return 'R';
+   }
+   // Top
+   if( (dj == -crf && (di == -crf*HALF || di == 0 || (di == crf && neighbor_is_coarser))) ) {
+      return 'T';
+   }
+   // Further away along one axis
+   if( di > 0 && dj == 0 ) return 'L';
+   if( di < 0 && dj == 0 ) return 'R';
+   if( dj > 0 && di == 0 ) return 'B';
+   if( dj < 0 && di == 0 ) return 'T';
+   // Diagonals
+   if( di > 0 && dj > 0 ) return 'M';   // SW
+   if( di > 0 && dj < 0 ) return 'W';   // NW
+   if( di < 0 && dj > 0 ) return 'F';   // SE
+   if( di < 0 && dj < 0 ) return 'E';   // NE
+
+   return 'X';
+}
+#endif
+
+/*
+ * Choose the order in which the four children of cell ic are emitted when
+ * the cell is refined.
+ *
+ * order[4]     -- output; filled with a permutation of SW, SE, NW, NE
+ * ic           -- index of the cell being refined
+ * ifirst, jfirst, level_first -- i, j and level used in place of the
+ *                 previous cell when ic is the first cell (ic == 0)
+ * ilast, jlast, level_last    -- same, used in place of the next cell when
+ *                 ic is the last cell (ic == ncells-1)
+ * i_old, j_old, level_old     -- pre-refinement index arrays
+ *
+ * When localStencil is false the result is always Z-order
+ * (SW, SE, NW, NE). Otherwise the order is derived from the positions of
+ * the previous and next cells on the space-filling curve, using whichever
+ * of __OLD_STENCIL__ / __NEW_STENCIL__ is compiled in.
+ *
+ * Fix relative to the original new-stencil code: the "Right"/"Top" tests
+ * compared level_old[ic-1] and level_old[ic+1] even for the first and last
+ * cell, reading outside the array. level_first / level_last are now used
+ * there, matching how ir[]/jr[] are already built for those cases.
+ */
+void Mesh::set_refinement_order(int order[4], int ic, int ifirst, int ilast, int jfirst, int jlast,
+                                int level_first, int level_last, int *i_old, int *j_old, int *level_old)
+{
+   if (localStencil) {
+      // Store the coordinates of the cells before and after this one on
+      // the space-filling curve index.
+
+#ifdef __OLD_STENCIL__
+      spatial_t nx[3], // x-coordinates of cells.
+                ny[3]; // y-coordinates of cells.
+      if (ic != 0) {
+         nx[0] = lev_deltax[level_old[ic-1]] * (spatial_t)i[ic-1];
+         ny[0] = lev_deltay[level_old[ic-1]] * (spatial_t)j[ic-1];
+      } else {
+         nx[0] = lev_deltax[level_first] * (spatial_t)ifirst;
+         ny[0] = lev_deltay[level_first] * (spatial_t)jfirst;
+      }
+      nx[1] = lev_deltax[level_old[ic ]] * (spatial_t)i[ic ];
+      ny[1] = lev_deltay[level_old[ic ]] * (spatial_t)j[ic ];
+      if (ic != ncells-1) {
+         nx[2] = lev_deltax[level_old[ic+1]] * (spatial_t)i[ic+1];
+         ny[2] = lev_deltay[level_old[ic+1]] * (spatial_t)j[ic+1];
+      } else {
+         nx[2] = lev_deltax[level_last] * (spatial_t)ilast;
+         ny[2] = lev_deltay[level_last] * (spatial_t)jlast;
+      }
+
+      // Figure out relative orientation of the neighboring cells. We are
+      // aided in this because the Hilbert curve only has six possible
+      // ways across the cell: four Ls and two straight lines. Then
+      // refine the cell according to the relative orientation and order
+      // according to the four-point Hilbert stencil.
+      if (nx[0] < nx[1] and ny[2] < ny[1]) // southwest L, forward order
+      { order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+      else if (nx[2] < nx[1] and ny[0] < ny[1]) // southwest L, reverse order
+      { order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
+      else if (nx[0] > nx[1] and ny[2] < ny[1]) // southeast L, forward order
+      { order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW; }
+      else if (nx[2] > nx[1] and ny[0] < ny[1]) // southeast L, reverse order
+      { order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+      else if (nx[0] > nx[1] and ny[2] > ny[1]) // northeast L, forward order
+      { order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE; }
+      else if (nx[2] > nx[1] and ny[0] > ny[1]) // northeast L, reverse order
+      { order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
+      else if (nx[0] < nx[1] and ny[2] > ny[1]) // northwest L, forward order
+      { order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
+      else if (nx[2] < nx[1] and ny[0] > ny[1]) // northwest L, reverse order
+      { order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW; }
+      else if (nx[0] > nx[1] and nx[1] > nx[2]) // straight horizontal, forward order
+      { order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW; }
+      else if (nx[0] < nx[1] and nx[1] < nx[2]) // straight horizontal, reverse order
+      { order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE; }
+      else if (ny[0] > ny[1] and ny[1] > ny[2]) // straight vertical, forward order
+      { order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE; }
+      else if (ny[0] < ny[1] and ny[1] < ny[2]) // straight vertical, reverse order
+      { order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW; }
+      else // other, default to z-order
+      { order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
+#endif
+
+#ifdef __NEW_STENCIL__
+      int ir[3], // First i index at finest level of the mesh
+          jr[3]; // First j index at finest level of the mesh
+
+      // Cell's Radius at the Finest level of the mesh
+      int crf = IPOW2(levmx-level_old[ic]);
+
+      // Levels of the previous/next cell on the curve. The first and last
+      // cells have no in-array neighbor, so fall back to the supplied
+      // level_first / level_last instead of indexing out of bounds.
+      const bool is_first = (ic == 0);
+      const bool is_last  = (ic == (int)ncells-1);
+      const int level_in  = is_first ? level_first : level_old[ic - 1];
+      const int level_out = is_last  ? level_last  : level_old[ic + 1];
+
+      if (! is_first) {
+         ir[0] = i_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
+         jr[0] = j_old[ic - 1] * IPOW2(levmx-level_old[ic - 1]);
+      } else {
+         ir[0] = ifirst * IPOW2(levmx-level_first);
+         jr[0] = jfirst * IPOW2(levmx-level_first);
+      }
+      ir[1] = i_old[ic ] * IPOW2(levmx-level_old[ic ]);
+      jr[1] = j_old[ic ] * IPOW2(levmx-level_old[ic ]);
+      if (! is_last) {
+         ir[2] = i_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
+         jr[2] = j_old[ic + 1] * IPOW2(levmx-level_old[ic + 1]);
+      } else {
+         ir[2] = ilast * IPOW2(levmx-level_last);
+         jr[2] = jlast * IPOW2(levmx-level_last);
+      }
+
+      // Offsets from the previous cell (in) and the next cell (out) to
+      // this one, measured at the finest level.
+      int dir_in  = ir[1] - ir[0];
+      int dir_out = ir[1] - ir[2];
+      int djr_in  = jr[1] - jr[0];
+      int djr_out = jr[1] - jr[2];
+
+      char in_direction  = stencil_direction(dir_in,  djr_in,  crf, level_in  < level_old[ic]);
+      char out_direction = stencil_direction(dir_out, djr_out, crf, level_out < level_old[ic]);
+
+      // Determine the relative orientation of the neighboring cells.
+      // There are 12 possible ways across the cell: 4 Ls and 2 straight
+      // lines, each with 2 directions of traversal.
+      // Then the cell is refined and ordered according to the relative
+      // orientation and four-point Hilbert stencil.
+
+      // XXX NOTE that the four-point stencil varies depending upon
+      // the starting and ending point of the global Hilbert curve.
+      // The stencil applied here assumes the start at (0,0) and the end
+      // at (0,y_max). XXX WRONG
+
+      // Set the Stencil
+      if(in_direction == 'L' && (out_direction == 'B' || out_direction == 'R' || out_direction == 'F')) {
+         order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+      }
+      else if(in_direction == 'L' && (out_direction == 'T' || out_direction == 'W' )) {
+         order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+      }
+      else if(in_direction == 'L' && out_direction == 'M') {
+         order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
+      }
+      else if(in_direction == 'L' && out_direction == 'E') {
+         order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
+      }
+
+      else if(in_direction == 'B' && (out_direction == 'R' || out_direction == 'F' )) {
+         order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+      }
+      else if(in_direction == 'B' && (out_direction == 'L' || out_direction == 'T' || out_direction == 'W' )) {
+         order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+      }
+      else if(in_direction == 'B' && out_direction == 'M') {
+         order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
+      }
+      else if(in_direction == 'B' && out_direction == 'E') {
+         order[0] = SW; order[1] = NW; order[2] = SE; order[3] = NE;
+      }
+
+      else if(in_direction == 'R' && (out_direction == 'T' || out_direction == 'L' || out_direction == 'W' )) {
+         order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+      }
+      else if(in_direction == 'R' && (out_direction == 'B' || out_direction == 'F' )) {
+         order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+      }
+      else if(in_direction == 'R' && out_direction == 'M') {
+         order[0] = NE; order[1] = NW; order[2] = SE; order[3] = SW;
+      }
+      else if(in_direction == 'R' && out_direction == 'E') {
+         order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
+      }
+
+      else if(in_direction == 'T' && (out_direction == 'L' || out_direction == 'W' )) {
+         order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+      }
+      else if(in_direction == 'T' && (out_direction == 'R' || out_direction == 'B' || out_direction == 'F' )) {
+         order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+      }
+      else if(in_direction == 'T' && out_direction == 'M') {
+         order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
+      }
+      else if(in_direction == 'T' && out_direction == 'E') {
+         order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
+      }
+
+      else if(in_direction == 'M' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
+         order[0] = SW; order[1] = SE; order[2] = NE; order[3] = NW;
+      }
+      else if(in_direction == 'M' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
+         order[0] = SW; order[1] = NW; order[2] = NE; order[3] = SE;
+      }
+      else if(in_direction == 'M' && out_direction == 'E') {
+         order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE;
+      }
+
+      else if(in_direction == 'W' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
+         order[0] = NW; order[1] = NE; order[2] = SE; order[3] = SW;
+      }
+      else if(in_direction == 'W' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
+         order[0] = NW; order[1] = SW; order[2] = SE; order[3] = NE;
+      }
+      else if(in_direction == 'W' && out_direction == 'F') {
+         order[0] = NW; order[1] = NE; order[2] = SW; order[3] = SE;
+      }
+
+      else if(in_direction == 'F' && (out_direction == 'L' || out_direction == 'M' || out_direction == 'B') ) {
+         order[0] = SE; order[1] = NE; order[2] = NW; order[3] = SW;
+      }
+      else if(in_direction == 'F' && (out_direction == 'R' || out_direction == 'E' || out_direction == 'T') ) {
+         order[0] = SE; order[1] = SW; order[2] = NW; order[3] = NE;
+      }
+      else if(in_direction == 'F' && out_direction == 'W') {
+         order[0] = SE; order[1] = NE; order[2] = SW; order[3] = NW;
+      }
+
+      else if(in_direction == 'E' && (out_direction == 'L' || out_direction == 'W' || out_direction == 'T') ) {
+         order[0] = NE; order[1] = SE; order[2] = SW; order[3] = NW;
+      }
+      else if(in_direction == 'E' && (out_direction == 'R' || out_direction == 'F' || out_direction == 'B') ) {
+         order[0] = NE; order[1] = NW; order[2] = SW; order[3] = SE;
+      }
+      else if(in_direction == 'E' && out_direction == 'M') {
+         order[0] = NE; order[1] = SE; order[2] = NW; order[3] = SW;
+      }
+
+      else { // Default to a knot
+         order[0] = NW; order[1] = SE; order[2] = SW; order[3] = NE;
+         if (do_stencil_warning) {
+            printf("Nonlocal case for the stencil.\n");
+         }
+      }
+#endif
+
+   } // End local stencil version
+   else // Use Z-ordering for the curve.
+   { order[0] = SW; order[1] = SE; order[2] = NW; order[3] = NE; }
+
+}
+
+/*
+ * Build the lists of x-faces (between a cell and its right neighbor) and
+ * y-faces (between a cell and its top neighbor).
+ *
+ * For every face the level (the finer of the two adjacent cells) and an
+ * (i, j) index are recorded in xface_* / yface_*. A cell whose neighbor is
+ * one level finer contributes a second face for the other fine neighbor.
+ * Afterwards the face indices are shifted so that, per level, the minimum
+ * i and j become 0; the shift amounts are saved in i[xy]adjust / j[xy]adjust
+ * and the per-level min/max arrays are rebased accordingly.
+ *
+ * Sets nxface and nyface to the number of faces found.
+ */
+void Mesh::calc_face_list(void)
+{
+   // ---------------- x-faces: reset storage ----------------
+   xface_i.clear();
+   xface_j.clear();
+   xface_level.clear();
+
+   ixmin_level.clear();
+   ixmax_level.clear();
+   jxmin_level.clear();
+   jxmax_level.clear();
+   // Sentinels so that the first real face replaces them in the scan below.
+   ixmin_level.resize(levmx+1,  9999999);
+   ixmax_level.resize(levmx+1, -9999999);
+   jxmin_level.resize(levmx+1,  9999999);
+   jxmax_level.resize(levmx+1, -9999999);
+
+   ixadjust.clear();
+   ixadjust.resize(levmx+1);
+   jxadjust.clear();
+   jxadjust.resize(levmx+1);
+
+   // ---------------- x-faces: collect ----------------
+   int face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++) {
+      const int right = nrht[cell];
+      // A cell listed as its own right neighbor contributes no x-face.
+      if (right == cell) continue;
+
+      // When the neighbor is coarser its indices are doubled to express
+      // them at this cell's (finer) level.
+      const bool right_is_coarser = (level[right] < level[cell]);
+      const int scale = right_is_coarser ? 2 : 1;
+
+      // Have right face
+      xface_level.push_back(MAX(level[cell],level[right]));
+      xface_i.push_back(i[right]*scale);
+      if (right_is_coarser && is_upper(j[cell]) ) {
+         xface_j.push_back(j[right]*scale+1);
+      } else {
+         xface_j.push_back(j[right]*scale);
+      }
+      face_count++;
+
+      // Finer neighbor: add the face shared with the cell above it too.
+      if (level[right] > level[cell] && is_lower(j[right]) ) {
+         const int right_top = ntop[right];
+         if (right_top != right) {
+            xface_level.push_back(MAX(level[cell],level[right_top]));
+            xface_i.push_back(i[right_top]*scale);
+            xface_j.push_back(j[right_top]*scale);
+            face_count++;
+         }
+      }
+   }
+   nxface = face_count;
+
+   // ---------------- y-faces: reset storage ----------------
+   yface_i.clear();
+   yface_j.clear();
+   yface_level.clear();
+
+   iymin_level.clear();
+   iymax_level.clear();
+   jymin_level.clear();
+   jymax_level.clear();
+   iymin_level.resize(levmx+1,  9999999);
+   iymax_level.resize(levmx+1, -9999999);
+   jymin_level.resize(levmx+1,  9999999);
+   jymax_level.resize(levmx+1, -9999999);
+
+   iyadjust.clear();
+   iyadjust.resize(levmx+1);
+   jyadjust.clear();
+   jyadjust.resize(levmx+1);
+
+   // ---------------- y-faces: collect ----------------
+   face_count = 0;
+   for (int cell = 0; cell < (int)ncells; cell++) {
+      const int top = ntop[cell];
+      // A cell listed as its own top neighbor contributes no y-face.
+      if (top == cell) continue;
+
+      const bool top_is_coarser = (level[top] < level[cell]);
+      const int scale = top_is_coarser ? 2 : 1;
+
+      // Have top face
+      yface_level.push_back(MAX(level[cell],level[top]));
+      yface_j.push_back(j[top]*scale);
+      if (top_is_coarser && is_upper(i[cell]) ) {
+         yface_i.push_back(i[top]*scale+1);
+      } else {
+         yface_i.push_back(i[top]*scale);
+      }
+      face_count++;
+
+      // Finer neighbor: add the face shared with the cell to its right too.
+      if (level[top] > level[cell] && is_lower(i[top]) ) {
+         const int top_right = nrht[top];
+         if (top_right != top) {
+            yface_level.push_back(MAX(level[cell],level[top_right]));
+            yface_j.push_back(j[top_right]*scale);
+            yface_i.push_back(i[top_right]*scale);
+            face_count++;
+         }
+      }
+   }
+   nyface = face_count;
+
+   // ---------------- x-faces: per-level bounds ----------------
+   for (int f = 0; f < nxface; f++) {
+      const int lev = xface_level[f];
+
+      const int fi = xface_i[f];
+      if (fi < ixmin_level[lev]) ixmin_level[lev] = fi;
+      if (fi > ixmax_level[lev]) ixmax_level[lev] = fi;
+
+      const int fj = xface_j[f];
+      if (fj < jxmin_level[lev]) jxmin_level[lev] = fj;
+      if (fj > jxmax_level[lev]) jxmax_level[lev] = fj;
+   }
+
+   // Shift face indices so each level starts at (0,0).
+   for (int f = 0; f < nxface; f++) {
+      const int lev = xface_level[f];
+      if (ixmax_level[lev] >= ixmin_level[lev]) {
+         xface_i[f] -= ixmin_level[lev];
+         xface_j[f] -= jxmin_level[lev];
+      }
+   }
+
+   // Remember the shift and rebase the bounds to match.
+   for (int lev = 0; lev <= levmx; lev++) {
+      ixadjust[lev] = ixmin_level[lev];
+      jxadjust[lev] = jxmin_level[lev];
+      ixmax_level[lev] -= ixmin_level[lev];
+      jxmax_level[lev] -= jxmin_level[lev];
+      ixmin_level[lev] = 0;
+      jxmin_level[lev] = 0;
+   }
+
+   // ---------------- y-faces: per-level bounds ----------------
+   for (int f = 0; f < nyface; f++) {
+      const int lev = yface_level[f];
+
+      const int fi = yface_i[f];
+      if (fi < iymin_level[lev]) iymin_level[lev] = fi;
+      if (fi > iymax_level[lev]) iymax_level[lev] = fi;
+
+      const int fj = yface_j[f];
+      if (fj < jymin_level[lev]) jymin_level[lev] = fj;
+      if (fj > jymax_level[lev]) jymax_level[lev] = fj;
+   }
+
+   for (int f = 0; f < nyface; f++) {
+      const int lev = yface_level[f];
+      if (iymax_level[lev] >= iymin_level[lev]) {
+         yface_i[f] -= iymin_level[lev];
+         yface_j[f] -= jymin_level[lev];
+      }
+   }
+
+   for (int lev = 0; lev <= levmx; lev++) {
+      iyadjust[lev] = iymin_level[lev];
+      jyadjust[lev] = jymin_level[lev];
+      iymax_level[lev] -= iymin_level[lev];
+      jymax_level[lev] -= jymin_level[lev];
+      iymin_level[lev] = 0;
+      jymin_level[lev] = 0;
+   }
+
+}
+
+/*
+ * Build the x-face and y-face lists together with face-to-cell maps.
+ *
+ * For each face the two adjacent cells are recorded in
+ * map_[xy]face2cell_lower (the cell being visited) and
+ * map_[xy]face2cell_upper (its right / top neighbor), alongside the face
+ * level and (i, j) index in xface_* / yface_*. A cell whose neighbor is
+ * one level finer contributes a second face for the other fine neighbor.
+ * Face indices are then shifted so that, per level, the minimum i and j
+ * become 0; the shifts are saved in i[xy]adjust / j[xy]adjust and the
+ * per-level min/max arrays are rebased.
+ *
+ * Sets nxface and nyface to the number of faces found.
+ */
+void Mesh::calc_face_list_wmap(void)
+{
+   // ================= x direction: reset =================
+   map_xface2cell_lower.clear();
+   map_xface2cell_upper.clear();
+
+   xface_i.clear();
+   xface_j.clear();
+   xface_level.clear();
+
+   ixmin_level.clear();
+   ixmax_level.clear();
+   jxmin_level.clear();
+   jxmax_level.clear();
+   // Sentinel extremes; overwritten by the first face seen at each level.
+   ixmin_level.resize(levmx+1,  9999999);
+   ixmax_level.resize(levmx+1, -9999999);
+   jxmin_level.resize(levmx+1,  9999999);
+   jxmax_level.resize(levmx+1, -9999999);
+
+   ixadjust.clear();
+   ixadjust.resize(levmx+1);
+   jxadjust.clear();
+   jxadjust.resize(levmx+1);
+
+   // ================= x direction: collect faces =================
+   int nfaces = 0;
+   for (int lo = 0; lo < (int)ncells; lo++) {
+      const int hi = nrht[lo];
+      // Cell is its own right neighbor: no x-face to record.
+      if (hi == lo) continue;
+
+      // Neighbor indices are doubled when the neighbor is coarser.
+      const bool hi_coarser = (level[hi] < level[lo]);
+      const int mult = hi_coarser ? 2 : 1;
+
+      // Have right face
+      map_xface2cell_lower.push_back(lo);
+      map_xface2cell_upper.push_back(hi);
+      xface_level.push_back(MAX(level[lo],level[hi]));
+      xface_i.push_back(i[hi]*mult);
+      if (hi_coarser && is_upper(j[lo]) ) {
+         xface_j.push_back(j[hi]*mult+1);
+      } else {
+         xface_j.push_back(j[hi]*mult);
+      }
+      nfaces++;
+
+      // Finer right neighbor: second face with the cell above it.
+      if (level[hi] > level[lo] && is_lower(j[hi]) ) {
+         const int hi_top = ntop[hi];
+         if (hi_top != hi) {
+            map_xface2cell_lower.push_back(lo);
+            map_xface2cell_upper.push_back(hi_top);
+            xface_level.push_back(MAX(level[lo],level[hi_top]));
+            xface_i.push_back(i[hi_top]*mult);
+            xface_j.push_back(j[hi_top]*mult);
+            nfaces++;
+         }
+      }
+   }
+   nxface = nfaces;
+
+   // ================= y direction: reset =================
+   map_yface2cell_lower.clear();
+   map_yface2cell_upper.clear();
+
+   yface_i.clear();
+   yface_j.clear();
+   yface_level.clear();
+
+   iymin_level.clear();
+   iymax_level.clear();
+   jymin_level.clear();
+   jymax_level.clear();
+   iymin_level.resize(levmx+1,  9999999);
+   iymax_level.resize(levmx+1, -9999999);
+   jymin_level.resize(levmx+1,  9999999);
+   jymax_level.resize(levmx+1, -9999999);
+
+   iyadjust.clear();
+   iyadjust.resize(levmx+1);
+   jyadjust.clear();
+   jyadjust.resize(levmx+1);
+
+   // ================= y direction: collect faces =================
+   nfaces = 0;
+   for (int lo = 0; lo < (int)ncells; lo++) {
+      const int hi = ntop[lo];
+      // Cell is its own top neighbor: no y-face to record.
+      if (hi == lo) continue;
+
+      const bool hi_coarser = (level[hi] < level[lo]);
+      const int mult = hi_coarser ? 2 : 1;
+
+      // Have top face
+      map_yface2cell_lower.push_back(lo);
+      map_yface2cell_upper.push_back(hi);
+      yface_level.push_back(MAX(level[lo],level[hi]));
+      yface_j.push_back(j[hi]*mult);
+      if (hi_coarser && is_upper(i[lo]) ) {
+         yface_i.push_back(i[hi]*mult+1);
+      } else {
+         yface_i.push_back(i[hi]*mult);
+      }
+      nfaces++;
+
+      // Finer top neighbor: second face with the cell to its right.
+      if (level[hi] > level[lo] && is_lower(i[hi]) ) {
+         const int hi_right = nrht[hi];
+         if (hi_right != hi) {
+            map_yface2cell_lower.push_back(lo);
+            map_yface2cell_upper.push_back(hi_right);
+            yface_level.push_back(MAX(level[lo],level[hi_right]));
+            yface_j.push_back(j[hi_right]*mult);
+            yface_i.push_back(i[hi_right]*mult);
+            nfaces++;
+         }
+      }
+   }
+   nyface = nfaces;
+
+   // ================= x direction: bounds and rebasing =================
+   for (int n = 0; n < nxface; n++) {
+      const int flev = xface_level[n];
+
+      const int fi = xface_i[n];
+      if (fi < ixmin_level[flev]) ixmin_level[flev] = fi;
+      if (fi > ixmax_level[flev]) ixmax_level[flev] = fi;
+
+      const int fj = xface_j[n];
+      if (fj < jxmin_level[flev]) jxmin_level[flev] = fj;
+      if (fj > jxmax_level[flev]) jxmax_level[flev] = fj;
+   }
+
+   // Make face indices relative to the per-level minimum.
+   for (int n = 0; n < nxface; n++) {
+      const int flev = xface_level[n];
+      if (ixmax_level[flev] >= ixmin_level[flev]) {
+         xface_i[n] -= ixmin_level[flev];
+         xface_j[n] -= jxmin_level[flev];
+      }
+   }
+
+   // Save the offsets that were removed and rebase the bounds.
+   for (int flev = 0; flev <= levmx; flev++) {
+      ixadjust[flev] = ixmin_level[flev];
+      jxadjust[flev] = jxmin_level[flev];
+      ixmax_level[flev] -= ixmin_level[flev];
+      jxmax_level[flev] -= jxmin_level[flev];
+      ixmin_level[flev] = 0;
+      jxmin_level[flev] = 0;
+   }
+
+   // ================= y direction: bounds and rebasing =================
+   for (int n = 0; n < nyface; n++) {
+      const int flev = yface_level[n];
+
+      const int fi = yface_i[n];
+      if (fi < iymin_level[flev]) iymin_level[flev] = fi;
+      if (fi > iymax_level[flev]) iymax_level[flev] = fi;
+
+      const int fj = yface_j[n];
+      if (fj < jymin_level[flev]) jymin_level[flev] = fj;
+      if (fj > jymax_level[flev]) jymax_level[flev] = fj;
+   }
+
+   for (int n = 0; n < nyface; n++) {
+      const int flev = yface_level[n];
+      if (iymax_level[flev] >= iymin_level[flev]) {
+         yface_i[n] -= iymin_level[flev];
+         yface_j[n] -= jymin_level[flev];
+      }
+   }
+
+   for (int flev = 0; flev <= levmx; flev++) {
+      iyadjust[flev] = iymin_level[flev];
+      jyadjust[flev] = jymin_level[flev];
+      iymax_level[flev] -= iymin_level[flev];
+      jymax_level[flev] -= jymin_level[flev];
+      iymin_level[flev] = 0;
+      jymin_level[flev] = 0;
+   }
+
+}
+
+void Mesh::calc_face_list_wbidirmap(void)
+{ /* Rebuilds the x- and y-face lists of the mesh together with maps in both
+   * directions (face -> its lower/upper cell, cell -> its faces), then shifts
+   * the face i/j indices of every level so the smallest index on a level is 0.
+   * The shift applied per level is kept in ixadjust/jxadjust (x-faces) and
+   * iyadjust/jyadjust (y-faces). Every member vector written here is cleared
+   * first; nxface and nyface receive the number of faces created.
+   * is_upper()/is_lower() are defined elsewhere -- presumably they test whether
+   * an index is the upper/lower half of a refined pair; confirm there. */
+ map_xface2cell_lower.clear();
+ map_xface2cell_upper.clear();
+ // Cell-to-face maps hold -1 for every cell until a face is assigned.
+ map_xcell2face_left1.clear();
+ map_xcell2face_left2.clear();
+ map_xcell2face_right1.clear();
+ map_xcell2face_right2.clear();
+ map_xcell2face_left1.resize(ncells, -1);
+ map_xcell2face_left2.resize(ncells, -1);
+ map_xcell2face_right1.resize(ncells, -1);
+ map_xcell2face_right2.resize(ncells, -1);
+
+ xface_i.clear();
+ xface_j.clear();
+ xface_level.clear();
+ // Per-level index bounds, seeded with sentinels that any real index replaces.
+ ixmin_level.clear();
+ ixmax_level.clear();
+ jxmin_level.clear();
+ jxmax_level.clear();
+ ixmin_level.resize(levmx+1, 9999999);
+ ixmax_level.resize(levmx+1, -9999999);
+ jxmin_level.resize(levmx+1, 9999999);
+ jxmax_level.resize(levmx+1, -9999999);
+
+ ixadjust.clear();
+ ixadjust.resize(levmx+1);
+ jxadjust.clear();
+ jxadjust.resize(levmx+1);
+ // Pass 1 (x): one face per cell whose right neighbor is a different cell.
+ int iface=0;
+ for (int nz=0; nz<(int)ncells; nz++){
+ int nr = nrht[nz];
+ if (nr == nz) continue; // cell is its own right neighbor: no face is created
+
+ int ifactor = 1;
+ if (level[nr] < level[nz]) ifactor = 2; // right neighbor is coarser: its i/j are doubled below
+
+ // Have right face
+ map_xface2cell_lower.push_back(nz);
+ map_xface2cell_upper.push_back(nr);
+ xface_level.push_back(MAX(level[nz],level[nr]));
+ xface_i.push_back(i[nr]*ifactor);
+ if (level[nr] < level[nz] && is_upper(j[nz]) ) {
+ xface_j.push_back(j[nr]*ifactor+1);
+ } else {
+ xface_j.push_back(j[nr]*ifactor);
+ }
+ map_xcell2face_right1[nz] = iface;
+
+ iface++;
+ // Finer right neighbor with is_lower(j[nr]): add a second face against ntop[nr].
+ if (level[nr] > level[nz] && is_lower(j[nr]) ){
+ int ntr = ntop[nr];
+ if (ntr != nr) {
+ map_xface2cell_lower.push_back(nz);
+ map_xface2cell_upper.push_back(ntr);
+ xface_level.push_back(MAX(level[nz],level[ntr]));
+ xface_i.push_back(i[ntr]*ifactor);
+ xface_j.push_back(j[ntr]*ifactor);
+ map_xcell2face_right2[nz] = iface;
+
+ iface++;
+ }
+ }
+ }
+ nxface=iface;
+ // Pass 2 (x): left-face entries are copied from the right-face entries of the left neighbor.
+ for (int nz=0; nz<(int)ncells; nz++){
+ int nl = nlft[nz];
+ if (nl == nz) continue;
+
+ if (level[nl] < level[nz] && is_upper(j[nz])){
+ map_xcell2face_left1[nz] = map_xcell2face_right2[nl];
+ } else {
+ map_xcell2face_left1[nz] = map_xcell2face_right1[nl];
+ if (level[nl] > level[nz]){
+ map_xcell2face_left2[nz] = map_xcell2face_right1[ntop[nl]];
+ }
+ }
+
+ }
+ // Same construction for y-faces, using the top/bottom neighbors.
+ map_yface2cell_lower.clear();
+ map_yface2cell_upper.clear();
+
+ map_ycell2face_bot1.clear();
+ map_ycell2face_bot2.clear();
+ map_ycell2face_top1.clear();
+ map_ycell2face_top2.clear();
+ map_ycell2face_bot1.resize(ncells, -1);
+ map_ycell2face_bot2.resize(ncells, -1);
+ map_ycell2face_top1.resize(ncells, -1);
+ map_ycell2face_top2.resize(ncells, -1);
+
+ yface_i.clear();
+ yface_j.clear();
+ yface_level.clear();
+
+ iymin_level.clear();
+ iymax_level.clear();
+ jymin_level.clear();
+ jymax_level.clear();
+ iymin_level.resize(levmx+1, 9999999);
+ iymax_level.resize(levmx+1, -9999999);
+ jymin_level.resize(levmx+1, 9999999);
+ jymax_level.resize(levmx+1, -9999999);
+
+ iyadjust.clear();
+ iyadjust.resize(levmx+1);
+ jyadjust.clear();
+ jyadjust.resize(levmx+1);
+ // Pass 1 (y): one face per cell whose top neighbor is a different cell.
+ iface=0;
+ for (int nz=0; nz<(int)ncells; nz++){
+ int nt = ntop[nz];
+ if (nt == nz) continue; // cell is its own top neighbor: no face is created
+
+ int ifactor = 1;
+ if (level[nt] < level[nz]) ifactor = 2; // top neighbor is coarser: its i/j are doubled below
+
+ // Have top face
+ // printf("DEBUG -- iface %d lower nz %d upper nr %d\n",iface,nz,nt);
+ map_yface2cell_lower.push_back(nz);
+ map_yface2cell_upper.push_back(nt);
+ yface_level.push_back(MAX(level[nz],level[nt]));
+ yface_j.push_back(j[nt]*ifactor);
+ if (level[nt] < level[nz] && is_upper(i[nz]) ) {
+ yface_i.push_back(i[nt]*ifactor+1);
+ } else{
+ yface_i.push_back(i[nt]*ifactor);
+ }
+ map_ycell2face_top1[nz] = iface;
+
+ iface++;
+ // Finer top neighbor with is_lower(i[nt]): add a second face against nrht[nt].
+ if (level[nt] > level[nz] &&is_lower(i[nt]) ){
+ int nrt = nrht[nt];
+ if (nrt != nt) {
+ map_yface2cell_lower.push_back(nz);
+ map_yface2cell_upper.push_back(nrt);
+ yface_level.push_back(MAX(level[nz],level[nrt]));
+ yface_j.push_back(j[nrt]*ifactor);
+ yface_i.push_back(i[nrt]*ifactor);
+ map_ycell2face_top2[nz] = iface;
+
+ iface++;
+ }
+ }
+ }
+ nyface=iface;
+ // Pass 2 (y): bottom-face entries are copied from the top-face entries of the bottom neighbor.
+ for (int nz=0; nz<(int)ncells; nz++){
+ int nb = nbot[nz];
+ if (nb == nz) continue;
+
+ if (level[nb] < level[nz] && is_upper(i[nz])){
+ map_ycell2face_bot1[nz] = map_ycell2face_top2[nb];
+ } else {
+ map_ycell2face_bot1[nz] = map_ycell2face_top1[nb];
+ if (level[nb] > level[nz]){
+ map_ycell2face_bot2[nz] = map_ycell2face_top1[nrht[nb]];
+ }
+ }
+
+ }
+ // Per-level min/max of the x-face indices.
+ for (int iface=0; iface < nxface; iface++){
+ int fl = xface_level[iface];
+
+ int fi = xface_i[iface];
+ if (fi < ixmin_level[fl]) ixmin_level[fl] = fi;
+ if (fi > ixmax_level[fl]) ixmax_level[fl] = fi;
+
+ int fj = xface_j[iface];
+ if (fj < jxmin_level[fl]) jxmin_level[fl] = fj;
+ if (fj > jxmax_level[fl]) jxmax_level[fl] = fj;
+ }
+ // Shift the x-face indices so each level's minimum becomes 0.
+ for (int iface=0; iface < nxface; iface++){
+ int fl = xface_level[iface];
+ if (ixmax_level[fl] < ixmin_level[fl]) continue; // bounds still hold the sentinels
+
+ xface_i[iface] -= ixmin_level[fl];
+ xface_j[iface] -= jxmin_level[fl];
+ }
+ // Record the shift per level and rebase the bounds; a level without faces keeps sentinel-derived values here.
+ for (int fl = 0; fl <= levmx; fl++){
+ ixadjust[fl] = ixmin_level[fl];
+ jxadjust[fl] = jxmin_level[fl];
+ ixmax_level[fl] -= ixmin_level[fl];;
+ jxmax_level[fl] -= jxmin_level[fl];
+ ixmin_level[fl] = 0;
+ jxmin_level[fl] = 0;
+ }
+ // Per-level min/max of the y-face indices.
+ for (int iface=0; iface < nyface; iface++){
+ int fl = yface_level[iface];
+
+ int fi = yface_i[iface];
+ if (fi < iymin_level[fl]) iymin_level[fl] = fi;
+ if (fi > iymax_level[fl]) iymax_level[fl] = fi;
+
+ int fj = yface_j[iface];
+ if (fj < jymin_level[fl]) jymin_level[fl] = fj;
+ if (fj > jymax_level[fl]) jymax_level[fl] = fj;
+ }
+ // Shift the y-face indices so each level's minimum becomes 0.
+ for (int iface=0; iface < nyface; iface++){
+ int fl = yface_level[iface];
+ if (iymax_level[fl] < iymin_level[fl]) continue; // bounds still hold the sentinels
+
+ yface_i[iface] -= iymin_level[fl];
+ yface_j[iface] -= jymin_level[fl];
+ }
+ // Record the shift per level and rebase the bounds, as done for the x-faces.
+ for (int fl = 0; fl <= levmx; fl++){
+ iyadjust[fl] = iymin_level[fl];
+ jyadjust[fl] = jymin_level[fl];
+ iymax_level[fl] -= iymin_level[fl];;
+ jymax_level[fl] -= jymin_level[fl];
+ iymin_level[fl] = 0;
+ jymin_level[fl] = 0;
+ }
+
+}
+
+/*
+ * Returns a freshly allocated 2-D int array (from genmatrix) covering the
+ * x-face index range of level `lev`: entry [j][i] is 1 where an x-face of that
+ * level exists and -1 everywhere else. The table is also printed when DEBUG
+ * or print_output is set. The array is handed to the caller; it is not freed
+ * here.
+ */
+int **Mesh::get_xface_flag(int lev, bool print_output)
+{
+   const int nrows = jxmax_level[lev]+1;
+   const int ncols = ixmax_level[lev]+1;
+
+   int **xface_flag = (int **)genmatrix(nrows, ncols, sizeof(int));
+
+   // Start with every slot marked "no face".
+   for (int row = 0; row < nrows; row++){
+      int *flag_row = xface_flag[row];
+      for (int col = 0; col < ncols; col++){
+         flag_row[col] = -1;
+      }
+   }
+
+   // Mark the slots occupied by faces of the requested level.
+   for (int iface = 0; iface < nxface; iface++){
+      if (xface_level[iface] != lev) continue;
+      xface_flag[xface_j[iface]][xface_i[iface]] = 1;
+   }
+
+   if (DEBUG || print_output) {
+      printf("DEBUG -- x face_flag for level %d\n",lev);
+      printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",ncols,nrows);
+
+      // Column header
+      printf(" ");
+      for (int col = 0; col < ncols; col++){
+         printf(" %4d ",col);
+      }
+      printf("\n");
+
+      // Rows are printed from the highest j down to 0.
+      for (int row = nrows-1; row >= 0; row--){
+         printf("DEBUG -- j %4d: ",row);
+         for (int col = 0; col < ncols; col++){
+            const int flag = xface_flag[row][col];
+            if (flag < 0){
+               printf(" ");
+            } else {
+               printf(" %4d ", flag);
+            }
+         }
+         printf("\n");
+      }
+   }
+
+   return xface_flag;
+}
+
+/*
+ * Returns a freshly allocated 2-D int array (from genmatrix) covering the
+ * y-face index range of level `lev`: entry [j][i] is 1 where a y-face of that
+ * level exists and -1 everywhere else. The table is also printed when DEBUG
+ * or print_output is set. The array is handed to the caller; it is not freed
+ * here.
+ */
+int **Mesh::get_yface_flag(int lev, bool print_output)
+{
+   const int nrows = jymax_level[lev]+1;
+   const int ncols = iymax_level[lev]+1;
+
+   int **yface_flag = (int **)genmatrix(nrows, ncols, sizeof(int));
+
+   // Start with every slot marked "no face".
+   for (int row = 0; row < nrows; row++){
+      int *flag_row = yface_flag[row];
+      for (int col = 0; col < ncols; col++){
+         flag_row[col] = -1;
+      }
+   }
+
+   // Mark the slots occupied by faces of the requested level.
+   for (int iface = 0; iface < nyface; iface++){
+      if (yface_level[iface] != lev) continue;
+      yface_flag[yface_j[iface]][yface_i[iface]] = 1;
+   }
+
+   if (DEBUG || print_output) {
+      printf("DEBUG -- y face_flag for level %d\n",lev);
+      printf("DEBUG -- sizes isize+1 %d jsize+1 %d\n",ncols,nrows);
+
+      // Column header
+      printf(" ");
+      for (int col = 0; col < ncols; col++){
+         printf(" %4d ",col);
+      }
+      printf("\n");
+
+      // Rows are printed from the highest j down to 0.
+      for (int row = nrows-1; row >= 0; row--){
+         printf("DEBUG -- j %4d: ",row);
+         for (int col = 0; col < ncols; col++){
+            const int flag = yface_flag[row][col];
+            if (flag < 0){
+               printf(" ");
+            } else {
+               printf(" %4d ", flag);
+            }
+         }
+         printf("\n");
+      }
+   }
+
+   return yface_flag;
+}
+
+void Mesh::get_flat_grid(int lev, int ***zone_flag_base, int ***zone_cell_base)
+{ /* Builds two dense jsize x isize int arrays for level `lev` and returns them
+   * through zone_flag_base and zone_cell_base (both allocated with genmatrix;
+   * nothing is freed here, so the caller owns them).
+   *   zone_flag[j][i] = 1 at the position of every cell that borders an x- or
+   *                     y-face of level `lev`, -1 elsewhere.
+   *   zone_cell[j][i] = a cell index for that position (and for one adjacent
+   *                     position, see below), -1 elsewhere.
+   * Cells at level `lev` are placed at (i,j) minus the adjust values; cells at
+   * another level use doubled i/j. The +4 on the sizes and -2 on the adjust
+   * values presumably leave a two-entry border on each side -- TODO confirm.
+   * Relies on the face lists and maps built by the calc_face_list routines. */
+ int isize = ixmax_level[lev]+4;
+ int jsize = jymax_level[lev]+4;
+ int iadjust = ixadjust[lev]-2;
+ int jadjust = jyadjust[lev]-2;
+
+ //printf("DEBUG -- sizes isize %d jsize %d\n",isize,jsize);
+ //printf("DEBUG -- adjust ixadjust %d jxadjust %d\n",ixadjust[lev],jxadjust[lev]);
+ //printf("DEBUG -- adjust iyadjust %d jyadjust %d\n",iyadjust[lev],jyadjust[lev]);
+
+ (*zone_flag_base) = (int **)genmatrix(jsize, isize, sizeof(int));
+ // Every entry starts as -1 ("nothing here").
+ int **zone_flag = *zone_flag_base;
+ for (int jj=0; jj<jsize; jj++){
+ for (int ii=0; ii<isize; ii++){
+ zone_flag[jj][ii] = -1;
+ }
+ }
+
+ (*zone_cell_base) = (int **)genmatrix(jsize, isize, sizeof(int));
+
+ int **zone_cell = *zone_cell_base;
+ for (int jj=0; jj<jsize; jj++){
+ for (int ii=0; ii<isize; ii++){
+ zone_cell[jj][ii] = -1;
+ }
+ }
+ // x-faces of this level: place the cell on each side of the face.
+ for (int iface=0; iface < nxface; iface++){
+ if (xface_level[iface] == lev){
+ int nz1 = map_xface2cell_lower[iface];
+ int nz2 = map_xface2cell_upper[iface];
+ // NOTE(review): nlft[nz1] is a cell index but is compared with REAL_CELL; confirm a celltype lookup was not intended (same pattern for nrht/nbot/ntop below).
+ if (lev == level[nz1]) {
+ int iii = i[nz1]-iadjust;
+ int jjj = j[nz1]-jadjust;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz1;
+ if (nlft[nz1] != REAL_CELL) {
+ zone_cell[jjj][iii-1] = nlft[nz1];
+ }
+ } else { // lower cell is at another level: position comes from doubled indices
+ int iii = i[nz1]*2-iadjust+1;
+ int jjj = j[nz1]*2-jadjust;
+ if (is_upper(j[nz2])) jjj += 1;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz1;
+ zone_cell[jjj][iii-1] = nz1;
+ }
+ if (lev == level[nz2]) {
+ int iii = i[nz2]-iadjust;
+ int jjj = j[nz2]-jadjust;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz2;
+ if (nrht[nz2] != REAL_CELL) {
+ zone_cell[jjj][iii+1] = nrht[nz2];
+ }
+ } else { // upper cell is at another level: position comes from doubled indices
+ int iii = i[nz2]*2-iadjust;
+ int jjj = j[nz2]*2-jadjust;
+ if (is_upper(j[nz1])) jjj += 1;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz2;
+ zone_cell[jjj][iii+1] = nz2;
+ }
+ }
+ }
+ // y-faces of this level: same placement, with the roles of i and j swapped.
+ for (int iface=0; iface < nyface; iface++){
+ if (yface_level[iface] == lev){
+ int nz1 = map_yface2cell_lower[iface];
+ int nz2 = map_yface2cell_upper[iface];
+
+ if (lev == level[nz1]) {
+ int iii = i[nz1]-iadjust;
+ int jjj = j[nz1]-jadjust;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz1;
+ if (nbot[nz1] != REAL_CELL) {
+ zone_cell[jjj-1][iii] = nbot[nz1];
+ }
+ } else {
+ int iii = i[nz1]*2-iadjust;
+ int jjj = j[nz1]*2-jadjust+1;
+ if (is_upper(i[nz2])) iii += 1;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz1;
+ zone_cell[jjj-1][iii] = nz1;
+ }
+ if (lev == level[nz2]) {
+ int iii = i[nz2]-iadjust;
+ int jjj = j[nz2]-jadjust;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz2;
+ if (ntop[nz2] != REAL_CELL) {
+ zone_cell[jjj+1][iii] = ntop[nz2];
+ }
+ } else {
+ int iii = i[nz2]*2-iadjust;
+ int jjj = j[nz2]*2-jadjust;
+ if (is_upper(i[nz1])) iii += 1;
+ zone_flag[jjj][iii] = 1;
+ zone_cell[jjj][iii] = nz2;
+ zone_cell[jjj+1][iii] = nz2;
+ }
+ }
+ }
+ // Debug dump; the loop variables i and j below shadow the members i and j used above.
+ if (DEBUG) {
+ printf("DEBUG -- zone_flag for level %d\n",lev);
+ printf("DEBUG -- sizes isize %d jsize %d\n",isize,jsize);
+ for (int j=jsize-1; j>=0; j--){
+ for (int i=0; i<isize; i++){
+ if (zone_flag[j][i] >= 0){
+ printf(" zone_flag_check[%d][%d] = 1;\n",j,i);
+ }
+ }
+ }
+ for (int j=jsize-1; j>=0; j--){
+ for (int i=0; i<isize; i++){
+ if (zone_cell[j][i] >= 0){
+ printf(" zone_cell_check[%d][%d] = %d;\n",j,i,zone_cell[j][i]);
+ }
+ }
+ }
+ // zone_flag as a table, highest j first.
+ printf(" ");
+ for (int i=0; i<isize; i++){
+ printf(" %4d ",i);
+ }
+ printf("\n");
+
+ for (int j=jsize-1; j>=0; j--){
+
+ printf("DEBUG -- j %4d: ",j);
+ for (int i=0; i<isize; i++){
+ if (zone_flag[j][i] >= 0){
+ printf(" %4d ", zone_flag[j][i]);
+ } else {
+ printf(" ");
+ }
+ }
+ printf("\n");
+ }
+ // zone_cell as a table, highest j first.
+ printf("DEBUG -- zone_cell for level %d\n",lev);
+
+ printf(" ");
+ for (int i=0; i<isize; i++){
+ printf(" %4d ",i);
+ }
+ printf("\n");
+
+ for (int j=jsize-1; j>=0; j--){
+
+ printf("DEBUG -- j %4d: ",j);
+ for (int i=0; i<isize; i++){
+ if (zone_cell[j][i] >= 0){
+ printf(" %4d ", zone_cell[j][i]);
+ } else {
+ printf(" ");
+ }
+ }
+ printf("\n");
+ }
+ }
+}
+
+/*
+ * Empties every face<->cell map vector for both directions. Only clear() is
+ * called; the face index/level vectors and per-level bounds are not touched.
+ */
+void Mesh::calc_face_list_clearmaps()
+{
+   // cell -> face maps, x direction
+   map_xcell2face_left1.clear();
+   map_xcell2face_left2.clear();
+   map_xcell2face_right1.clear();
+   map_xcell2face_right2.clear();
+
+   // cell -> face maps, y direction
+   map_ycell2face_bot1.clear();
+   map_ycell2face_bot2.clear();
+   map_ycell2face_top1.clear();
+   map_ycell2face_top2.clear();
+
+   // face -> cell maps, both directions
+   map_xface2cell_lower.clear();
+   map_xface2cell_upper.clear();
+   map_yface2cell_lower.clear();
+   map_yface2cell_upper.clear();
+}
+
+/*
+ * Prints one timer through parallel_output().
+ *
+ * category    - which timer to report; also indexes mesh_timer_descriptor.
+ * device_type - MESH_DEVICE_CPU reads the CPU timer, anything else the GPU timer.
+ * timer_level - nesting depth; the label is indented by 2*timer_level characters
+ *               taken from `blank` and the value is passed on as the output level.
+ *
+ * The label is only formatted when mype == 0; elsewhere an empty string is
+ * handed to parallel_output().
+ */
+void Mesh::timer_output(mesh_timer_category category, mesh_device_types device_type, int timer_level)
+{
+   const bool is_cpu = (device_type == MESH_DEVICE_CPU);
+
+   double local_time = 0.0;
+   if (is_cpu){
+      local_time = get_cpu_timer(category);
+   } else {
+      local_time = get_gpu_timer(category);
+   }
+
+   // Was initialised to the two-character literal "/0"; an empty string is what was meant.
+   char string[80] = "";
+
+   if (mype == 0) {
+      const char *blank=" ";
+      const char *device_label = is_cpu ? "CPU" : "GPU";
+
+      // snprintf keeps the write inside `string` whatever the indent or descriptor length.
+      snprintf(string, sizeof(string), "%s: %.*s%-30.30s\t",
+               device_label, 2*timer_level, blank, mesh_timer_descriptor[category]);
+   }
+
+   parallel_output(string, local_time, timer_level, "s");
+}
+
+/*
+ * Collects one double per process on process 0 and prints it there.
+ *
+ * string       - label printed first, followed by a tab.
+ * local_value  - this process's value.
+ * output_level - each value is preceded by 2*output_level characters of `blank`.
+ * units        - text printed after the values.
+ *
+ * With at most 4 processes every value is printed; otherwise the values are
+ * sorted and min/median/max are printed. Without HAVE_MPI (or with a single
+ * process) only the local value is available.
+ */
+void Mesh::parallel_output(const char *string, double local_value, int output_level, const char *units)
+{
+   vector<double> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      MPI_Gather(&local_value, 1, MPI_DOUBLE, &global_values[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype != 0) return;   // only process 0 holds the gathered values
+
+   const char *blank=" ";
+
+   printf("%s\t",string);
+   if (numpe <= 4) {
+      for(int ip = 0; ip < numpe; ip++){
+         printf("%.*s%8.4f\t", 2*output_level, blank, global_values[ip]);
+      }
+      printf("%s\n",units);
+      return;
+   }
+
+   sort(global_values.begin(),global_values.end());
+
+   const int half_value = numpe/2;
+   double median_value;
+   if (numpe%2 == 0) {
+      // Even count: average the two middle entries.
+      median_value = (global_values[half_value-1]+global_values[half_value])/2.0;
+   } else {
+      // Odd count: the middle entry of the 0-based sorted list is numpe/2
+      // (the previous half_value+1 picked the entry above the median).
+      median_value = global_values[half_value];
+   }
+   printf("%.*s%8.4f\t%.*s%8.4f\t%.*s%8.4f %s min/median/max\n",
+          2*output_level, blank, global_values[0],
+          2*output_level, blank, median_value,
+          2*output_level, blank, global_values[numpe-1],
+          units);
+}
+
+/*
+ * Collects one long long per process on process 0 and prints it there.
+ *
+ * string       - label printed first, followed by a tab.
+ * local_value  - this process's value.
+ * output_level - each value is preceded by 2*output_level characters of `blank`.
+ * units        - text printed after the values.
+ *
+ * With at most 4 processes every value is printed; otherwise the values are
+ * sorted and min/median/max are printed (the even-count median uses integer
+ * division). Without HAVE_MPI (or with a single process) only the local value
+ * is available.
+ */
+void Mesh::parallel_output(const char *string, long long local_value, int output_level, const char *units)
+{
+   vector<long long> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      MPI_Gather(&local_value, 1, MPI_LONG_LONG, &global_values[0], 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype != 0) return;   // only process 0 holds the gathered values
+
+   const char *blank=" ";
+
+   printf("%s\t",string);
+   if (numpe <= 4) {
+      for(int ip = 0; ip < numpe; ip++){
+         printf("%.*s%10lld\t", 2*output_level, blank, global_values[ip]);
+      }
+      printf("%s\n",units);
+      return;
+   }
+
+   sort(global_values.begin(),global_values.end());
+
+   const int half_value = numpe/2;
+   long long median_value;
+   if (numpe%2 == 0) {
+      // Even count: average the two middle entries.
+      median_value = (global_values[half_value-1]+global_values[half_value])/2;
+   } else {
+      // Odd count: the middle entry of the 0-based sorted list is numpe/2
+      // (the previous half_value+1 picked the entry above the median).
+      median_value = global_values[half_value];
+   }
+   printf("%.*s%10lld\t%.*s%10lld\t%.*s%10lld %s min/median/max\n",
+          2*output_level, blank, global_values[0],
+          2*output_level, blank, median_value,
+          2*output_level, blank, global_values[numpe-1],
+          units);
+}
+
+/*
+ * Collects one int per process on process 0 and prints it there.
+ *
+ * string       - label printed first, followed by a tab.
+ * local_value  - this process's value.
+ * output_level - each value is preceded by 2*output_level characters of `blank`.
+ * units        - text printed after the values.
+ *
+ * With at most 4 processes every value is printed; otherwise the values are
+ * sorted and min/median/max are printed (the even-count median uses integer
+ * division). Without HAVE_MPI (or with a single process) only the local value
+ * is available.
+ */
+void Mesh::parallel_output(const char *string, int local_value, int output_level, const char *units)
+{
+   vector<int> global_values(numpe);
+   global_values[0] = local_value;
+#ifdef HAVE_MPI
+   if (numpe > 1) {
+      MPI_Gather(&local_value, 1, MPI_INT, &global_values[0], 1, MPI_INT, 0, MPI_COMM_WORLD);
+   }
+#endif
+   if (mype != 0) return;   // only process 0 holds the gathered values
+
+   const char *blank=" ";
+
+   printf("%s\t",string);
+   if (numpe <= 4) {
+      for(int ip = 0; ip < numpe; ip++){
+         printf("%.*s%10d\t", 2*output_level, blank, global_values[ip]);
+      }
+      printf("%s\n",units);
+      return;
+   }
+
+   sort(global_values.begin(),global_values.end());
+
+   const int half_value = numpe/2;
+   int median_value;
+   if (numpe%2 == 0) {
+      // Even count: average the two middle entries.
+      median_value = (global_values[half_value-1]+global_values[half_value])/2;
+   } else {
+      // Odd count: the middle entry of the 0-based sorted list is numpe/2
+      // (the previous half_value+1 picked the entry above the median).
+      median_value = global_values[half_value];
+   }
+   printf("%.*s%10d\t%.*s%10d\t%.*s%10d %s min/median/max\n",
+          2*output_level, blank, global_values[0],
+          2*output_level, blank, median_value,
+          2*output_level, blank, global_values[numpe-1],
+          units);
+}
+
+const int CRUX_MESH_VERSION = 103; // written as int_vals[0] in a checkpoint and compared against it on restore
+const int num_int_dist_vals = 3; // length of int_dist_vals: ncells, ncells_ghost, offtile_local_count
+const int num_int_vals = 3; // length of int_vals: CRUX_MESH_VERSION, ndim, levmx
+const int num_double_vals = 1; // length of double_vals: offtile_ratio_local
+
+/*
+ * Returns a byte count for the mesh part of a checkpoint: the scalar arrays,
+ * two counter arrays, two timer arrays and three ints per cell.
+ *
+ * The second timer term used sizeof(long). store_checkpoint() registers both
+ * timer arrays with 8-byte elements and the restore debug output prints
+ * gpu_timers with %lld, so sizeof(long long) is used to keep the count right
+ * on platforms where long is 4 bytes.
+ */
+size_t Mesh::get_checkpoint_size(void)
+{
+   size_t nsize = 0;
+
+   // Scalars packed into int_dist_vals / int_vals / double_vals
+   nsize += num_int_dist_vals*sizeof(int);
+   nsize += num_int_vals*sizeof(int);
+   nsize += num_double_vals*sizeof(double);
+
+   // Presumably cpu_counters and gpu_counters (4-byte elements in store_checkpoint)
+   nsize += 2*MESH_COUNTER_SIZE*sizeof(int);
+
+   // Presumably cpu_timers and gpu_timers (8-byte elements in store_checkpoint)
+   nsize += MESH_TIMER_SIZE*sizeof(double);
+   nsize += MESH_TIMER_SIZE*sizeof(long long);
+
+   // Three ints per cell; which arrays these are is not visible here -- TODO confirm
+   nsize += ncells*3*sizeof(int);
+
+   return(nsize);
+}
+
+/*
+ * Writes the mesh state to a checkpoint through `crux`.
+ *
+ * The memory capacity of `level` is stored first under the name "storage".
+ * Scalar members are then packed into small local arrays, those arrays and
+ * the counter/timer arrays are registered in mesh_memory for the duration of
+ * the store, the whole mesh_memory database is written with
+ * store_MallocPlus(), and the temporary registrations are removed again.
+ */
+void Mesh::store_checkpoint(Crux *crux)
+{
+   // The capacity is stored separately so a restore can allocate before reading
+   int storage = mesh_memory.get_memory_capacity(level);
+   crux->store_named_ints("storage", 7, &storage, 1);
+
+   // Values that differ from process to process
+   int int_dist_vals[num_int_dist_vals];
+   int_dist_vals[0] = (int)ncells;
+   int_dist_vals[1] = (int)ncells_ghost;
+   int_dist_vals[2] = offtile_local_count;
+
+   // Values registered as replicated
+   int int_vals[num_int_vals];
+   int_vals[0] = CRUX_MESH_VERSION;
+   int_vals[1] = ndim;
+   int_vals[2] = levmx;
+
+   double double_vals[num_double_vals];
+   double_vals[0] = offtile_ratio_local;
+
+   const int distributed_flags = RESTART_DATA;
+   const int replicated_flags  = RESTART_DATA | REPLICATED_DATA;
+
+   // Register the arrays so store_MallocPlus() picks them up
+   mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", distributed_flags);
+   mesh_memory.memory_add(int_vals,      (size_t)num_int_vals,      4, "mesh_int_vals",      replicated_flags);
+   mesh_memory.memory_add(double_vals,   (size_t)num_double_vals,   8, "mesh_double_vals",   distributed_flags);
+   mesh_memory.memory_add(cpu_counters,  (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters",  distributed_flags);
+   mesh_memory.memory_add(gpu_counters,  (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters",  distributed_flags);
+   mesh_memory.memory_add(cpu_timers,    (size_t)MESH_TIMER_SIZE,   8, "mesh_cpu_timers",    distributed_flags);
+   mesh_memory.memory_add(gpu_timers,    (size_t)MESH_TIMER_SIZE,   8, "mesh_gpu_timers",    distributed_flags);
+
+   crux->store_MallocPlus(mesh_memory);
+
+   // The local arrays go out of scope on return, so drop their registrations
+   mesh_memory.memory_remove(int_dist_vals);
+   mesh_memory.memory_remove(int_vals);
+   mesh_memory.memory_remove(double_vals);
+   mesh_memory.memory_remove(cpu_counters);
+   mesh_memory.memory_remove(gpu_counters);
+   mesh_memory.memory_remove(cpu_timers);
+   mesh_memory.memory_remove(gpu_timers);
+}
+
+/*
+ * Reads the mesh state back from a checkpoint through `crux`.
+ *
+ * Steps: read the "storage" capacity, delete the neighbor/celltype arrays and
+ * allocate(storage), register temporary scalar arrays plus the counter/timer
+ * arrays in mesh_memory, restore the database with restore_MallocPlus(),
+ * remove the temporary registrations, verify the stored version and copy the
+ * scalars back into the members.
+ *
+ * A version mismatch is fatal. The process previously ended with exit(0),
+ * which reports success to the caller; it now exits with status 1.
+ *
+ * When DEBUG_RESTORE_VALS is defined (and non-zero) the restored values are
+ * printed on mype == 0.
+ */
+void Mesh::restore_checkpoint(Crux *crux)
+{
+   int storage;
+   crux->restore_named_ints("storage", 7, &storage, 1);
+
+   // Create memory for reading data into
+   int int_dist_vals[num_int_dist_vals];
+   int int_vals[num_int_vals];
+   double double_vals[num_double_vals];
+
+   mesh_memory.memory_delete(nlft);
+   mesh_memory.memory_delete(nrht);
+   mesh_memory.memory_delete(nbot);
+   mesh_memory.memory_delete(ntop);
+   mesh_memory.memory_delete(celltype);
+
+   nlft = NULL;
+   nrht = NULL;
+   ntop = NULL;
+   nbot = NULL;
+   celltype = NULL;
+
+   // Resize is a mesh method
+   // resize(storage);
+   // memory_reset_ptrs();
+   allocate (storage);
+
+   int flags = RESTART_DATA;
+   // Now add memory entries to database for restoring checkpoint
+   mesh_memory.memory_add(int_dist_vals, (size_t)num_int_dist_vals, 4, "mesh_int_dist_vals", flags);
+   flags = RESTART_DATA | REPLICATED_DATA;
+   mesh_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "mesh_int_vals", flags);
+   // NOTE(review): store_checkpoint() registers mesh_double_vals with RESTART_DATA only;
+   // the flags are left as they were here -- confirm which one is intended.
+   mesh_memory.memory_add(double_vals, (size_t)num_double_vals, 8, "mesh_double_vals", flags);
+
+   flags = RESTART_DATA;
+   mesh_memory.memory_add(cpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_cpu_counters", flags);
+   mesh_memory.memory_add(gpu_counters, (size_t)MESH_COUNTER_SIZE, 4, "mesh_gpu_counters", flags);
+
+   mesh_memory.memory_add(cpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_cpu_timers", flags);
+   mesh_memory.memory_add(gpu_timers, (size_t)MESH_TIMER_SIZE, 8, "mesh_gpu_timers", flags);
+
+   // Restore MallocPlus memory database
+   crux->restore_MallocPlus(mesh_memory);
+
+   // Remove memory entries from database now that data is restored
+   mesh_memory.memory_remove(int_dist_vals);
+   mesh_memory.memory_remove(int_vals);
+   mesh_memory.memory_remove(double_vals);
+   mesh_memory.memory_remove(cpu_counters);
+   mesh_memory.memory_remove(gpu_counters);
+   mesh_memory.memory_remove(cpu_timers);
+   mesh_memory.memory_remove(gpu_timers);
+
+   // Check version number
+   if (int_vals[ 0] != CRUX_MESH_VERSION) {
+      printf("CRUX version mismatch for mesh data, version on file is %d, version in code is %d\n",
+             int_vals[0], CRUX_MESH_VERSION);
+      exit(1);   // failure status: the checkpoint cannot be used
+   }
+
+   // Copy out the per-process scalar values
+   ncells = int_dist_vals[ 0];
+   ncells_ghost = int_dist_vals[ 1];
+   offtile_local_count = int_dist_vals[ 2];
+
+   // Copy out the replicated scalar values
+   ndim = int_vals[ 1];
+   levmx = int_vals[ 2];
+
+   offtile_ratio_local = double_vals[0];
+
+#ifdef DEBUG_RESTORE_VALS
+   // One block replaces the five consecutive debug blocks; the text printed and its order are unchanged.
+   if (DEBUG_RESTORE_VALS && mype == 0) {
+      const char *int_dist_vals_descriptor[num_int_dist_vals] = {
+         "ncells",
+         "ncells_ghost",
+         "offtile_local_count"
+      };
+      const char *int_vals_descriptor[num_int_vals] = {
+         "CRUX_MESH_VERSION",
+         "ndim",
+         "levmx",
+      };
+      const char *double_vals_descriptor[num_double_vals] = {
+         "offtile_ratio_local"
+      };
+
+      printf("\n");
+      printf(" === Restored mesh int_dist_vals ===\n");
+      for (int i = 0; i < num_int_dist_vals; i++){
+         printf(" %-30s %d\n",int_dist_vals_descriptor[i], int_dist_vals[i]);
+      }
+      printf(" === Restored mesh int_vals ===\n");
+      for (int i = 0; i < num_int_vals; i++){
+         printf(" %-30s %d\n",int_vals_descriptor[i], int_vals[i]);
+      }
+      printf(" === Restored mesh int_vals ===\n");
+      printf("\n");
+
+      printf("\n");
+      printf(" === Restored mesh double_vals ===\n");
+      for (int i = 0; i < num_double_vals; i++){
+         printf(" %-30s %lf\n",double_vals_descriptor[i], double_vals[i]);
+      }
+      printf(" === Restored mesh double_vals ===\n");
+      printf("\n");
+
+      printf(" === Restored mesh cpu counters ===\n");
+      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
+         printf(" %-30s %d\n",mesh_counter_descriptor[i], cpu_counters[i]);
+      }
+      printf(" === Restored mesh cpu counters ===\n");
+      printf(" === Restored mesh gpu counters ===\n");
+      for (int i = 0; i < MESH_COUNTER_SIZE; i++){
+         printf(" %-30s %d\n",mesh_counter_descriptor[i], gpu_counters[i]);
+      }
+      printf(" === Restored mesh gpu counters ===\n");
+      printf("\n");
+
+      printf(" === Restored mesh cpu timers ===\n");
+      for (int i = 0; i < MESH_TIMER_SIZE; i++){
+         printf(" %-30s %lf\n",mesh_timer_descriptor[i], cpu_timers[i]);
+      }
+      printf(" === Restored mesh cpu timers ===\n");
+      printf("\n");
+
+      printf("\n");
+      printf(" === Restored mesh gpu timers ===\n");
+      for (int i = 0; i < MESH_TIMER_SIZE; i++){
+         printf(" %-30s %lld\n",mesh_timer_descriptor[i], gpu_timers[i]);
+      }
+      printf(" === Restored mesh gpu timers ===\n");
+      printf("\n");
+   }
+#endif
+   //calc_celltype(ncells);
+}
+
+
+// This code due to Matt Calef
+void scan ( scanInt *input , scanInt *output , scanInt length)
+{ /* Exclusive prefix sum: output[k] receives input[0] + ... + input[k-1],
+   * with output[0] = 0.
+   *
+   * With _OPENMP the function must be called by every thread of an enclosing
+   * parallel region (it uses a barrier and a single construct). Each thread
+   * scans its own slice [start,end) from zero, one thread then turns the first
+   * entry of each slice into that slice's offset, and finally each thread adds
+   * its offset to the rest of its slice. Only indices below `length` are
+   * written on this path.
+   *
+   * NOTE(review): the serial branch at the end also writes output[length]
+   * (length+1 entries in total) and writes output[0] even when length is 0,
+   * so `output` presumably needs room for length+1 entries -- confirm against
+   * the callers. */
+#ifdef _OPENMP
+ // This already assumes it is in a parallel region
+
+ // Get the total number of threads
+
+ scanInt numThreads = omp_get_num_threads ( );
+
+ // Compute the range for which this thread is responsible.
+
+ scanInt threadID = omp_get_thread_num ( );
+ scanInt start = length * ( threadID ) / numThreads;
+ scanInt end = length * ( threadID + 1 ) / numThreads;
+
+ // In the case that there are fewer entries than threads, some
+ // threads will have no entries. Only perform this operation if
+ // there is a positive number of entries.
+
+ if ( start < end ) {
+
+ // Do a scan over the region for this thread, with an initial
+ // value of zero.
+
+ output[start] = 0;
+ for ( scanInt i = start + 1 ; i < end ; i++ )
+ output[i] = output[i-1] + input[i-1];
+ }
+
+ // Wait until all threads get here.
+
+#pragma omp barrier
+
+ // At this point each thread has done an independent scan of its
+ // region. All scans, except the first, are off by an
+ // offset. Here we have a single thread compute that offset with a
+ // serial scan that strides over the regions assigned to each
+ // thread.
+
+#pragma omp single
+ for ( scanInt i = 1 ; i < numThreads ; i ++ ) {
+ scanInt s0 = length * ( i - 1 ) / numThreads;
+ scanInt s1 = length * ( i ) / numThreads;
+ // output[s0] already holds the previous slice's offset; add that slice's last input ...
+ if ( s0 < s1 )
+ output[s1] = output[s0] + input[s1-1];
+ // ... and, when the slice has more than one entry, its local running sum.
+ if ( s0 < s1 - 1 )
+ output[s1] += output[s1-1];
+ }
+
+ // Barrier is implicit from omp single Wait until all threads get here.
+
+ // Apply the offset to the range for this thread.
+
+ for ( scanInt i = start + 1 ; i < end ; i++ )
+ output[i] += output[start];
+
+#else
+ output[0] = 0;
+ for (int ic = 0; ic < length; ic++){
+ output[ic+1] = output[ic] + input[ic];
+ }
+#endif
+}
+/**
+ * Returns the loop bounds for the calling thread.
+ *
+ * With _OPENMP the bounds are read from lowerBound_Global / upperBound_Global
+ * at the calling thread's number, i.e. the values last written by
+ * set_bounds(). Without it the bounds are 0 and ncells.
+ */
+void Mesh::get_bounds(int& lowerBound, int& upperBound){
+#ifdef _OPENMP
+   const int tid = omp_get_thread_num();
+   lowerBound = lowerBound_Global[tid];
+   upperBound = upperBound_Global[tid];
+#else
+   lowerBound = 0;
+   upperBound = ncells;
+#endif
+}
+
+/**
+ * Computes and stores the loop bounds that get_bounds() hands out.
+ *
+ * With _OPENMP every thread of the current team must call this (it contains a
+ * barrier). The range [0,n) is cut into omp_get_num_threads() contiguous
+ * pieces whose sizes differ by at most one, the first n % nthreads pieces
+ * being the larger ones, and each thread stores its own piece. The two global
+ * arrays are allocated by the master thread on first use only.
+ *
+ * NOTE(review): the serial branch ignores `n` and stores [0,ncells) -- confirm
+ * that this is intended.
+ */
+void Mesh::set_bounds(int n){
+
+#ifdef _OPENMP
+   const int nthreads = omp_get_num_threads(); // private to each thread
+   const int tid      = omp_get_thread_num();  // private to each thread
+
+   #pragma omp master
+   {
+      if (lowerBound_Global == NULL) lowerBound_Global = (int *)malloc(nthreads*sizeof(int));
+      if (upperBound_Global == NULL) upperBound_Global = (int *)malloc(nthreads*sizeof(int));
+   }
+   // Nobody may write its slot before the master has allocated the arrays
+   #pragma omp barrier
+
+   const int chunk = n / nthreads;   // base piece size
+   const int extra = n % nthreads;   // this many leading threads take one more
+
+   int first = chunk*tid + min(extra, tid);
+   int count = chunk;
+   if (tid < extra) count++;
+
+   lowerBound_Global[tid] = first;
+   upperBound_Global[tid] = first + count;
+#else
+   if (lowerBound_Global == NULL) lowerBound_Global = (int *)malloc(1*sizeof(int));
+   if (upperBound_Global == NULL) upperBound_Global = (int *)malloc(1*sizeof(int));
+
+   int first = 0;
+   int last  = ncells;
+   lowerBound_Global[0] = first;
+   upperBound_Global[0] = last;
+#endif
+
+}
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/mesh.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef MESH_H_
+#define MESH_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "MallocPlus.h"
+#include <string>
+#include <stdio.h>
+#include <vector>
+#include <math.h>
+#include "KDTree.h"
+#include "crux.h"
+#include "partition.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+// Precision selection: the first of MINIMUM_PRECISION, MIXED_PRECISION, FULL_PRECISION that is defined picks the typedefs below; FULL_PRECISION is the default when none is defined.
+#if !defined(FULL_PRECISION) && !defined(MIXED_PRECISION) && !defined(MINIMUM_PRECISION)
+#define FULL_PRECISION
+#endif
+#ifdef NO_CL_DOUBLE // overrides any other choice and forces MINIMUM_PRECISION
+#undef FULL_PRECISION
+#undef MIXED_PRECISION
+#define MINIMUM_PRECISION
+#endif
+
+#if defined(MINIMUM_PRECISION) // float for both real_t and spatial_t
+ typedef float real_t; // this is used for intermediate calculations
+ typedef float spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+ typedef cl_float cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_float cl_spatial_t; // OpenCL counterpart of spatial_t
+#endif
+#ifdef HAVE_MPI
+ #define MPI_REAL_T MPI_FLOAT // for MPI communication for physics state variables
+ #define MPI_SPATIAL_T MPI_FLOAT // MPI datatype matching spatial_t
+#endif
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+ typedef double real_t; // this is used for intermediate calculations
+ typedef float spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+ typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_float cl_spatial_t; // OpenCL counterpart of spatial_t
+#endif
+#ifdef HAVE_MPI
+ #define MPI_REAL_T MPI_DOUBLE // MPI datatype matching real_t
+ #define MPI_SPATIAL_T MPI_FLOAT // MPI datatype matching spatial_t
+#endif
+
+#elif defined(FULL_PRECISION) // double for both real_t and spatial_t
+ typedef double real_t; // this is used for intermediate calculations
+ typedef double spatial_t; // for spatial variables
+#ifdef HAVE_OPENCL
+ typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_double cl_spatial_t; // OpenCL counterpart of spatial_t
+#endif
+#ifdef HAVE_MPI
+ #define MPI_REAL_T MPI_DOUBLE // MPI datatype matching real_t
+ #define MPI_SPATIAL_T MPI_DOUBLE // MPI datatype matching spatial_t
+#endif
+#endif // precision selection
+
+#define TILE_SIZE 128
+// Function-like macros: arguments are substituted textually, so do not pass expressions with side effects (e.g. i++).
+#define SWAP_PTR(xnew,xold,xtmp) (xtmp=xnew, xnew=xold, xold=xtmp) // exchanges xnew and xold using xtmp as scratch; arguments are not parenthesized
+#define MIN(a,b) ((a) < (b) ? (a) : (b)) // the selected argument is evaluated twice
+#define MAX(a,b) ((a) > (b) ? (a) : (b)) // the selected argument is evaluated twice
+
+typedef unsigned int uint;
+
+//float mem_opt_factor = 1.0;
+
+enum boundary
+{ REAL_CELL = 1, // Denotes cell type of real cell.
+ LEFT_BOUNDARY = -1, // Denotes left boundary ghost cell.
+ RIGHT_BOUNDARY = -2, // Denotes right boundary ghost cell.
+ BOTTOM_BOUNDARY= -3, // Denotes bottom boundary ghost cell.
+ TOP_BOUNDARY = -4, // Denotes top boundary ghost cell.
+ FRONT_BOUNDARY = -5, // Denotes front boundary ghost cell.
+ BACK_BOUNDARY = -6 }; // Denotes back boundary ghost cell.
+
+enum dimensionality
+{ ONE_DIMENSIONAL = 1, // Dimensionality based at 1 for clarity.
+ TWO_DIMENSIONAL,
+ THREE_DIMENSIONAL};
+
+enum orientation
+{ SW, // SW quadrant.
+ NW, // NW quadrant.
+ NE, // NE quadrant.
+ SE }; // SE quadrant.
+
+enum neighbor_calc
+{ HASH_TABLE, // Hash Table.
+ KDTREE }; // kD-tree.
+
+enum mesh_timers // Timer categories; used as the index into Mesh::cpu_timers[] and Mesh::gpu_timers[]
+{
+ MESH_TIMER_COUNT_BCS,
+ MESH_TIMER_CALC_NEIGHBORS,
+ MESH_TIMER_HASH_SETUP,
+ MESH_TIMER_HASH_QUERY,
+ MESH_TIMER_FIND_BOUNDARY,
+ MESH_TIMER_PUSH_SETUP,
+ MESH_TIMER_PUSH_BOUNDARY,
+ MESH_TIMER_LOCAL_LIST,
+ MESH_TIMER_LAYER1,
+ MESH_TIMER_LAYER2,
+ MESH_TIMER_LAYER_LIST,
+ MESH_TIMER_COPY_MESH_DATA,
+ MESH_TIMER_FILL_MESH_GHOST,
+ MESH_TIMER_FILL_NEIGH_GHOST,
+ MESH_TIMER_SET_CORNER_NEIGH,
+ MESH_TIMER_NEIGH_ADJUST,
+ MESH_TIMER_SETUP_COMM,
+ MESH_TIMER_KDTREE_SETUP,
+ MESH_TIMER_KDTREE_QUERY,
+ MESH_TIMER_REFINE_SMOOTH,
+ MESH_TIMER_REZONE_ALL,
+ MESH_TIMER_PARTITION,
+ MESH_TIMER_CALC_SPATIAL_COORDINATES,
+ MESH_TIMER_LOAD_BALANCE,
+ MESH_TIMER_SIZE // number of timer categories; sizes the timer arrays, so it must stay last
+};
+
+enum mesh_counters // Counter categories; used as the index into Mesh::cpu_counters[] and Mesh::gpu_counters[]
+{
+ MESH_COUNTER_REZONE,
+ MESH_COUNTER_REFINE_SMOOTH,
+ MESH_COUNTER_CALC_NEIGH,
+ MESH_COUNTER_LOAD_BALANCE,
+ MESH_COUNTER_SIZE // number of counter categories; sizes the arrays, so it must stay last
+};
+// Text label for each mesh_counters value, listed in enumerator order; add a string here whenever a counter is added.
+//#ifdef DEBUG_RESTORE_VALS
+static const char *mesh_counter_descriptor[MESH_COUNTER_SIZE] = {
+ "mesh_counter_rezone",
+ "mesh_counter_refine_smooth",
+ "mesh_counter_calc_neigh",
+ "mesh_counter_load_balance"
+};
+//#endif
+
+typedef enum mesh_timers mesh_timer_category;
+typedef enum mesh_counters mesh_counter_category;
+
+enum mesh_device_types
+{
+ MESH_DEVICE_CPU,
+ MESH_DEVICE_GPU
+};
+
+typedef mesh_device_types mesh_device_type;
+
+using namespace std;
+
+/****************************************************************//**
+ * Mesh class
+ * Contains the cell-based adaptive mesh refinement
+ * (AMR) object with its data and methods.
+ *******************************************************************/
+class Mesh
+{
+
+public:
+ int ndim; //!< Dimensionality of mesh (2 or 3).
+
+ MallocPlus mesh_memory;
+ MallocPlus gpu_mesh_memory;
+
+#ifdef HAVE_OPENCL
+ string defines;
+#endif
+
+ double cpu_timers[MESH_TIMER_SIZE];
+ long long gpu_timers[MESH_TIMER_SIZE];
+
+ int cpu_counters[MESH_COUNTER_SIZE];
+ int gpu_counters[MESH_COUNTER_SIZE];
+
+ bool do_rezone,
+ gpu_do_rezone;
+
+ int mype,
+ numpe,
+ parallel,
+ cell_handle,
+ noffset;
+
+ int *lowerBound_Global,
+ *upperBound_Global;
+
+ float mem_factor;
+
+ double offtile_ratio_local;
+ int offtile_local_count;
+
+ vector<int> corners_i,
+ corners_j;
+
+ vector<int> nsizes,
+ ndispl;
+
+ FILE *fp;
+
+ TKDTree tree; //!< k-D tree for neighbor search.
+ vector<int> proc;
+ vector<int> lev_ibegin, //!< Lowest x-index in use at specified level of refinement.
+ lev_iend, //!< Highest x-index in use at specified level of refinement.
+ lev_jbegin, //!< Lowest y-index in use at specified level of refinement.
+ lev_jend, //!< Highest y-index in use at specified level of refinement.
+ lev_kbegin, //!< Lowest z-index in use at specified level of refinement.
+ lev_kend, //!< Highest z-index in use at specified level of refinement.
+ levtable; //!< Powers of two to simplify i,j calculations
+ vector<real_t> lev_deltax, //!< Grid spacing along x-axis at specified level of refinement.
+ lev_deltay, //!< Grid spacing along y-axis at specified level of refinement.
+ lev_deltaz; //!< Grid spacing along z-axis at specified level of refinement.
+ int levmx, //!< Maximum level of refinement allowed.
+ have_boundary,//!< Nonzero if the mesh includes boundary cells; otherwise they are created on the fly.
+ ibase, //!< Index basis for arrays (0 for C, 1 for Fortran).
+ imin, //!< Lowest x-index in use.
+ imax, //!< Highest x-index in use.
+ jmin, //!< Lowest y-index in use.
+ jmax, //!< Highest y-index in use.
+ kmin, //!< Lowest z-index in use.
+ kmax; //!< Highest z-index in use.
+ size_t ncells, //!< Number of cells in mesh.
+ ncells_global, //!< Global number of cells for parallel runs
+ ncells_ghost; //!< Number of cells in mesh with ghost cells.
+ real_t xmin, //!< Lowest x-coordinate in use.
+ xmax, //!< Highest x-coordinate in use.
+ ymin, //!< Lowest y-coordinate in use.
+ ymax, //!< Highest y-coordinate in use.
+ zmin, //!< Lowest z-coordinate in use.
+ zmax, //!< Highest z-coordinate in use.
+ xcentermin, //!< Center of minimum x cell
+ xcentermax, //!< Center of maximum x cell
+ ycentermin, //!< Center of minimum y cell
+ ycentermax, //!< Center of maximum y cell
+ zcentermin, //!< Center of minimum z cell
+ zcentermax, //!< Center of maximum z cell
+ deltax, //!< Grid spacing along x-axis.
+ deltay, //!< Grid spacing along y-axis.
+ deltaz; //!< Grid spacing along z-axis.
+
+ vector<int> index; //!< 1D ordered index of mesh elements.
+
+ // mesh state data
+ int *i, //!< 1D array of mesh element x-indices.
+ *j, //!< 1D array of mesh element y-indices.
+ *k, //!< 1D array of mesh element z-indices.
+ *level, //!< 1D array of mesh element refinement levels.
+ //!< derived data from mesh state data
+ *celltype, //!< 1D ordered index of mesh element cell types (ghost or real).
+ *nlft, //!< 1D ordered index of mesh element left neighbors.
+ *nrht, //!< 1D ordered index of mesh element right neighbors.
+ *nbot, //!< 1D ordered index of mesh element bottom neighbors.
+ *ntop, //!< 1D ordered index of mesh element top neighbors.
+ *nfrt, //!< 1D ordered index of mesh element front neighbors.
+ *nbak; //!< 1D ordered index of mesh element back neighbors.
+
+ vector<spatial_t> x, //!< 1D ordered index of mesh element x-coordinates.
+ dx, //!< 1D ordered index of mesh element x-coordinate spacings.
+ y, //!< 1D ordered index of mesh element y-coordinates.
+ dy, //!< 1D ordered index of mesh element y-coordinate spacings.
+ z, //!< 1D ordered index of mesh element z-coordinates.
+ dz; //!< 1D ordered index of mesh element z-coordinate spacings.
+
+#ifdef HAVE_OPENCL
+ cl_mem dev_ioffset;
+
+ cl_mem dev_celltype,
+ dev_i,
+ dev_j,
+ dev_level,
+ dev_nlft,
+ dev_nrht,
+ dev_nbot,
+ dev_ntop;
+
+ cl_mem dev_levdx, // corresponds to lev_deltax
+ dev_levdy, // corresponds to lev_deltay
+ dev_levibeg,
+ dev_leviend,
+ dev_levjbeg,
+ dev_levjend,
+ dev_levtable; //
+
+ cl_mem dev_corners_i,
+ dev_corners_j;
+#endif
+
+ int nxface;
+ int nyface;
+
+ vector<int> xface_i;
+ vector<int> xface_j;
+ vector<int> xface_level;
+ vector<int> map_xface2cell_lower;
+ vector<int> map_xface2cell_upper;
+
+ vector<int> map_xcell2face_left1;
+ vector<int> map_xcell2face_left2;
+ vector<int> map_xcell2face_right1;
+ vector<int> map_xcell2face_right2;
+
+ vector<int> ixmin_level;
+ vector<int> ixmax_level;
+ vector<int> jxmin_level;
+ vector<int> jxmax_level;
+ vector<int> ixadjust;
+ vector<int> jxadjust;
+
+ vector<int> yface_i;
+ vector<int> yface_j;
+ vector<int> yface_level;
+ vector<int> map_yface2cell_lower;
+ vector<int> map_yface2cell_upper;
+
+ vector<int> map_ycell2face_bot1;
+ vector<int> map_ycell2face_bot2;
+ vector<int> map_ycell2face_top1;
+ vector<int> map_ycell2face_top2;
+
+ vector<int> iymin_level;
+ vector<int> iymax_level;
+ vector<int> jymin_level;
+ vector<int> jymax_level;
+ vector<int> iyadjust;
+ vector<int> jyadjust;
+
+ // Public constructors.
+ Mesh(FILE *fin, int *numpe);
+ Mesh(int nx, int ny, int levmx_in, int ndim_in, double deltax_in, double deltay_in, int boundary, int parallel_in, int do_gpu_calc);
+
+ // Member functions.
+ void init(int nx, int ny, real_t circ_radius, partition_method initial_order, int do_gpu_calc);
+ void terminate(void);
+
+ void set_bounds(int n);
+ void get_bounds(int& lowerBound, int& upperBound);
+
+/****************************************************************//**
+ * @name Memory routines
+ *******************************************************************/
+///@{
+
+/****************************************************************//**
+ * \brief
+ * Allocates the basic mesh memory, i, j, and level, using the MallocPlus
+ * memory database.
+ *
+ * **Parameters**
+ * * size_t ncells -- number of cells in the mesh
+ *
+ * Typical Usage
+ *
+ * mesh.allocate(ncells);
+ *******************************************************************/
+ void allocate(size_t ncells);
+
+ void resize(size_t new_ncells);
+ void memory_reset_ptrs(void);
+ void resize_old_device_memory(size_t ncells);
+///@}
+
+/* inline "macros" */
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Boundary cell tests
+ *******************************************************************/
+ // Generic forms: iv is a per-cell index array and lev_begin/lev_end hold the
+ // per-level limit it is compared with. The result is 1 when cell ic lies
+ // outside that limit at its own refinement level, 0 otherwise.
+ int is_lower_boundary(int *iv, int *lev_begin, int ic)
+ {
+    const int lowest_in_use = lev_begin[level[ic]];
+    return iv[ic] < lowest_in_use;
+ }
+ int is_upper_boundary(int *iv, int *lev_end, int ic)
+ {
+    const int highest_in_use = lev_end[level[ic]];
+    return iv[ic] > highest_in_use;
+ }
+
+ // Direction-specific forms: the same comparison, applied to the mesh's own
+ // i/j/k index arrays and the matching lev_*begin / lev_*end tables.
+ int is_left_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return i[ic] < lev_ibegin[cell_level];
+ }
+ int is_right_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return i[ic] > lev_iend[cell_level];
+ }
+ int is_bottom_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return j[ic] < lev_jbegin[cell_level];
+ }
+ int is_top_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return j[ic] > lev_jend[cell_level];
+ }
+ int is_front_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return k[ic] < lev_kbegin[cell_level];
+ }
+ int is_back_boundary(int ic)
+ {
+    const int cell_level = level[ic];
+    return k[ic] > lev_kend[cell_level];
+ }
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Tests for positioning in set of 4 cells
+ *******************************************************************/
+ // Parity of a single index: even is "lower", odd is "upper". Odd is tested
+ // with "!= 0" because the remainder of a negative odd value is -1, which a
+ // comparison with 1 would reject.
+ int is_lower(int i) { return(i % 2 == 0); }
+ int is_upper(int i) { return(i % 2 != 0); }
+
+ // Position of (i,j) within its 2x2 group: i selects left (even) or right
+ // (odd), j selects lower (even) or upper (odd).
+ int is_lower_left(int i, int j) { return(is_lower(i) && is_lower(j)); }
+ int is_lower_right(int i, int j) { return(is_upper(i) && is_lower(j)); }
+ int is_upper_left(int i, int j) { return(is_lower(i) && is_upper(j)); }
+ int is_upper_right(int i, int j) { return(is_upper(i) && is_upper(j)); }
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Level tests
+ *******************************************************************/
+ // Each test compares the refinement level stored for cell nn with the one
+ // stored for cell nz and yields 1 or 0. "Coarser" means a smaller level
+ // number, "finer" a larger one.
+ int is_same_level_or_coarser(int nn, int nz)
+ {
+    const int lev_nn = level[nn];
+    const int lev_nz = level[nz];
+    return lev_nn <= lev_nz;
+ }
+ int is_coarser(int nn, int nz)
+ {
+    const int lev_nn = level[nn];
+    const int lev_nz = level[nz];
+    return lev_nn < lev_nz;
+ }
+ int is_finer(int nn, int nz)
+ {
+    const int lev_nn = level[nn];
+    const int lev_nz = level[nz];
+    return lev_nn > lev_nz;
+ }
+ int is_same_level(int nn, int nz)
+ {
+    const int lev_nn = level[nn];
+    const int lev_nz = level[nz];
+    return lev_nn == lev_nz;
+ }
+///@}
+
+/* accessor routines */
+ double get_cpu_timer(mesh_timer_category category) {return(cpu_timers[category]); }; // stored value, returned unscaled
+ /* gpu_timers hold nanoseconds (per the original note); the 1.0e-9 factor converts them to seconds, not msecs */
+ double get_gpu_timer(mesh_timer_category category) {return((double)(gpu_timers[category])*1.0e-9); };
+
+ void parallel_output(const char *string, double local_value, int output_level, const char *units);
+ void parallel_output(const char *string, long long local_value, int output_level, const char *units);
+ void parallel_output(const char *string, int local_value, int output_level, const char *units);
+ void timer_output(mesh_timer_category category, mesh_device_types device_type, int timer_level);
+
+ int get_cpu_counter(mesh_counter_category category) {return(cpu_counters[category]); }; // value of cpu_counters[] for this category
+ int get_gpu_counter(mesh_counter_category category) {return(gpu_counters[category]); }; // value of gpu_counters[] for this category
+
+ int get_calc_neighbor_type(void);
+
+ void print_partition_measure(void);
+ void print_calc_neighbor_type(void);
+ void print_partition_type(void);
+/* end accessor routines */
+
+/* Debugging, internal, or not used yet */
+#ifdef HAVE_OPENCL
+ int gpu_count_BCs();
+#endif
+ void kdtree_setup(void);
+ void partition_measure(void);
+ void partition_cells(int numpe,
+ vector<int> &order,
+ enum partition_method method);
+ void calc_distribution(int numpe);
+ void calc_symmetry(vector<int> &dsym,
+ vector<int> &xsym,
+ vector<int> &ysym);
+
+/* End of debugging, internal, or not used yet */
+
+ //void calc_face_list_test(double *H);
+ void calc_face_list(void);
+ void calc_face_list_wmap(void);
+ void calc_face_list_wbidirmap(void);
+ void calc_face_list_clearmaps(void);
+
+ int **get_xface_flag(int lev, bool print_output=0);
+ int **get_yface_flag(int lev, bool print_output=0);
+ void get_flat_grid(int lev, int ***zone_flag, int ***zone_cell);
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate neighbors
+ *
+ * **Parameters**
+ *
+ * Input -- from within the object
+ * i, j, level
+ * Output -- in the object
+ * nlft, nrht, nbot, ntop arrays
+ *******************************************************************/
+ void calc_neighbors(int ncells);
+ void calc_neighbors_local(void);
+#ifdef HAVE_OPENCL
+ void gpu_calc_neighbors(void);
+ void gpu_calc_neighbors_local(void);
+#endif
+ // TODO: Not created yet; overloading for 3D mesh support. (davis68)
+ void calc_neighbors(vector<int> &nlft,
+ vector<int> &nrht,
+ vector<int> &nbot,
+ vector<int> &ntop,
+ vector<int> &nfrt,
+ vector<int> &nbak,
+ vector<int> index);
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate rezone count
+ *
+ * **Parameters**
+ *
+ * Input
+ * mpot -- potential mesh refinement
+ * ioffset -- write offset for each cell
+ * Output
+ * result -- cell count
+ *******************************************************************/
+ int rezone_count(vector<int> mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+ void gpu_rezone_count2(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result);
+ void gpu_rezone_count(size_t block_size, size_t local_work_size, cl_mem dev_redscratch, cl_mem &dev_result);
+ void gpu_rezone_scan(size_t block_size, size_t local_work_size, cl_mem dev_ioffset, cl_mem &dev_result);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Refine Smooth -- smooths jump in refinement level so that only a 1 to 2 jump occurs
+ *
+ * **Parameters**
+ *
+ * Input/Output
+ * mpot -- potential mesh refinement array, 1 is refine and -1 coarsen
+ * ioffset -- write offset for each cell to account for new cells
+ * result -- refinement count
+ *******************************************************************/
+ size_t refine_smooth(vector<int> &mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+ int gpu_refine_smooth(cl_mem &dev_mpot, int &icount, int &jcount);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Rezone mesh
+ *
+ * **Parameters**
+ *
+ * Input
+ * add_ncells -- for each processor. A global sum will be done and the main part of
+ * the rezone will be skipped if no cells are added.
+ * mpot -- mesh rezone potential
+ * have_state flag -- 0 (false) for setup when physics state has not been allocated
+ * ioffset -- partial prefix scan results for starting address to write new cells
+ * state_memory -- linked list of arrays for state
+ * Output
+ * new mesh and state arrays with refinement/coarsening performed
+ *******************************************************************/
+ void rezone_all(int icount, int jcount, vector<int> mpot, int have_state, MallocPlus &state_memory);
+#ifdef HAVE_OPENCL
+ void gpu_rezone_all(int icount, int jcount, cl_mem &dev_mpot, MallocPlus &gpu_state_memory);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Load balance -- only needed for parallel (MPI) runs
+ *
+ * **Parameters**
+ *
+ * Input
+ * numcells -- ncells from rezone all routine. This is a copy in so that a local
+ * value can be used for load_balance and gpu_load_balance without it getting
+ * reset for clamr_checkall routine
+ * weight -- weighting array per cell for balancing. Currently not used. Null value
+ * indicates even weighting of cells for load balance.
+ * state_memory or gpu_state_memory -- linked-list of arrays from physics routine
+ * to be load balanced.
+ * Output -- arrays will be returned load balanced with new sizes. Pointers to arrays
+ * will need to be reset
+ *******************************************************************/
+#ifdef HAVE_MPI
+ void do_load_balance_local(size_t numcells, float *weight, MallocPlus &state_memory);
+#ifdef HAVE_OPENCL
+ int gpu_do_load_balance_local(size_t numcells, float *weight, MallocPlus &gpu_state_memory);
+#endif
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Calculate spatial coordinates
+ *
+ * **Parameters**
+ *
+ * Input -- from within the object
+ * i, j, level
+ * Output
+ * x, y -- coordinates for each cell
+ * dx, dy -- size of each cell
+ *******************************************************************/
+ void calc_spatial_coordinates(int ibase);
+#ifdef HAVE_OPENCL
+ void gpu_calc_spatial_coordinates(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy);
+#endif
+///@}
+
+///@{
+/****************************************************************//**
+ * \brief
+ * Testing routines
+ *******************************************************************/
+#ifdef HAVE_OPENCL
+ void compare_dev_local_to_local(void); // Not currently called
+ void compare_neighbors_gpu_global_to_cpu_global(void);
+#endif
+ void compare_neighbors_cpu_local_to_cpu_global(uint ncells_ghost, uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl);
+#ifdef HAVE_OPENCL
+ void compare_neighbors_all_to_gpu_local(Mesh *mesh_global, int *nsizes, int *ndispl);
+ void compare_mpot_gpu_global_to_cpu_global(int *mpot, cl_mem dev_mpot);
+#endif
+ void compare_mpot_cpu_local_to_cpu_global(uint ncells_global, int *nsizes, int *displ, int *mpot, int *mpot_global, int cycle);
+#ifdef HAVE_OPENCL
+ void compare_mpot_all_to_gpu_local(int *mpot, int *mpot_global, cl_mem dev_mpot, cl_mem dev_mpot_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle);
+ void compare_ioffset_gpu_global_to_cpu_global(uint old_ncells, int *mpot);
+ void compare_ioffset_all_to_gpu_local(uint old_ncells, uint old_ncells_global, int block_size, int block_size_global, int *mpot, int *mpot_global, cl_mem dev_ioffset, cl_mem dev_ioffset_global, int *ioffset, int *ioffset_global, int *celltype_global, int *i_global, int *j_global);
+ void compare_coordinates_gpu_global_to_cpu_global_double(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, double *H);
+ void compare_coordinates_gpu_global_to_cpu_global_float(cl_mem dev_x, cl_mem dev_dx, cl_mem dev_y, cl_mem dev_dy, cl_mem dev_H, float *H);
+#endif
+ void compare_coordinates_cpu_local_to_cpu_global_double(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, double *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, double *H_global, int cycle);
+ void compare_coordinates_cpu_local_to_cpu_global_float(uint ncells_global, int *nsizes, int *ndispl, spatial_t *x, spatial_t *dx, spatial_t *y, spatial_t *dy, float *H, spatial_t *x_global, spatial_t *dx_global, spatial_t *y_global, spatial_t *dy_global, float *H_global, int cycle);
+#ifdef HAVE_OPENCL
+ void compare_indices_gpu_global_to_cpu_global(void);
+#endif
+ void compare_indices_cpu_local_to_cpu_global(uint ncells_global, Mesh *mesh_global, int *nsizes, int *ndispl, int cycle);
+#ifdef HAVE_OPENCL
+ void compare_indices_all_to_gpu_local(Mesh *mesh_global, uint ncells_global, int *nsizes, int *ndispl, int ncycle);
+#endif
+///@}
+
+ size_t get_checkpoint_size(void);
+ void store_checkpoint(Crux *crux);
+ void restore_checkpoint(Crux *crux);
+
+ void calc_celltype_threaded(size_t ncells);
+ void calc_celltype(size_t ncells);
+
+private:
+ // Private constructors.
+ Mesh(const Mesh&); // Blocks copy constructor so copies are not made inadvertently.
+
+ // Member functions.
+ void print_object_info();
+
+ void set_refinement_order(int order[4], int ic, int ifirst, int ilast, int jfirst, int jlast,
+ int level_first, int level_last, int *i, int *j, int *level);
+
+ void write_grid(int ncycle);
+ void calc_centerminmax(void);
+ void calc_minmax(void);
+
+ void print(void);
+ void print_local(void);
+#ifdef HAVE_OPENCL
+ void print_dev_local();
+#endif
+
+};
+
+#endif /* MESH_H_ */
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/partition.cpp?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.cpp Mon Sep 4 07:25:36 2017
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifdef HAVE_MPI
+#include "mpi.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <list>
+#include <algorithm>
+#include "partition.h"
+#include "KDTree.h"
+#include "mesh.h"
+#ifdef HAVE_MPI
+#include "s7.h"
+#endif
+#include "zorder.h"
+#include "timer.h"
+#include "hsfc.h"
+
+// DEBUG defaults to 0; when it is non-zero Mesh::partition_cells prints a
+// per-cell table of the reordered mesh.
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+typedef unsigned int uint;
+
+// Selected partition-quality measure; compared against the partition_measure
+// enumerators. It is not assigned in this file.
+int measure_type;
+// Number of measurements taken so far (incremented by Mesh::partition_measure).
+int meas_count = 0;
+// Running sum of the per-measurement averages; divided by meas_count when reported.
+double meas_sum_average = 0.0;
+
+// Settings defined in another translation unit; this file only reads them
+// (see Mesh::print_partition_type).
+extern bool localStencil;
+extern int initial_order;
+extern int cycle_reorder;
+
+// Measures how many cell references of each TILE_SIZE-sized tile of consecutive
+// cells fall outside that tile and adds the per-tile average to the global
+// running totals (meas_count, meas_sum_average).
+//
+// For every cell the four face neighbours are examined; when a neighbour is at a
+// finer level than the cell, the second cell on that face (ntop of a left/right
+// neighbour, nrht of a bottom/top neighbour) is examined as well. The per-tile
+// contribution depends on measure_type:
+//   WITH_DUPLICATES     every off-tile reference counts,         divided by TILE_SIZE
+//   WITHOUT_DUPLICATES  distinct off-tile cells,                 divided by TILE_SIZE
+//   CVALUE              distinct off-tile cells,                 divided by 4*sqrt(TILE_SIZE)
+//   CSTARVALUE          distinct cells * distinct cache lines * line size,
+//                       divided by 4*sqrt(TILE_SIZE)*(8*sqrt(TILE_SIZE)+line size-1)
+// Nothing is done when measure_type is NO_PARTITION_MEASURE.
+//
+// NOTE(review): the orphaned OpenMP directives imply that, in OpenMP builds, all
+// threads of the enclosing parallel region call this routine together -- confirm
+// against the callers.
+void Mesh::partition_measure(void)
+{
+   if (measure_type == NO_PARTITION_MEASURE) return;
+
+   const int ntX             = TILE_SIZE;
+   const int cache_line_size = 4;  // Some could be 8, or more? Assumes memory is aligned
+
+   // Static so that it is shared between threads and can be the target of the
+   // reduction in the orphaned worksharing loop below.
+   static double offtile_ratio = 0.0;
+
+   uint num_groups = (ncells + TILE_SIZE - 1)/TILE_SIZE;
+
+   const bool collect_cells = (measure_type == WITHOUT_DUPLICATES ||
+                               measure_type == CVALUE ||
+                               measure_type == CSTARVALUE);
+   const bool collect_lines = (measure_type == CSTARVALUE);
+
+   // Start every measurement from zero. Because the accumulator is static it
+   // would otherwise still hold the sum of all earlier measurements.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   offtile_ratio = 0.0;
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for reduction(+:offtile_ratio)
+#endif
+   for (uint group_id = 0; group_id < num_groups; group_id++){
+      const int start_idx = group_id * ntX;
+      const int end_idx   = (group_id + 1) * ntX;
+
+      int offtile = 0;                 // off-tile references, duplicates included
+      list<int> offtile_list;          // off-tile cells
+      list<int> offtile_cache_lines;   // cache lines holding the off-tile cells
+
+      // The cell index is derived from the tile number rather than carried over
+      // from the previous tile, so the result does not depend on which thread
+      // processes which tile.
+      for (int i = start_idx; i < end_idx; i++){
+         if (i >= ncells) break;   // the last tile may be only partly filled
+
+         //taken from wave_kern_calc.cl 'setup tile' kernel
+         const int lev = level[i];
+         int touched[8];
+         int ntouched = 0;
+
+         touched[ntouched++] = nlft[i];
+         if (level[nlft[i]] > lev) touched[ntouched++] = ntop[nlft[i]];
+         touched[ntouched++] = nrht[i];
+         if (level[nrht[i]] > lev) touched[ntouched++] = ntop[nrht[i]];
+         touched[ntouched++] = nbot[i];
+         if (level[nbot[i]] > lev) touched[ntouched++] = nrht[nbot[i]];
+         touched[ntouched++] = ntop[i];
+         if (level[ntop[i]] > lev) touched[ntouched++] = nrht[ntop[i]];
+
+         for (int k = 0; k < ntouched; k++){
+            const int nbr = touched[k];
+            if (nbr >= start_idx && nbr < end_idx) continue;   // stays on the tile
+
+            offtile++;
+            if (collect_cells) offtile_list.push_back(nbr);
+            if (collect_lines) offtile_cache_lines.push_back(nbr/cache_line_size);
+         }
+      }
+
+      if (measure_type == WITH_DUPLICATES) {
+         offtile_ratio += (double)offtile/(double)(TILE_SIZE);
+      } else if (collect_cells) {
+         offtile_list.sort();
+         offtile_list.unique();
+
+         if (measure_type == WITHOUT_DUPLICATES) {
+            offtile_ratio += (double)offtile_list.size()/(double)(TILE_SIZE);
+         } else if (measure_type == CVALUE) {
+            offtile_ratio += (double)offtile_list.size()/(4*sqrt((double)(TILE_SIZE)));
+         } else {
+            // CSTARVALUE
+            offtile_cache_lines.sort();
+            offtile_cache_lines.unique();
+
+            double s_ngeom = (double)(offtile_list.size());
+            double q_ngeom = (double)(offtile_cache_lines.size());
+            double cover   = (double)(cache_line_size);
+            double ngeom   = sqrt((double)(TILE_SIZE));
+            offtile_ratio += (s_ngeom*q_ngeom*cover) / ( 4 * ngeom * (8*ngeom+cover-1) );
+         }
+      }
+   }
+
+   // The worksharing loop ends with an implicit barrier, so the reduced value is
+   // complete before the master thread records it.
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+      meas_count ++;
+      meas_sum_average += offtile_ratio/(double)num_groups;
+#ifdef _OPENMP
+   }
+#endif
+}
+
+// Reports the partition-quality average accumulated by partition_measure() and,
+// when more than one process is in use, the MPI surface-area-to-volume ratio.
+void Mesh::print_partition_measure()
+{
+   if (meas_count != 0) {
+      const double average = meas_sum_average/(double)meas_count;
+
+      switch (measure_type) {
+         case NO_PARTITION_MEASURE:
+            // NOTE(review): partition_measure() does not increment meas_count for
+            // this measure type, so this message is only reachable if
+            // measure_type is changed after measurements were taken.
+            if (mype == 0) printf("No Partition Measure\n");
+            break;
+         case WITH_DUPLICATES:
+            parallel_output("Average surface area to volume ratio ", average, 0, "with duplicates");
+            break;
+         case WITHOUT_DUPLICATES:
+            parallel_output("Average surface area to volume ratio ", average, 0, "without duplicates");
+            break;
+         case CVALUE:
+            parallel_output("Partition Quality Avg C value ", average, 0, "");
+            break;
+         case CSTARVALUE:
+            parallel_output("Partition Quality Avg C* value ", average, 0, "");
+            break;
+         default:
+            break;
+      }
+   }
+
+   if (numpe > 1){
+      parallel_output("The MPI surface area to volume ratio ", offtile_ratio_local, 0, "without duplicates");
+   }
+}
+
+// Prints, on rank 0 only, one line describing the initial cell ordering, the
+// per-cycle reordering and whether the local stencil is enabled. Unrecognised
+// ordering values print nothing for their part of the line.
+void Mesh::print_partition_type()
+{
+   if (mype != 0) return;
+
+   switch (initial_order) {
+      case ORIGINAL_ORDER:
+         printf("Initial order is naive.");
+         break;
+      case HILBERT_SORT:
+         printf("Initial order is Hilbert sort.");
+         break;
+      case HILBERT_PARTITION:
+         // Spelling corrected (was "partitionr").
+         printf("Initial order is Hilbert partition.");
+         break;
+      case ZORDER:
+         printf("Initial order is Z order.");
+         break;
+      default:
+         break;
+   }
+
+   switch (cycle_reorder) {
+      case ORIGINAL_ORDER:
+         printf(" No cycle reorder.");
+         break;
+      case HILBERT_SORT:
+         printf(" Cycle reorder is Hilbert sort.");
+         break;
+      case HILBERT_PARTITION:
+         printf(" Cycle reorder is Hilbert partition.");
+         break;
+      case ZORDER:
+         printf(" Cycle reorder is Z order.");
+         break;
+      default:
+         break;
+   }
+
+   // The line is always terminated, with or without the stencil note.
+   if (localStencil) {
+      printf(" Local Stencil is on.\n");
+   } else {
+      printf("\n");
+   }
+}
+#ifdef HAVE_MPI
+// Gathers the distributed int array `field` from all ranks, rearranges it so that
+// global entry ic takes the value gathered at order[ic], and scatters each rank's
+// slice back into `field`. When `inverse` is not NULL the gathered values are
+// additionally mapped through it (used to renumber neighbour indices).
+// `gathered` and `reordered` are scratch vectors of at least ncells_total entries.
+static void partition_gather_reorder_scatter(int *field, int ncells_local, int ncells_total,
+                                             int *sizes, int *displs,
+                                             const vector<int> &order,
+                                             const vector<int> *inverse,
+                                             vector<int> &gathered,
+                                             vector<int> &reordered)
+{
+   MPI_Allgatherv(field, ncells_local, MPI_INT, &gathered[0], sizes, displs, MPI_INT, MPI_COMM_WORLD);
+   for (int ic = 0; ic < ncells_total; ic++){
+      const int value = gathered[order[ic]];
+      reordered[ic] = (inverse != NULL) ? (*inverse)[value] : value;
+   }
+   MPI_Scatterv(&reordered[0], sizes, displs, MPI_INT, field, ncells_local, MPI_INT, 0, MPI_COMM_WORLD);
+}
+#endif
+
+// Rearranges the first `count` entries of `field` so that entry ic takes the value
+// previously stored at order[ic]. `scratch` must hold at least `count` entries.
+template <typename Field>
+static void partition_permute_field(Field &field, const vector<int> &order, int count,
+                                    vector<spatial_t> &scratch)
+{
+   for (int ic = 0; ic < count; ic++){
+      scratch[ic] = field[ic];
+   }
+   for (int ic = 0; ic < count; ic++){
+      field[ic] = scratch[order[ic]];
+   }
+}
+
+// Fills `scaled` with the `count` values of `index` rescaled so that the largest
+// one maps to 16. A non-positive maximum yields a scale of 0 instead of a
+// division by zero.
+static void partition_scale_indices(const int *index, size_t count, vector<int> &scaled)
+{
+   scaled.resize(count);
+
+   int maxval = 0;
+   for (size_t ic = 0; ic < count; ++ic){
+      if (index[ic] > maxval) maxval = index[ic];
+   }
+
+   const double scale = (maxval > 0) ? 16.0 / (double)maxval : 0.0;
+   for (size_t ic = 0; ic < count; ++ic){
+      scaled[ic] = (int)((double)index[ic]*scale);
+   }
+}
+
+// Computes a new cell ordering with the requested method and reorders the mesh
+// data accordingly.
+//
+//   numpe    number of processes (sizes the MPI bookkeeping, or the work-item
+//            distribution in the serial case)
+//   z_order  receives the resulting index ordering; it is not resized here, so the
+//            caller must provide at least ncells entries
+//   method   ORIGINAL_ORDER sets z_order to the identity and returns;
+//            HILBERT_SORT orders cell centres with hsfc2sort;
+//            ZORDER orders the scaled i/j indices with calc_zorder;
+//            any other value (e.g. HILBERT_PARTITION) leaves the mesh untouched.
+//
+// The elapsed time is added to cpu_timers[MESH_TIMER_PARTITION].
+void Mesh::partition_cells(
+   int numpe,                     //
+   vector<int> &z_order,          // Resulting index ordering.
+   enum partition_method method)  // Assigned partitioning method.
+{
+   double iscale,                 // Scale factors mapping cell centres onto [0, 1].
+          jscale;
+   vector<int> z_index;           // Ordered curve index from calc_zorder.
+   vector<int> i_scaled;          // x-indices rescaled for calc_zorder.
+   vector<int> j_scaled;          // y-indices rescaled for calc_zorder.
+   vector<double> iunit;          // x cell centres normalized for hsfc.
+   vector<double> junit;          // y cell centres normalized for hsfc.
+
+   bool reorder_local = false;    // Serial path: an order was computed into z_order.
+#ifdef HAVE_MPI
+   bool reorder_global = false;   // MPI path: an order was computed into z_order_global.
+   vector<int> z_order_global;    // Ordering of all cells across all ranks.
+#endif
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   // Initialize ordered curve index.
+   z_index.resize(ncells, 0);
+
+   if (parallel) {
+#ifdef HAVE_MPI
+      nsizes.resize(numpe);
+      ndispl.resize(numpe);
+      MPI_Allgather(&ncells, 1, MPI_INT, &nsizes[0], 1, MPI_INT, MPI_COMM_WORLD);
+      ndispl[0]=0;
+      for (int ip=1; ip<numpe; ip++){
+         ndispl[ip] = ndispl[ip-1] + nsizes[ip-1];
+      }
+      noffset=0;
+      for (int ip=0; ip<mype; ip++){
+         noffset += nsizes[ip];
+      }
+#endif
+   } else {
+      // Adjust the number of required work items to the number of cells.
+      proc.resize(ncells);
+      // Decompose the domain equitably.
+      calc_distribution(numpe);
+      noffset = 0;
+   }
+
+   // Partition cells according to one of several possible orderings.
+   int have_spatial_variables=0;
+   switch (method)
+   {
+      case ORIGINAL_ORDER:
+         // Set z_order to the current cell order.
+         for (uint ic = 0; ic < ncells; ++ic){
+            z_order[ic] = ic;
+         }
+
+         cpu_timers[MESH_TIMER_PARTITION] += cpu_timer_stop(tstart_cpu);
+
+         return;
+
+      case HILBERT_SORT:
+         // Resort the curve by Hilbert order. Spatial coordinates are computed
+         // temporarily if the mesh does not currently hold them.
+         have_spatial_variables = 1;
+         if (x.size() < ncells) {
+            calc_spatial_coordinates(0);
+            have_spatial_variables = 0;
+         }
+         calc_centerminmax();
+         iunit.resize(ncells);
+         junit.resize(ncells);
+
+         // Get the range of values in the x- and y-directions and make the scale square.
+         // A degenerate (zero-width) range maps every centre to 0 rather than
+         // dividing by zero.
+         iscale = (xcentermax > xcentermin) ? 1.0 / (xcentermax - xcentermin) : 0.0;
+         jscale = (ycentermax > ycentermin) ? 1.0 / (ycentermax - ycentermin) : 0.0;
+
+         // Scale the cell centres to a normalized [0, 1] range for hsfc.
+         for (uint ic = 0; ic < ncells; ++ic){
+            iunit[ic] = (x[ic] + 0.5 * dx[ic] - xcentermin) * iscale;
+            junit[ic] = (y[ic] + 0.5 * dy[ic] - ycentermin) * jscale;
+         }
+
+         // Drop the coordinates again if they were only computed for this sort.
+         if (have_spatial_variables == 0){
+            x.clear();
+            dx.clear();
+            y.clear();
+            dy.clear();
+         }
+
+         if (parallel){
+#ifdef HAVE_MPI
+            vector<double> iunit_global(ncells_global);
+            vector<double> junit_global(ncells_global);
+            vector<int> info(3 * ncells_global);
+            z_order_global.resize(ncells_global);
+
+            MPI_Allgatherv(&iunit[0], ncells, MPI_DOUBLE, &iunit_global[0], &nsizes[0], &ndispl[0], MPI_DOUBLE, MPI_COMM_WORLD);
+            MPI_Allgatherv(&junit[0], ncells, MPI_DOUBLE, &junit_global[0], &nsizes[0], &ndispl[0], MPI_DOUBLE, MPI_COMM_WORLD);
+
+            // Sort the mesh into an ordered space-filling curve from hsfc.
+            hsfc2sort(ncells_global, &iunit_global[0], &junit_global[0], 0, &info[0], 1);
+
+            // Copy the cell order information from info into z_order.
+            for (uint ic = 0; ic < ncells_global; ++ic){
+               z_order_global[ic] = info[ic];
+            }
+            reorder_global = true;
+#endif
+         } else {
+            vector<int> info(3 * ncells);
+
+            // Sort the mesh into an ordered space-filling curve from hsfc.
+            hsfc2sort(ncells, &iunit[0], &junit[0], 0, &info[0], 1);
+
+            // Copy the cell order information from info into z_order.
+            for (uint ic = 0; ic < ncells; ++ic){
+               z_order[ic] = info[ic];
+            }
+            reorder_local = true;
+         }
+
+         break;
+
+      case ZORDER:
+         // Resort the curve by z-order.
+         if (parallel) {
+#ifdef HAVE_MPI
+            vector<int> i_global(ncells_global);
+            vector<int> j_global(ncells_global);
+            vector<int> level_global(ncells_global);
+            vector<int> z_index_global(ncells_global);
+            z_order_global.resize(ncells_global);
+
+            // i, j and level are int arrays, so they are exchanged as MPI_INT.
+            MPI_Allgatherv(&i[0], ncells, MPI_INT, &i_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            MPI_Allgatherv(&j[0], ncells, MPI_INT, &j_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+            MPI_Allgatherv(&level[0], ncells, MPI_INT, &level_global[0], &nsizes[0], &ndispl[0], MPI_INT, MPI_COMM_WORLD);
+
+            partition_scale_indices(&i_global[0], ncells_global, i_scaled);
+            partition_scale_indices(&j_global[0], ncells_global, j_scaled);
+
+            calc_zorder(ncells_global, &i_scaled[0], &j_scaled[0], &level_global[0], levmx, ibase, &z_index_global[0], &z_order_global[0]);
+            reorder_global = true;
+#endif
+         } else {
+            partition_scale_indices(&i[0], ncells, i_scaled);
+            partition_scale_indices(&j[0], ncells, j_scaled);
+
+            calc_zorder(ncells, &i_scaled[0], &j_scaled[0], &level[0], levmx, ibase, &z_index[0], &z_order[0]);
+            reorder_local = true;
+         }
+
+         break;
+
+      default:
+         // Note that HILBERT_PARTITION is not currently supported due to redundancy with HILBERT_SORT.
+         break;
+   }
+
+#ifdef HAVE_MPI
+   if (reorder_global) {
+      // Order the mesh according to the calculated order (note that z_order is for both curves).
+      const int ntotal = (int)ncells_global;
+      vector<int> int_global(ncells_global);
+      vector<int> int_global_new(ncells_global);
+
+      // gather, reorder and scatter i, j and level
+      partition_gather_reorder_scatter(&i[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                       z_order_global, NULL, int_global, int_global_new);
+      partition_gather_reorder_scatter(&j[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                       z_order_global, NULL, int_global, int_global_new);
+      partition_gather_reorder_scatter(&level[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                       z_order_global, NULL, int_global, int_global_new);
+
+      // It is faster just to recalculate these variables instead of communicating them
+      if (mesh_memory.get_memory_size(celltype) >= ncells) {
+         calc_celltype(mesh_memory.get_memory_size(celltype));
+      }
+
+      // The Hilbert path remembers whether coordinates existed on entry (they
+      // may have been cleared above); the z-order path checks their current size.
+      const bool recompute_spatial = (method == HILBERT_SORT) ? (have_spatial_variables != 0)
+                                                              : (x.size() >= ncells);
+      if (recompute_spatial) {
+         calc_spatial_coordinates(0);
+      }
+
+      if (mesh_memory.get_memory_size(nlft) >= ncells) {
+         // Neighbour arrays hold cell indices, so besides being moved they are
+         // renumbered through the inverse of the ordering.
+         vector<int> inv_z_order(ncells_global);
+         for (int ic = 0; ic < ntotal; ic++){
+            inv_z_order[z_order_global[ic]] = ic;
+         }
+
+         partition_gather_reorder_scatter(&nlft[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                          z_order_global, &inv_z_order, int_global, int_global_new);
+         partition_gather_reorder_scatter(&nrht[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                          z_order_global, &inv_z_order, int_global, int_global_new);
+         partition_gather_reorder_scatter(&nbot[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                          z_order_global, &inv_z_order, int_global, int_global_new);
+         partition_gather_reorder_scatter(&ntop[0], ncells, ntotal, &nsizes[0], &ndispl[0],
+                                          z_order_global, &inv_z_order, int_global, int_global_new);
+      }
+
+      // Hand every rank its slice of the global ordering (int data, hence MPI_INT).
+      MPI_Scatterv(&z_order_global[0], &nsizes[0], &ndispl[0], MPI_INT, &z_order[0], ncells, MPI_INT, 0, MPI_COMM_WORLD);
+   }
+#endif
+
+   if (reorder_local) {
+      // Order the mesh according to the calculated order (note that z_order is for both curves).
+      mesh_memory.set_memory_attribute(nlft, 0x100);
+      mesh_memory.set_memory_attribute(nrht, 0x100);
+      mesh_memory.set_memory_attribute(nbot, 0x100);
+      mesh_memory.set_memory_attribute(ntop, 0x100);
+
+      mesh_memory.memory_reorder_all(&z_order[0]);
+      memory_reset_ptrs();
+
+      if (x.size() >= ncells) {
+         vector<spatial_t> real_local(ncells);
+
+         partition_permute_field(x,  z_order, (int)ncells, real_local);
+         partition_permute_field(dx, z_order, (int)ncells, real_local);
+         partition_permute_field(y,  z_order, (int)ncells, real_local);
+         partition_permute_field(dy, z_order, (int)ncells, real_local);
+      }
+   }
+
+   // Output ordered mesh information.
+   // NOTE(review): the header lists "i j" but the arguments pass j before i, and
+   // x/dx/y/dy may have been cleared on the Hilbert path -- confirm before
+   // relying on this debug table.
+   if (DEBUG)
+   {  printf("orig index i j lev nlft nrht nbot ntop xlow xhigh ylow yhigh z index z order\n");
+      for (uint ic=0; ic<ncells; ic++){
+         printf(" %6d %4d %4d %4d %4d %4d %4d %4d ", index[ic], j[ic], i[ic], level[ic], nlft[ic], nrht[ic], nbot[ic], ntop[ic]);
+         printf(" %8.2lf %8.2lf %8.2lf %8.2lf", x[ic], x[ic]+dx[ic], y[ic], y[ic]+dy[ic]);
+         printf(" %6d %5d\n", z_index[ic], z_order[ic]); } }
+
+   cpu_timers[MESH_TIMER_PARTITION] += cpu_timer_stop(tstart_cpu);
+}
+
+// Assigns every entry of proc to one of numpe work items in contiguous,
+// near-equal ranges: each work item receives proc.size()/numpe entries and the
+// first proc.size()%numpe work items receive one extra. Does nothing when numpe
+// is not positive.
+void Mesh::calc_distribution(int numpe)
+{
+   if (numpe <= 0) return;
+
+   const uint ntotal = (uint)proc.size();
+   const uint base   = ntotal/(uint)numpe;
+   const uint extra  = ntotal%(uint)numpe;
+
+   // Each range starts where the previous one ended. Restarting at index 0 for
+   // every work item would overwrite the earlier assignments and leave every
+   // cell owned by the last work item.
+   uint first = 0;
+   for (int ip = 0; ip < numpe; ++ip) {
+      const uint last = first + base + (((uint)ip < extra) ? 1 : 0);
+      for (uint ic = first; ic < last; ic++) {
+         proc[ic] = ip;
+      }
+      first = last;
+   }
+}
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/partition.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/partition.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef PARTITION_H
+#define PARTITION_H
+
+#include <vector>
+
+#include "input.h"
+
+using namespace std;
+
+// Cell-ordering methods accepted by Mesh::partition_cells and reported by
+// Mesh::print_partition_type.
+enum partition_method {
+ ORIGINAL_ORDER, // keep the current cell order (identity ordering)
+ HILBERT_SORT, // order cell centres along a Hilbert curve via hsfc2sort
+ HILBERT_PARTITION, // not handled by partition_cells (noted there as redundant with HILBERT_SORT)
+ ZORDER // order cells via calc_zorder on the scaled i/j indices
+};
+
+// Partition-quality measures evaluated per tile by Mesh::partition_measure.
+enum partition_measure {
+ NO_PARTITION_MEASURE, // no measurement is taken
+ WITH_DUPLICATES, // every off-tile neighbour reference counts, relative to TILE_SIZE
+ WITHOUT_DUPLICATES, // distinct off-tile cells, relative to TILE_SIZE
+ CVALUE, // distinct off-tile cells, relative to 4*sqrt(TILE_SIZE)
+ CSTARVALUE // like CVALUE but also weighted by the distinct cache lines touched
+};
+
+
+// NOTE(review): partition.cpp defines Mesh::calc_distribution(int numpe) as a
+// member function; no free function with this signature is defined there, so
+// this prototype looks stale -- confirm nothing else relies on it.
+void calc_distribution(int numpe, vector<int> &proc);
+//void partition_cells(int numpe, vector<int> &proc, Mesh &mesh, enum partition_method method);
+
+// Pointer to a function taking two double pointers and an opaque pointer.
+// Presumably a coordinate-normalization callback for the hsfc routines; it is
+// not referenced by the declarations in this header -- TODO confirm its use.
+typedef void (*maptonorm)( double * , double * , void * );
+
+// C-linkage routine that computes a Hilbert space-filling-curve ordering of N
+// points. Mesh::partition_cells calls it with coordinates normalized to [0, 1],
+// ibase = 0 and LDInfo = 1, and then reads the first N entries of Info as the
+// new cell order.
+// NOTE(review): ibase is described here as a stride but as the index base
+// (0 for C, 1 for Fortran) in hsfc2part -- confirm which is meant.
+extern "C" void hsfc2sort(
+ const int N , /* IN: Number of points */
+ const double * X , /* IN: array of X-Coordinates */
+ const double * Y , /* IN: array of Y-Coordinates */
+ const int ibase , /* IN: Stride for Y array */
+ int * Info , /* OUT: (1 <= LDInfo) [ HSFC ordering ]
+ (2 <= LDInfo) [ HSFC index, #1 ]
+ (3 <= LDInfo) [ HSFC index, #2 ] */
+ int LDInfo /* IN: Leading dimension of Info */
+ );
+
+// C-linkage routine that partitions N points into NPart parts along a Hilbert
+// space-filling curve. Info carries the computational weights on input and the
+// results listed below on output, depending on LDInfo. It is not called from
+// partition.cpp.
+extern "C" void hsfc2part(
+ const int Level , /* IN: Background grid level of partitioning */
+ const int Limit , /* IN: Number of levels to consider for 'gaps' */
+ const int NPart , /* IN: Target number of partitions */
+ const int N , /* IN: Number of points */
+ const double * X , /* IN: array of X-Coordinates */
+ const double * Y , /* IN: array of Y-Coordinates */
+ const int ibase , /* IN: Base - 0 for C, 1 for Fortran */
+ int * Info , /* IN: Array of computational weights,
+ OUT: (1 <= LDInfo) [ Partitioning ]
+ (2 <= LDInfo) [ Adjusted HSFC ordering ]
+ (3 <= LDInfo) [ Original HSFC index, #1 ]
+ (4 <= LDInfo) [ Original HSFC index, #2 ] */
+ int LDInfo );/* IN: Leading dimension of Info */
+
+
+#endif /* PARTITION_H */
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/reduce.c?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.c Mon Sep 4 07:25:36 2017
@@ -0,0 +1,245 @@
+/**
+ * Copyright (c) 2011, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#include "reduce.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+
+#ifdef HAVE_OPENCL
+#include "reduce_kernel.inc"
+#endif
+
+/*
+ * Creates all twelve reduction kernels from reduce_source and stores each
+ * handle in its kernel_reduce_* global, in the order listed in the table.
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernels_reduce(void)
+{
+#ifdef HAVE_OPENCL
+   static const struct {
+      cl_kernel  *slot;   /* global that receives the kernel handle */
+      const char *name;   /* kernel name looked up in reduce_source */
+   } kernels[] = {
+      { &kernel_reduce_sum,               "reduce_sum_cl"               },
+      { &kernel_reduce_sum_stage1of2,     "reduce_sum_stage1of2_cl"     },
+      { &kernel_reduce_sum_stage2of2,     "reduce_sum_stage2of2_cl"     },
+      { &kernel_reduce_sum_int_stage1of2, "reduce_sum_int_stage1of2_cl" },
+      { &kernel_reduce_sum_int_stage2of2, "reduce_sum_int_stage2of2_cl" },
+      { &kernel_reduce_product,           "reduce_product_cl"           },
+      { &kernel_reduce_max,               "reduce_max_cl"               },
+      { &kernel_reduce_max_stage1of2,     "reduce_max_stage1of2_cl"     },
+      { &kernel_reduce_max_stage2of2,     "reduce_max_stage2of2_cl"     },
+      { &kernel_reduce_min,               "reduce_min_cl"               },
+      { &kernel_reduce_min_stage1of2,     "reduce_min_stage1of2_cl"     },
+      { &kernel_reduce_min_stage2of2,     "reduce_min_stage2of2_cl"     }
+   };
+   const int nkernels = (int)(sizeof(kernels) / sizeof(kernels[0]));
+   cl_context ctx = ezcl_get_context();
+   int k;
+
+   for (k = 0; k < nkernels; k++) {
+      *kernels[k].slot = ezcl_create_kernel_wsource(ctx, reduce_source, kernels[k].name);
+   }
+#endif
+}
+
+/*
+ * Creates the "reduce_sum_cl" kernel from reduce_source and stores it in
+ * kernel_reduce_sum.  The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_sum(void)
+{
+#ifdef HAVE_OPENCL
+   kernel_reduce_sum =
+      ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_sum_cl");
+#endif
+}
+
+/*
+ * Creates the two kernels "reduce_sum_stage1of2_cl" and
+ * "reduce_sum_stage2of2_cl" from reduce_source and stores them in
+ * kernel_reduce_sum_stage1of2 / kernel_reduce_sum_stage2of2.
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_2stage_sum(void)
+{
+#ifdef HAVE_OPENCL
+   cl_context ctx = ezcl_get_context();
+
+   kernel_reduce_sum_stage1of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage1of2_cl");
+   kernel_reduce_sum_stage2of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_stage2of2_cl");
+#endif
+}
+
+/*
+ * Releases the two kernels held in kernel_reduce_sum_stage1of2 and
+ * kernel_reduce_sum_stage2of2 (the ones init_kernel_2stage_sum creates).
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void terminate_kernel_2stage_sum(void)
+{
+#ifdef HAVE_OPENCL
+   /* stage 1 first, then stage 2 */
+   ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+#endif
+}
+
+/*
+ * Creates the two kernels "reduce_sum_int_stage1of2_cl" and
+ * "reduce_sum_int_stage2of2_cl" from reduce_source and stores them in
+ * kernel_reduce_sum_int_stage1of2 / kernel_reduce_sum_int_stage2of2.
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_2stage_sum_int(void)
+{
+#ifdef HAVE_OPENCL
+   cl_context ctx = ezcl_get_context();
+
+   kernel_reduce_sum_int_stage1of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage1of2_cl");
+   kernel_reduce_sum_int_stage2of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_sum_int_stage2of2_cl");
+#endif
+}
+
+/*
+ * Releases the two kernels held in kernel_reduce_sum_int_stage1of2 and
+ * kernel_reduce_sum_int_stage2of2 (the ones init_kernel_2stage_sum_int
+ * creates).  The body is empty when HAVE_OPENCL is not defined.
+ */
+void terminate_kernel_2stage_sum_int(void)
+{
+#ifdef HAVE_OPENCL
+   /* stage 1 first, then stage 2 */
+   ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+#endif
+}
+
+/*
+ * Creates the "reduce_product_cl" kernel from reduce_source and stores it
+ * in kernel_reduce_product.  The body is empty when HAVE_OPENCL is not
+ * defined.
+ */
+void init_kernel_product(void)
+{
+#ifdef HAVE_OPENCL
+   kernel_reduce_product =
+      ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_product_cl");
+#endif
+}
+
+/*
+ * Creates the "reduce_max_cl" kernel from reduce_source and stores it in
+ * kernel_reduce_max.  The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_max(void)
+{
+#ifdef HAVE_OPENCL
+   kernel_reduce_max =
+      ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_max_cl");
+#endif
+}
+
+/*
+ * Creates the two kernels "reduce_max_stage1of2_cl" and
+ * "reduce_max_stage2of2_cl" from reduce_source and stores them in
+ * kernel_reduce_max_stage1of2 / kernel_reduce_max_stage2of2.
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_2stage_max(void)
+{
+#ifdef HAVE_OPENCL
+   cl_context ctx = ezcl_get_context();
+
+   kernel_reduce_max_stage1of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage1of2_cl");
+   kernel_reduce_max_stage2of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_max_stage2of2_cl");
+#endif
+}
+
+/*
+ * Creates the "reduce_min_cl" kernel from reduce_source and stores it in
+ * kernel_reduce_min.  The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_min(void)
+{
+#ifdef HAVE_OPENCL
+   kernel_reduce_min =
+      ezcl_create_kernel_wsource(ezcl_get_context(), reduce_source, "reduce_min_cl");
+#endif
+}
+
+/*
+ * Creates the two kernels "reduce_min_stage1of2_cl" and
+ * "reduce_min_stage2of2_cl" from reduce_source and stores them in
+ * kernel_reduce_min_stage1of2 / kernel_reduce_min_stage2of2.
+ * The body is empty when HAVE_OPENCL is not defined.
+ */
+void init_kernel_2stage_min(void)
+{
+#ifdef HAVE_OPENCL
+   cl_context ctx = ezcl_get_context();
+
+   kernel_reduce_min_stage1of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage1of2_cl");
+   kernel_reduce_min_stage2of2 =
+      ezcl_create_kernel_wsource(ctx, reduce_source, "reduce_min_stage2of2_cl");
+#endif
+}
+
+/*
+ * Releases all twelve reduction kernels (the set init_kernels_reduce
+ * creates), in the same order they are created there.  The body is empty
+ * when HAVE_OPENCL is not defined.
+ *
+ * Declared with (void) so the definition is a real prototype in C,
+ * matching the init_* functions in this file.
+ */
+void release_kernels_reduce(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_sum);
+   ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+   ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+   ezcl_kernel_release(kernel_reduce_product);
+   ezcl_kernel_release(kernel_reduce_max);
+   ezcl_kernel_release(kernel_reduce_max_stage1of2);
+   ezcl_kernel_release(kernel_reduce_max_stage2of2);
+   ezcl_kernel_release(kernel_reduce_min);
+   ezcl_kernel_release(kernel_reduce_min_stage1of2);
+   ezcl_kernel_release(kernel_reduce_min_stage2of2);
+#endif
+}
+
+/*
+ * Releases the kernel held in kernel_reduce_sum.  The body is empty when
+ * HAVE_OPENCL is not defined.  Declared with (void) so the definition is a
+ * real prototype in C, matching the init_* functions in this file.
+ */
+void release_kernel_sum(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_sum);
+#endif
+}
+
+/*
+ * Releases the kernels held in kernel_reduce_sum_stage1of2 and
+ * kernel_reduce_sum_stage2of2.  The body is empty when HAVE_OPENCL is not
+ * defined.  Declared with (void) so the definition is a real prototype in
+ * C, matching the init_* functions in this file.
+ */
+void release_kernel_2stage_sum(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_sum_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_stage2of2);
+#endif
+}
+
+/*
+ * Releases the kernels held in kernel_reduce_sum_int_stage1of2 and
+ * kernel_reduce_sum_int_stage2of2.  The body is empty when HAVE_OPENCL is
+ * not defined.  Declared with (void) so the definition is a real prototype
+ * in C, matching the init_* functions in this file.
+ */
+void release_kernel_2stage_sum_int(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_sum_int_stage1of2);
+   ezcl_kernel_release(kernel_reduce_sum_int_stage2of2);
+#endif
+}
+
+/*
+ * Releases the kernel held in kernel_reduce_product.  The body is empty
+ * when HAVE_OPENCL is not defined.  Declared with (void) so the definition
+ * is a real prototype in C, matching the init_* functions in this file.
+ */
+void release_kernel_product(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_product);
+#endif
+}
+
+/*
+ * Releases the kernel held in kernel_reduce_max.  The body is empty when
+ * HAVE_OPENCL is not defined.  Declared with (void) so the definition is a
+ * real prototype in C, matching the init_* functions in this file.
+ */
+void release_kernel_max(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_max);
+#endif
+}
+
+/*
+ * Releases the kernels held in kernel_reduce_max_stage1of2 and
+ * kernel_reduce_max_stage2of2.  The body is empty when HAVE_OPENCL is not
+ * defined.  Declared with (void) so the definition is a real prototype in
+ * C, matching the init_* functions in this file.
+ */
+void release_kernel_2stage_max(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_max_stage1of2);
+   ezcl_kernel_release(kernel_reduce_max_stage2of2);
+#endif
+}
+
+/*
+ * Releases the kernel held in kernel_reduce_min.  The body is empty when
+ * HAVE_OPENCL is not defined.  Declared with (void) so the definition is a
+ * real prototype in C, matching the init_* functions in this file.
+ */
+void release_kernel_min(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_min);
+#endif
+}
+
+/*
+ * Releases the kernels held in kernel_reduce_min_stage1of2 and
+ * kernel_reduce_min_stage2of2.  The body is empty when HAVE_OPENCL is not
+ * defined.  Declared with (void) so the definition is a real prototype in
+ * C, matching the init_* functions in this file.
+ */
+void release_kernel_2stage_min(void)
+{
+#ifdef HAVE_OPENCL
+   ezcl_kernel_release(kernel_reduce_min_stage1of2);
+   ezcl_kernel_release(kernel_reduce_min_stage2of2);
+#endif
+}
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/reduce.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/reduce.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef _REDUCE_H_
+#define _REDUCE_H_
+
+/* The OpenCL header is only needed (and only included) with HAVE_OPENCL. */
+#ifdef HAVE_OPENCL
+#ifdef __APPLE_CC__
+#include <OpenCL/OpenCL.h>
+#else
+#include "CL/cl.h"
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifdef HAVE_OPENCL
+/*
+ * Kernel handles assigned by the init_* functions and passed to
+ * ezcl_kernel_release() by the release_* / terminate_* functions.
+ * NOTE(review): these are definitions, not extern declarations, so every
+ * translation unit that includes this header defines them.  Consider
+ * declaring them extern here and defining them once in reduce.c.
+ */
+cl_kernel kernel_reduce_sum,
+          kernel_reduce_sum_stage1of2,
+          kernel_reduce_sum_stage2of2,
+          kernel_reduce_sum_int_stage1of2,
+          kernel_reduce_sum_int_stage2of2,
+          kernel_reduce_product,
+          kernel_reduce_max,
+          kernel_reduce_max_stage1of2,
+          kernel_reduce_max_stage2of2,
+          kernel_reduce_min,
+          kernel_reduce_min_stage1of2,
+          kernel_reduce_min_stage2of2;
+#endif
+
+/* Kernel creation: all kernels at once, or one kernel / kernel pair each. */
+void init_kernels_reduce(void);
+void init_kernel_sum(void);
+void init_kernel_2stage_sum(void);
+void init_kernel_2stage_sum_int(void);
+void init_kernel_product(void);
+void init_kernel_max(void);
+void init_kernel_2stage_max(void);
+void init_kernel_min(void);
+void init_kernel_2stage_min(void);
+
+/* Release the two-stage sum kernel pairs. */
+void terminate_kernel_2stage_sum(void);
+void terminate_kernel_2stage_sum_int(void);
+
+/*
+ * Kernel release: all kernels at once, or one kernel / kernel pair each.
+ * Declared with (void) so that C callers get a checked prototype.
+ */
+void release_kernels_reduce(void);
+void release_kernel_sum(void);
+void release_kernel_2stage_sum(void);
+void release_kernel_2stage_sum_int(void);
+void release_kernel_product(void);
+void release_kernel_max(void);
+void release_kernel_2stage_max(void);
+void release_kernel_min(void);
+void release_kernel_2stage_min(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _REDUCE_H_ */
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/s7.c?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.c Mon Sep 4 07:25:36 2017
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "s7.h"
+
+void S7_Sort(
+ void * array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype
+ )
+{
+ int n, child, parent, i;
+
+ int qint;
+ long qlong;
+ long long qlonglong;
+ float qfloat;
+ double qdouble;
+
+ int
+ *int_data_ptr;
+ long
+ *long_data_ptr;
+ long long
+ *longlong_data_ptr;
+ float
+ *float_data_ptr;
+ double
+ *double_data_ptr;
+
+ // Heapsort
+
+ i=nsize/2;
+ n = nsize;
+
+ switch (S7_datatype){
+ case S7_INTEGER4:
+ case S7_INT:
+ int_data_ptr = (int *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ qint=int_data_ptr[--i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+ break; // End the sort here!
+ } // if n
+ qint=int_data_ptr[n];
+ int_data_ptr[n]=int_data_ptr[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && int_data_ptr[child+1] > int_data_ptr[child]) child++;
+ if (int_data_ptr[child] > qint) {
+ int_data_ptr[parent] = int_data_ptr[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break; // Break out of sift while loop
+ } // else
+ } // while
+ int_data_ptr[parent]=qint;
+ } // for
+
+ break;
+
+ case S7_LONG:
+ long_data_ptr = (long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ qlong=long_data_ptr[--i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+ break; // End the sort here!
+ } // if n
+ qlong=long_data_ptr[n];
+ long_data_ptr[n]=long_data_ptr[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && long_data_ptr[child+1] > long_data_ptr[child]) child++;
+ if (long_data_ptr[child] > qlong) {
+ long_data_ptr[parent] = long_data_ptr[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break; // Break out of sift while loop
+ } // else
+ } // while
+ long_data_ptr[parent]=qlong;
+ } // for
+
+ break;
+
+ case S7_LONG_LONG_INT:
+ case S7_INTEGER8:
+ longlong_data_ptr = (long long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ qlonglong=longlong_data_ptr[--i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+ break; // End the sort here!
+ } // if n
+ qlonglong=longlong_data_ptr[n];
+ longlong_data_ptr[n]=longlong_data_ptr[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && longlong_data_ptr[child+1] > longlong_data_ptr[child]) child++;
+ if (longlong_data_ptr[child] > qlonglong) {
+ longlong_data_ptr[parent] = longlong_data_ptr[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break; // Break out of sift while loop
+ } // else
+ } // while
+ longlong_data_ptr[parent]=qlonglong;
+ } // for
+
+ break;
+
+ case S7_FLOAT:
+ case S7_REAL4:
+ float_data_ptr = (float *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ qfloat=float_data_ptr[--i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+ break; // End the sort here!
+ } // if n
+ qfloat=float_data_ptr[n];
+ float_data_ptr[n]=float_data_ptr[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && float_data_ptr[child+1] > float_data_ptr[child]) child++;
+ if (float_data_ptr[child] > qfloat) {
+ float_data_ptr[parent] = float_data_ptr[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break; // Break out of sift while loop
+ } // else
+ } // while
+ float_data_ptr[parent]=qfloat;
+ } // for
+
+ break;
+
+ case S7_DOUBLE:
+ case S7_REAL8:
+ double_data_ptr = (double *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ qdouble=double_data_ptr[--i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+ break; // End the sort here!
+ } // if n
+ qdouble=double_data_ptr[n];
+ double_data_ptr[n]=double_data_ptr[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && double_data_ptr[child+1] > double_data_ptr[child]) child++;
+ if (double_data_ptr[child] > qdouble) {
+ double_data_ptr[parent] = double_data_ptr[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break; // Break out of sift while loop
+ } // else
+ } // while
+ double_data_ptr[parent]=qdouble;
+ } // for
+
+ break;
+
+ default:
+ printf("Error -- S7_Datatype not supported in S7_Sort\n");
+ exit(1);
+ break;
+
+ }
+}
+
+
+void S7_Sort_2Arrays(
+ void * array_in1,
+ void * array_in2,
+ const int nsize,
+ const enum S7_Datatype S7_datatype
+ )
+{
+ int n, child, parent, i;
+
+ int qint1, qint2;
+ long qlong1, qlong2;
+ long long qlonglong1, qlonglong2;
+ float qfloat1, qfloat2;
+ double qdouble1, qdouble2;
+
+ int
+ *int_data_ptr1, *int_data_ptr2;
+ long
+ *long_data_ptr1, *long_data_ptr2;
+ long long
+ *longlong_data_ptr1, *longlong_data_ptr2;
+ float
+ *float_data_ptr1, *float_data_ptr2;
+ double
+ *double_data_ptr1, *double_data_ptr2;
+
+ // Heapsort
+
+ i=nsize/2;
+ n = nsize;
+
+ switch (S7_datatype){
+ case S7_INTEGER4:
+ case S7_INT:
+ int_data_ptr1 = (int *)array_in1;
+ int_data_ptr2 = (int *)array_in2;
+
+ for (;;) {
+ if (i > 0) {
+ qint1=int_data_ptr1[--i];
+ qint2=int_data_ptr2[i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return; // End of sort
+ } // if n
+ qint1=int_data_ptr1[n];
+ qint2=int_data_ptr2[n];
+ int_data_ptr1[n]=int_data_ptr1[0];
+ int_data_ptr2[n]=int_data_ptr2[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && int_data_ptr1[child+1] > int_data_ptr1[child]) child++;
+ if (int_data_ptr1[child] > qint1) {
+ int_data_ptr1[parent] = int_data_ptr1[child];
+ int_data_ptr2[parent] = int_data_ptr2[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ int_data_ptr1[parent]=qint1;
+ int_data_ptr2[parent]=qint2;
+ } // for
+ break;
+
+ case S7_LONG:
+ long_data_ptr1 = (long *)array_in1;
+ long_data_ptr2 = (long *)array_in2;
+
+ for (;;) {
+ if (i > 0) {
+ qlong1=long_data_ptr1[--i];
+ qlong2=long_data_ptr2[i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return; // End of sort
+ } // if n
+ qlong1=long_data_ptr1[n];
+ qlong2=long_data_ptr2[n];
+ long_data_ptr1[n]=long_data_ptr1[0];
+ long_data_ptr2[n]=long_data_ptr2[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && long_data_ptr1[child+1] > long_data_ptr1[child]) child++;
+ if (long_data_ptr1[child] > qlong1) {
+ long_data_ptr1[parent] = long_data_ptr1[child];
+ long_data_ptr2[parent] = long_data_ptr2[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ long_data_ptr1[parent]=qlong1;
+ long_data_ptr2[parent]=qlong2;
+ } // for
+ break;
+
+ case S7_LONG_LONG_INT:
+ case S7_INTEGER8:
+ longlong_data_ptr1 = (long long *)array_in1;
+ longlong_data_ptr2 = (long long *)array_in2;
+
+ for (;;) {
+ if (i > 0) {
+ qlonglong1=longlong_data_ptr1[--i];
+ qlonglong2=longlong_data_ptr2[i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return; // End of sort
+ } // if n
+ qlonglong1=longlong_data_ptr1[n];
+ qlonglong2=longlong_data_ptr2[n];
+ longlong_data_ptr1[n]=longlong_data_ptr1[0];
+ longlong_data_ptr2[n]=longlong_data_ptr2[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && longlong_data_ptr1[child+1] > longlong_data_ptr1[child]) child++;
+ if (longlong_data_ptr1[child] > qlonglong1) {
+ longlong_data_ptr1[parent] = longlong_data_ptr1[child];
+ longlong_data_ptr2[parent] = longlong_data_ptr2[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ longlong_data_ptr1[parent]=qlonglong1;
+ longlong_data_ptr2[parent]=qlonglong2;
+ } // for
+ break;
+
+ case S7_FLOAT:
+ case S7_REAL4:
+ float_data_ptr1 = (float *)array_in1;
+ float_data_ptr2 = (float *)array_in2;
+
+ for (;;) {
+ if (i > 0) {
+ qfloat1=float_data_ptr1[--i];
+ qfloat2=float_data_ptr2[i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return; // End of sort
+ } // if n
+ qfloat1=float_data_ptr1[n];
+ qfloat2=float_data_ptr2[n];
+ float_data_ptr1[n]=float_data_ptr1[0];
+ float_data_ptr2[n]=float_data_ptr2[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && float_data_ptr1[child+1] > float_data_ptr1[child]) child++;
+ if (float_data_ptr1[child] > qfloat1) {
+ float_data_ptr1[parent] = float_data_ptr1[child];
+ float_data_ptr2[parent] = float_data_ptr2[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ float_data_ptr1[parent]=qfloat1;
+ float_data_ptr2[parent]=qfloat2;
+ } // for
+ break;
+
+ case S7_DOUBLE:
+ case S7_REAL8:
+ double_data_ptr1 = (double *)array_in1;
+ double_data_ptr2 = (double *)array_in2;
+
+ for (;;) {
+ if (i > 0) {
+ qdouble1=double_data_ptr1[--i];
+ qdouble2=double_data_ptr2[i];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return; // End of sort
+ } // if n
+ qdouble1=double_data_ptr1[n];
+ qdouble2=double_data_ptr2[n];
+ double_data_ptr1[n]=double_data_ptr1[0];
+ double_data_ptr2[n]=double_data_ptr2[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && double_data_ptr1[child+1] > double_data_ptr1[child]) child++;
+ if (double_data_ptr1[child] > qdouble1) {
+ double_data_ptr1[parent] = double_data_ptr1[child];
+ double_data_ptr2[parent] = double_data_ptr2[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ double_data_ptr1[parent]=qdouble1;
+ double_data_ptr2[parent]=qdouble2;
+ } // for
+ break;
+
+ default:
+ printf("Error -- S7_Datatype not supported in S7_Sort\n");
+ exit(1);
+ break;
+ }
+}
+
+
+
+
+
+void S7_Index_Sort(
+ void * array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype,
+ int * index
+ )
+{
+ int n, j, child, parent, i;
+ int indext;
+
+ int qint;
+ long qlong;
+ long long qlonglong;
+ float qfloat;
+ double qdouble;
+
+ int
+ *int_data_ptr;
+ long
+ *long_data_ptr;
+ long long
+ *longlong_data_ptr;
+ float
+ *float_data_ptr;
+ double
+ *double_data_ptr;
+
+ // Heapsort
+
+ // Initialize array with consecutive integers
+ for (j=0; j<nsize; j++) index[j]=j;
+
+ i=nsize/2;
+ n = nsize;
+
+ switch (S7_datatype){
+ case S7_INTEGER4:
+ case S7_INT:
+ int_data_ptr = (int *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qint=int_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qint=int_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && int_data_ptr[index[child+1]] > int_data_ptr[index[child]]) child++;
+ if (int_data_ptr[index[child]] > qint) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_LONG:
+ long_data_ptr = (long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qlong=long_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qlong=long_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && long_data_ptr[index[child+1]] > long_data_ptr[index[child]]) child++;
+ if (long_data_ptr[index[child]] > qlong) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_LONG_LONG_INT:
+ case S7_INTEGER8:
+ longlong_data_ptr = (long long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qlonglong=longlong_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qlonglong=longlong_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && longlong_data_ptr[index[child+1]] > longlong_data_ptr[index[child]]) child++;
+ if (longlong_data_ptr[index[child]] > qlonglong) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_FLOAT:
+ case S7_REAL4:
+ float_data_ptr = (float *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qfloat=float_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qfloat=float_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && float_data_ptr[index[child+1]] > float_data_ptr[index[child]]) child++;
+ if (float_data_ptr[index[child]] > qfloat) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+
+ case S7_DOUBLE:
+ case S7_REAL8:
+ double_data_ptr = (double *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qdouble=double_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qdouble=double_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && double_data_ptr[index[child+1]] > double_data_ptr[index[child]]) child++;
+ if (double_data_ptr[index[child]] > qdouble) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ default:
+ printf("Error -- S7_Datatype not supported in S7_Index_Sort\n");
+ exit(1);
+ break;
+
+ }
+}
+
+void S7_Indexi8_Sort(
+ void * array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype,
+ long * index
+ )
+{
+ int n, j, child, parent, i;
+ long indext;
+
+ int qint;
+ long qlong;
+ long long qlonglong;
+ float qfloat;
+ double qdouble;
+
+ int
+ *int_data_ptr;
+ long
+ *long_data_ptr;
+ long long
+ *longlong_data_ptr;
+ float
+ *float_data_ptr;
+ double
+ *double_data_ptr;
+
+ // Heapsort
+
+ // Initialize array with consecutive integers
+ for (j=0; j<nsize; j++) index[j]=j;
+
+ i=nsize/2;
+ n = nsize;
+
+ switch (S7_datatype){
+ case S7_INTEGER4:
+ case S7_INT:
+ int_data_ptr = (int *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qint=int_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qint=int_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && int_data_ptr[index[child+1]] > int_data_ptr[index[child]]) child++;
+ if (int_data_ptr[index[child]] > qint) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_LONG:
+ long_data_ptr = (long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qlong=long_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qlong=long_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && long_data_ptr[index[child+1]] > long_data_ptr[index[child]]) child++;
+ if (long_data_ptr[index[child]] > qlong) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_LONG_LONG_INT:
+ case S7_INTEGER8:
+ longlong_data_ptr = (long long *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qlonglong=longlong_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qlonglong=longlong_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && longlong_data_ptr[index[child+1]] > longlong_data_ptr[index[child]]) child++;
+ if (longlong_data_ptr[index[child]] > qlonglong) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ case S7_FLOAT:
+ case S7_REAL4:
+ float_data_ptr = (float *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qfloat=float_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qfloat=float_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && float_data_ptr[index[child+1]] > float_data_ptr[index[child]]) child++;
+ if (float_data_ptr[index[child]] > qfloat) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+
+ case S7_DOUBLE:
+ case S7_REAL8:
+ double_data_ptr = (double *)array_in;
+
+ for (;;) {
+ if (i > 0) {
+ indext=index[--i];
+ qdouble=double_data_ptr[indext];
+ } // if i > 0
+ else {
+ n--;
+ if (n == 0) {
+
+ return;
+ } // if n
+ indext=index[n];
+ qdouble=double_data_ptr[indext];
+ index[n]=index[0];
+ } // else
+
+ parent=i;
+ child = i*2 + 1;
+ while (child < n) {
+ if (child +1 < n && double_data_ptr[index[child+1]] > double_data_ptr[index[child]]) child++;
+ if (double_data_ptr[index[child]] > qdouble) {
+ index[parent] = index[child];
+ parent=child;
+ child = parent*2 + 1;
+ } // if q
+ else {
+ break;
+ } // else
+ } // while
+ index[parent]=indext;
+ } // for
+ break;
+
+ default:
+ printf("Error -- S7_Datatype not supported in S7_Indexi8_Sort\n");
+ exit(1);
+ break;
+
+ }
+}
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/s7.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/s7.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef S7_H_
+#define S7_H_
+
+//#define _S7_DEBUG
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+ /*
+ * Some S7 parameters.
+ */
+
+#define S7_OK 0 /* Successful return. */
+
+ /*
+ * Element-type tags handed to the S7 sort routines so they know how to
+ * interpret the untyped (void *) input array.
+ * The three groups below look like raw/generic types, C types and
+ * Fortran-style types -- TODO confirm against the S7 implementation.
+ * In S7_Indexi8_Sort, S7_FLOAT and S7_REAL4 share the float code path and
+ * S7_DOUBLE and S7_REAL8 share the double code path.
+ */
+ enum S7_Datatype
+ {
+ S7_GENERIC8 = 0,
+ S7_BYTE,
+ S7_PACKED,
+
+ S7_CHAR,
+ S7_INT,
+ S7_LONG,
+ S7_LONG_LONG_INT,
+ S7_FLOAT,
+ S7_DOUBLE,
+
+ S7_CHARACTER,
+ S7_LOGICAL,
+ S7_INTEGER4,
+ S7_INTEGER8,
+ S7_REAL4,
+ S7_REAL8,
+
+ /* Aliases for the first and last enumerator, usable for range checks. */
+ S7_DATATYPE_MIN = S7_GENERIC8,
+ S7_DATATYPE_MAX = S7_REAL8
+ };
+
+
+ /*
+ * Generic entry points: the element type of the untyped array(s) is
+ * selected at run time through S7_datatype; nsize is the element count.
+ * NOTE(review): the implementations are not visible from this header;
+ * presumably these sort in place -- confirm against s7.c.
+ */
+ void S7_Sort(
+ void *array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype
+ );
+
+ void S7_Sort_2Arrays(
+ void * array_in1,
+ void * array_in2,
+ const int nsize,
+ const enum S7_Datatype S7_datatype
+ );
+
+ /*
+ * Index sorts: the caller supplies an index array in addition to the data.
+ * NOTE(review): presumably index receives the sorted ordering while the
+ * data array is left untouched -- confirm against the implementation.
+ */
+ void S7_Index_Sort(
+ void * array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype,
+ int * index
+ );
+
+ /*
+ * Variant of S7_Index_Sort with a wider index type. An unsupported
+ * S7_datatype makes the implementation print an error and call exit(1).
+ * NOTE(review): index is declared "long *" here while the typed *_int8
+ * helpers below take "long long []"; these differ in size on LLP64
+ * platforms -- confirm which is intended.
+ */
+ void S7_Indexi8_Sort(
+ void * array_in,
+ const int nsize,
+ const enum S7_Datatype S7_datatype,
+ long * index
+ );
+
+
+ /*
+ * Typed helpers. The name suffix gives the data element type
+ * (real8 = double, int8 = long long, int4 = int); a second suffix, where
+ * present, gives the type of the index or companion array.
+ */
+ void S7_Index_sort_real8(const int n,double array_in[],int index[]);
+ void S7_Index_sort_int8(const int n,long long iarray_in[], int index[]);
+ void S7_Index_sort_int4(const int n, int iarray_in[], int index[]);
+ void S7_Index_sort_real8_int8(const int n,double array_in[],long long index[]);
+
+ void S7_Index_sort_int8_int8(const int n,long long iarray_in[], long long index[]);
+ void S7_Index_sort_int4_int8(const int n, int iarray_in[], long long index[]);
+ void S7_Sort_real8(const int n,double array_in[]);
+ void S7_Sort_int8(const int n,long long array_in[]);
+ void S7_Sort_int4(const int n,int array_in[]);
+ void S7_Sort_real8_real8(const int n,double array_in[],double array_in2[]);
+ void S7_Sort_int8_int8(const int n,long long array_in[],long long array_in2[]);
+ void S7_Sort_int4_int4(const int n,int array_in[],int array_in2[]);
+
+ /*
+ * End prototypes.
+ */
+
+ /*
+ * remove typesafe linkage if compiling under c++
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* S7_H_ */
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/state.cpp?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.cpp Mon Sep 4 07:25:36 2017
@@ -0,0 +1,3966 @@
+/*
+ * Copyright (c) 2011-2013, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#include "mesh.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+#include <algorithm>
+#include <queue>
+#include "state.h"
+#include "timer.h"
+#ifdef HAVE_MPI
+#include <mpi.h>
+#endif
+
+#undef DEBUG
+//#define DEBUG 0
+#define DEBUG_RESTORE_VALS 1
+#define TIMING_LEVEL 2
+
+// Numeric constants for the three build-time precision modes.
+// ZERO/ONE/HALF/EPSILON are used by the finite-difference helpers below;
+// EPSILON is the floor applied to a squared gradient before it is inverted,
+// so it must be a small POSITIVE number.
+#if defined(MINIMUM_PRECISION)
+#define ZERO 0.0f
+#define ONE 1.0f
+#define HALF 0.5f
+// Was "1.0f-30", which is the expression (1.0f - 30) == -29.0f, not 1e-30.
+#define EPSILON 1.0e-30f
+#define STATE_EPS 15.0
+// calc refine is done in single precision
+#define REFINE_GRADIENT 0.10f
+#define COARSEN_GRADIENT 0.05f
+#define REFINE_HALF 0.5f
+#define REFINE_NEG_THOUSAND -1000.0f
+
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+#define ZERO 0.0
+#define ONE 1.0
+#define HALF 0.5
+#define EPSILON 1.0e-30
+#define STATE_EPS .02
+// calc refine is done in single precision
+#define REFINE_GRADIENT 0.10f
+#define COARSEN_GRADIENT 0.05f
+#define REFINE_HALF 0.5f
+#define REFINE_NEG_THOUSAND -1000.0f
+
+#elif defined(FULL_PRECISION)
+#define ZERO 0.0
+#define ONE 1.0
+#define HALF 0.5
+#define EPSILON 1.0e-30
+#define STATE_EPS .02
+// calc refine uses double-precision constants in this mode
+#define REFINE_GRADIENT 0.10
+#define COARSEN_GRADIENT 0.05
+#define REFINE_HALF 0.5
+#define REFINE_NEG_THOUSAND -1000.0
+
+#endif
+
+#ifdef _OPENMP
+static bool iversion_flag = false;
+#endif
+
+typedef unsigned int uint;
+
+// Printable names for the State timers, one per timer slot.
+// NOTE(review): nine names are listed; presumably STATE_TIMER_SIZE is 9 and
+// the order matches the timer enum declared elsewhere -- confirm in state.h.
+static const char *state_timer_descriptor[STATE_TIMER_SIZE] = {
+ "state_timer_apply_BCs",
+ "state_timer_set_timestep",
+ "state_timer_finite_difference",
+ "state_timer_refine_potential",
+ "state_timer_calc_mpot",
+ "state_timer_rezone_all",
+ "state_timer_mass_sum",
+ "state_timer_read",
+ "state_timer_write"
+};
+
+#ifdef HAVE_OPENCL
+#include "state_kernel.inc"
+#endif
+
+// Running sum together with its accumulated rounding-error correction term;
+// this is the element type combined by knuth_sum().
+struct esum_type{
+ double sum;
+ double correction;
+};
+#ifdef HAVE_MPI
+// MPI datatype (two contiguous doubles) and reduction operator; both are
+// created in the State constructor once MPI has been initialized.
+MPI_Datatype MPI_TWO_DOUBLES;
+MPI_Op KNUTH_SUM;
+// Passed to MPI_Op_create as the "commute" flag for KNUTH_SUM.
+int commutative = 1;
+void knuth_sum(struct esum_type *in, struct esum_type *inout, int *len, MPI_Datatype *MPI_TWO_DOUBLES);
+#endif
+
+// Cell count before add_boundary_cells() appended the boundary cells;
+// remove_boundary_cells() shrinks the arrays back to this size.
+int save_ncells;
+
+#define CONSERVED_EQNS
+
+// Square of x. The argument is parenthesized so that compound expressions
+// such as SQR(a+b) expand to ((a+b)*(a+b)); note x is still evaluated twice.
+#define SQR(x) ( (x)*(x) )
+// Smallest of three values, built from the two-argument min() in scope.
+#define MIN3(x,y,z) ( min( min((x),(y)), (z)) )
+
+#ifdef HAVE_OPENCL
+// OpenCL kernel handles for the state computations. They are created in
+// State::init() (when GPU calculation is requested) and released in
+// State::terminate().
+cl_kernel kernel_set_timestep;
+cl_kernel kernel_reduction_min;
+cl_kernel kernel_copy_state_data;
+cl_kernel kernel_copy_state_ghost_data;
+cl_kernel kernel_apply_boundary_conditions;
+cl_kernel kernel_apply_boundary_conditions_local;
+cl_kernel kernel_apply_boundary_conditions_ghost;
+cl_kernel kernel_calc_finite_difference;
+cl_kernel kernel_refine_potential;
+cl_kernel kernel_reduce_sum_mass_stage1of2;
+cl_kernel kernel_reduce_sum_mass_stage2of2;
+cl_kernel kernel_reduce_epsum_mass_stage1of2;
+cl_kernel kernel_reduce_epsum_mass_stage2of2;
+#endif
+
+// Half-step estimate of a state variable at the face between two cells.
+// The result is the distance-weighted interpolation of U_i and U_n
+// (weights r_i and r_n) minus HALF*deltaT times the flux difference across
+// the face divided by a volume term. Each flux is scaled by its face area
+// times min(ONE, area ratio) and each volume by min(HALF, volume ratio).
+// No guard is applied to the divisions: r_i + r_n, the areas and the
+// volumes must all be non-zero.
+inline real_t U_halfstep(// XXX Fix the subindices to be more intuitive XXX
+ real_t deltaT, // Timestep
+ real_t U_i, // Initial cell's (downwind's) state variable
+ real_t U_n, // Next cell's (upwind's) state variable
+ real_t F_i, // Initial cell's (downwind's) state variable flux
+ real_t F_n, // Next cell's (upwind's) state variable flux
+ real_t r_i, // Initial cell's (downwind's) center to face distance
+ real_t r_n, // Next cell's (upwind's) center to face distance
+ real_t A_i, // Cell's face surface area
+ real_t A_n, // Cell's neighbor's face surface area
+ real_t V_i, // Cell's volume
+ real_t V_n) { // Cell's neighbor's volume
+
+ return (( r_i*U_n + r_n*U_i ) / ( r_i + r_n ))
+ - HALF*deltaT*(( F_n*A_n*min(ONE, A_i/A_n) - F_i*A_i*min(ONE, A_n/A_i) )
+ / ( V_n*min(HALF, V_i/V_n) + V_i*min(HALF, V_n/V_i) ));
+
+}
+
+// Full-step update of a state variable: subtracts (deltaT/dr) times the net
+// flux (F_plus - F_minus + G_plus - G_minus) from the current value U.
+// dr must be non-zero.
+// NOTE(review): F and G are presumably the x- and y-direction fluxes and
+// plus/minus the two opposite faces -- confirm at the call sites.
+inline real_t U_fullstep(
+ real_t deltaT,
+ real_t dr,
+ real_t U,
+ real_t F_plus,
+ real_t F_minus,
+ real_t G_plus,
+ real_t G_minus) {
+
+ return (U - (deltaT / dr)*(F_plus - F_minus + G_plus - G_minus));
+
+}
+
+
+// Correction weight computed from a Courant-like number and gradient ratios.
+// nu = HALF*U_eigen*deltaT/dr is turned into nu*(1-nu); the neighbouring
+// gradients are compared with the centered one through the ratios rplus and
+// rminus, and the returned weight is HALF*nu*(1 - clamp) where clamp is
+// min(ONE, rplus, rminus) limited from below by ZERO.
+inline real_t w_corrector(
+ real_t deltaT, // Timestep
+ real_t dr, // Cell's center to face distance
+ real_t U_eigen, // State variable's eigenvalue (speed)
+ real_t grad_half, // Centered gradient
+ real_t grad_minus, // Downwind gradient
+ real_t grad_plus) { // Upwind gradient
+
+ real_t nu = HALF * U_eigen * deltaT / dr;
+ nu = nu * (ONE - nu);
+
+ // The max() with EPSILON is meant to keep the denominator away from zero
+ // when the centered gradient vanishes.
+ real_t rdenom = ONE / max(SQR(grad_half), EPSILON);
+ real_t rplus = (grad_plus * grad_half) * rdenom;
+ real_t rminus = (grad_minus * grad_half) * rdenom;
+
+ return HALF*nu*(ONE- max(MIN3(ONE, rplus, rminus), ZERO));
+}
+
+// Construct the state for the given mesh: clear all timer accumulators,
+// remember the mesh and, when MPI is already initialized, register the
+// two-double datatype and the KNUTH_SUM reduction operator.
+State::State(Mesh *mesh_in)
+{
+   // Zero the CPU and GPU timer accumulators in a single pass.
+   for (int timer = 0; timer < STATE_TIMER_SIZE; ++timer) {
+      cpu_timers[timer] = 0.0;
+      gpu_timers[timer] = 0L;
+   }
+
+   mesh = mesh_in;
+
+#ifdef HAVE_MPI
+   int mpi_is_up;
+   MPI_Initialized(&mpi_is_up);
+   if (mpi_is_up) {
+      MPI_Type_contiguous(2, MPI_DOUBLE, &MPI_TWO_DOUBLES);
+      MPI_Type_commit(&MPI_TWO_DOUBLES);
+      MPI_Op_create((MPI_User_function *)knuth_sum, commutative, &KNUTH_SUM);
+      // FIXME add fini and set size
+      if (mesh->parallel) {
+         state_memory.pinit(MPI_COMM_WORLD, 2L * 1024 * 1024 * 1024);
+      }
+   }
+#endif
+}
+
+// Prepare the state for computation. When do_gpu_calc is non-zero and the
+// build has OpenCL, compile the state program and create every kernel
+// handle; in all cases allocate the H, U and V arrays for mesh->ncells cells.
+void State::init(int do_gpu_calc)
+{
+   if (do_gpu_calc) {
+#ifdef HAVE_OPENCL
+      cl_context cl_ctx = ezcl_get_context();
+
+      if (mesh->mype == 0) {
+         printf("Starting compile of kernels in state\n");
+      }
+      const char *no_defines = NULL;
+      cl_program state_program = ezcl_create_program_wsource(cl_ctx, no_defines, state_kern_source);
+
+      kernel_set_timestep                    = ezcl_create_kernel_wprogram(state_program, "set_timestep_cl");
+      kernel_reduction_min                   = ezcl_create_kernel_wprogram(state_program, "finish_reduction_min_cl");
+      kernel_copy_state_data                 = ezcl_create_kernel_wprogram(state_program, "copy_state_data_cl");
+      kernel_copy_state_ghost_data           = ezcl_create_kernel_wprogram(state_program, "copy_state_ghost_data_cl");
+      kernel_apply_boundary_conditions       = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_cl");
+      kernel_apply_boundary_conditions_local = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_local_cl");
+      kernel_apply_boundary_conditions_ghost = ezcl_create_kernel_wprogram(state_program, "apply_boundary_conditions_ghost_cl");
+      kernel_calc_finite_difference          = ezcl_create_kernel_wprogram(state_program, "calc_finite_difference_cl");
+      kernel_refine_potential                = ezcl_create_kernel_wprogram(state_program, "refine_potential_cl");
+      kernel_reduce_sum_mass_stage1of2       = ezcl_create_kernel_wprogram(state_program, "reduce_sum_mass_stage1of2_cl");
+      kernel_reduce_sum_mass_stage2of2       = ezcl_create_kernel_wprogram(state_program, "reduce_sum_mass_stage2of2_cl");
+      kernel_reduce_epsum_mass_stage1of2     = ezcl_create_kernel_wprogram(state_program, "reduce_epsum_mass_stage1of2_cl");
+      kernel_reduce_epsum_mass_stage2of2     = ezcl_create_kernel_wprogram(state_program, "reduce_epsum_mass_stage2of2_cl");
+
+      // The kernels keep what they need; the program object can go.
+      ezcl_program_release(state_program);
+      if (mesh->mype == 0) {
+         printf("Finishing compile of kernels in state\n");
+      }
+#endif
+   }
+
+   // Host-side state arrays are needed for CPU and GPU runs alike.
+   allocate(mesh->ncells);
+}
+
+// Allocate the three state arrays (H, U, V) with ncells entries each through
+// the state memory manager. The arrays are tagged RESTART_DATA, or
+// LOAD_BALANCE_MEMORY when built with J7 and the mesh is parallel.
+void State::allocate(size_t ncells)
+{
+   int flags = RESTART_DATA;
+#ifdef HAVE_J7
+   if (mesh->parallel) {
+      flags = LOAD_BALANCE_MEMORY;
+   }
+#endif
+
+   H = (state_t *)state_memory.memory_malloc(ncells, sizeof(state_t), "H", flags);
+   U = (state_t *)state_memory.memory_malloc(ncells, sizeof(state_t), "U", flags);
+   V = (state_t *)state_memory.memory_malloc(ncells, sizeof(state_t), "V", flags);
+}
+
+// Grow every array owned by the state memory manager to new_ncells entries
+// when that exceeds the current size of H; never shrinks.
+void State::resize(size_t new_ncells){
+   const size_t allocated = state_memory.get_memory_size(H);
+   if (allocated < new_ncells) {
+      state_memory.memory_realloc_all(new_ncells);
+   }
+}
+
+// Re-fetch the H, U and V pointers from the state memory manager by name,
+// so the members follow the buffers if the manager has moved them.
+void State::memory_reset_ptrs(void){
+ H = (state_t *)state_memory.get_memory_ptr("H");
+ U = (state_t *)state_memory.get_memory_ptr("U");
+ V = (state_t *)state_memory.get_memory_ptr("V");
+
+ //printf("\nDEBUG -- Calling state memory reset_ptrs at line %d\n",__LINE__);
+ //state_memory.memory_report();
+ //printf("DEBUG -- Finished state memory reset_ptrs at line %d\n\n",__LINE__);
+}
+
+// Release everything the state owns: the host H/U/V arrays, and in OpenCL
+// builds the device buffers and all kernel handles; in MPI builds with a
+// parallel mesh the state memory manager's parallel part is finalized too.
+void State::terminate(void)
+{
+ state_memory.memory_delete(H);
+ state_memory.memory_delete(U);
+ state_memory.memory_delete(V);
+
+#ifdef HAVE_OPENCL
+ ezcl_device_memory_delete(dev_deltaT);
+
+ gpu_state_memory.memory_delete(dev_H);
+ gpu_state_memory.memory_delete(dev_U);
+ gpu_state_memory.memory_delete(dev_V);
+
+ // NOTE(review): the kernels are released unconditionally, whereas
+ // State::init() creates them only when do_gpu_calc is non-zero --
+ // confirm that releasing never-created kernels is harmless.
+ ezcl_kernel_release(kernel_set_timestep);
+ ezcl_kernel_release(kernel_reduction_min);
+ ezcl_kernel_release(kernel_copy_state_data);
+ ezcl_kernel_release(kernel_copy_state_ghost_data);
+ ezcl_kernel_release(kernel_apply_boundary_conditions);
+ ezcl_kernel_release(kernel_apply_boundary_conditions_local);
+ ezcl_kernel_release(kernel_apply_boundary_conditions_ghost);
+ ezcl_kernel_release(kernel_calc_finite_difference);
+ ezcl_kernel_release(kernel_refine_potential);
+ ezcl_kernel_release(kernel_reduce_sum_mass_stage1of2);
+ ezcl_kernel_release(kernel_reduce_sum_mass_stage2of2);
+ ezcl_kernel_release(kernel_reduce_epsum_mass_stage1of2);
+ ezcl_kernel_release(kernel_reduce_epsum_mass_stage2of2);
+#endif
+#ifdef HAVE_MPI
+ // Counterpart of the pinit() call made in the constructor.
+ if (mesh->parallel) state_memory.pfini();
+#endif
+}
+
+#ifdef HAVE_MPI
+// MPI user-defined reduction combining (sum, correction) pairs with an
+// error-compensated two-sum, so the rounding error of each addition is
+// carried along in the correction term.
+//
+//   in              - incoming operands (read only)
+//   inout           - local operands on entry, combined result on exit
+//   len             - number of elements in both arrays
+//   MPI_TWO_DOUBLES - datatype handle supplied by MPI (unused)
+//
+// An MPI user function has to combine all *len elements; the previous
+// version only ever touched element 0.
+void knuth_sum(struct esum_type *in, struct esum_type *inout, int *len, MPI_Datatype *MPI_TWO_DOUBLES)
+{
+   // The datatype handle is not needed; silence unused-parameter warnings.
+   (void)MPI_TWO_DOUBLES;
+
+   for (int k = 0; k < *len; k++) {
+      double u, v, upt, up, vpp;
+      u = inout[k].sum;
+      // Fold both correction terms into the incoming addend first.
+      v = in[k].sum + (in[k].correction+inout[k].correction);
+      upt = u + v;
+      // Recover the parts of u and v that actually made it into upt ...
+      up = upt - v;
+      vpp = upt - up;
+      inout[k].sum = upt;
+      // ... and keep what was lost to rounding as the new correction.
+      inout[k].correction = (u - up) + (v - vpp);
+   }
+}
+#endif
+
+// Append one boundary cell for every face of a cell that lies on the edge of
+// the mesh (per refinement level), growing the state and mesh arrays to fit.
+// Each new cell copies its interior neighbour's geometry and H, and gets the
+// velocity component normal to the boundary negated. The original cell count
+// is stored in save_ncells and mesh->ncells is updated to the new count.
+// The elapsed time is added to the STATE_TIMER_APPLY_BCS CPU timer.
+void State::add_boundary_cells(void)
+{
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ // This is for a mesh with no boundary cells -- they are added and
+ // the mesh sizes increased
+ size_t &ncells = mesh->ncells;
+ vector<int> &index = mesh->index;
+ vector<spatial_t> &x = mesh->x;
+ vector<spatial_t> &dx = mesh->dx;
+ vector<spatial_t> &y = mesh->y;
+ vector<spatial_t> &dy = mesh->dy;
+
+ // Local copies of the mesh array pointers; refreshed after the reallocs below
+ int *i = mesh->i;
+ int *j = mesh->j;
+ int *level = mesh->level;
+ int *celltype = mesh->celltype;
+ int *nlft = mesh->nlft;
+ int *nrht = mesh->nrht;
+ int *nbot = mesh->nbot;
+ int *ntop = mesh->ntop;
+
+ vector<int> &lev_ibegin = mesh->lev_ibegin;
+ vector<int> &lev_iend = mesh->lev_iend;
+ vector<int> &lev_jbegin = mesh->lev_jbegin;
+ vector<int> &lev_jend = mesh->lev_jend;
+
+ // Pre-count number of cells to add
+ // (a corner cell is counted once for each boundary it touches)
+ int icount = 0;
+ for (uint ic=0; ic<ncells; ic++) {
+ if (i[ic] == lev_ibegin[level[ic]]) icount++; // Left boundary
+ if (i[ic] == lev_iend[level[ic]]) icount++; // Right boundary
+ if (j[ic] == lev_jbegin[level[ic]]) icount++; // Bottom boundary
+ if (j[ic] == lev_jend[level[ic]]) icount++; // Top boundary
+ }
+
+ int new_ncells = ncells + icount;
+ // Increase the arrays for the new boundary cells
+ H=(state_t *)state_memory.memory_realloc(new_ncells, H);
+ U=(state_t *)state_memory.memory_realloc(new_ncells, U);
+ V=(state_t *)state_memory.memory_realloc(new_ncells, V);
+ //printf("\nDEBUG add_boundary cells\n");
+ //state_memory.memory_report();
+ //printf("DEBUG end add_boundary cells\n\n");
+
+ mesh->i =(int *)mesh->mesh_memory.memory_realloc(new_ncells, i);
+ mesh->j =(int *)mesh->mesh_memory.memory_realloc(new_ncells, j);
+ mesh->level =(int *)mesh->mesh_memory.memory_realloc(new_ncells, level);
+ mesh->celltype =(int *)mesh->mesh_memory.memory_realloc(new_ncells, celltype);
+ mesh->nlft =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nlft);
+ mesh->nrht =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nrht);
+ mesh->nbot =(int *)mesh->mesh_memory.memory_realloc(new_ncells, nbot);
+ mesh->ntop =(int *)mesh->mesh_memory.memory_realloc(new_ncells, ntop);
+ //memory_reset_ptrs();
+ // The reallocs may have moved the arrays; pick up the new pointers
+ i = mesh->i;
+ j = mesh->j;
+ level = mesh->level;
+ celltype = mesh->celltype;
+ nlft = mesh->nlft;
+ nrht = mesh->nrht;
+ nbot = mesh->nbot;
+ ntop = mesh->ntop;
+
+ index.resize(new_ncells);
+ x.resize(new_ncells);
+ dx.resize(new_ncells);
+ y.resize(new_ncells);
+ dy.resize(new_ncells);
+
+ // Mark every neighbor of the new cells as "not set yet"
+ for (int nc=ncells; nc<new_ncells; nc++) {
+ nlft[nc] = -1;
+ nrht[nc] = -1;
+ nbot[nc] = -1;
+ ntop[nc] = -1;
+ }
+
+ // In the first pass, set two of the neighbor indices and all
+ // the other data to be brought across. Set the inverse of the
+ // the velocity to enforce the reflective boundary condition
+ // nc is the next free slot; it advances once per boundary cell created
+ uint nc=ncells;
+ for (uint ic=0; ic<ncells; ic++) {
+ // Left boundary: new cell one column to the left, U negated
+ if (i[ic] == lev_ibegin[level[ic]]) {
+ nlft[ic] = nc;
+ nlft[nc] = nc;
+ nrht[nc] = ic;
+ i[nc] = lev_ibegin[level[ic]]-1;
+ j[nc] = j[ic];
+ level[nc] = level[ic];
+ dx[nc] = dx[ic];
+ dy[nc] = dy[ic];
+ x[nc] = x[ic]-dx[ic];
+ y[nc] = y[ic];
+ H[nc] = H[ic];
+ U[nc] = -U[ic];
+ V[nc] = V[ic];
+ nc++;
+ }
+ // Right boundary: new cell one column to the right, U negated
+ if (i[ic] == lev_iend[level[ic]]) {
+ nrht[ic] = nc;
+ nrht[nc] = nc;
+ nlft[nc] = ic;
+ i[nc] = lev_iend[level[ic]]+1;
+ j[nc] = j[ic];
+ level[nc] = level[ic];
+ dx[nc] = dx[ic];
+ dy[nc] = dy[ic];
+ x[nc] = x[ic]+dx[ic];
+ y[nc] = y[ic];
+ H[nc] = H[ic];
+ U[nc] = -U[ic];
+ V[nc] = V[ic];
+ nc++;
+ }
+ // Bottom boundary: new cell one row below, V negated
+ if (j[ic] == lev_jbegin[level[ic]]) {
+ nbot[ic] = nc;
+ nbot[nc] = nc;
+ ntop[nc] = ic;
+ i[nc] = i[ic];
+ j[nc] = lev_jbegin[level[ic]]-1;
+ level[nc] = level[ic];
+ dx[nc] = dx[ic];
+ dy[nc] = dy[ic];
+ x[nc] = x[ic];
+ y[nc] = y[ic]-dy[ic];
+ H[nc] = H[ic];
+ U[nc] = U[ic];
+ V[nc] = -V[ic];
+ nc++;
+ }
+ // Top boundary: new cell one row above, V negated
+ if (j[ic] == lev_jend[level[ic]]) {
+ ntop[ic] = nc;
+ ntop[nc] = nc;
+ nbot[nc] = ic;
+ i[nc] = i[ic];
+ j[nc] = lev_jend[level[ic]]+1;
+ level[nc] = level[ic];
+ dx[nc] = dx[ic];
+ dy[nc] = dy[ic];
+ x[nc] = x[ic];
+ y[nc] = y[ic]+dy[ic];
+ H[nc] = H[ic];
+ U[nc] = U[ic];
+ V[nc] = -V[ic];
+ nc++;
+ }
+ }
+
+ // Now set the other two neighbor indices
+ // (found by walking through the interior neighbor set in the first pass;
+ // a boundary cell at the end of its row/column points to itself)
+ for (int nc=ncells; nc<new_ncells; nc++) {
+ if (i[nc] == lev_ibegin[level[nc]]-1) {
+ // Need to check if also a bottom boundary cell
+ if (j[nc] == lev_jbegin[level[nc]]){
+ nbot[nc] = nc;
+ } else {
+ nbot[nc] = nlft[nbot[nrht[nc]]];
+ }
+ if (j[nc] == lev_jend[level[nc]]){
+ ntop[nc] = nc;
+ } else {
+ ntop[nc] = nlft[ntop[nrht[nc]]];
+ }
+ }
+ if (i[nc] == lev_iend[level[nc]]+1) {
+ if (level[nc] <= level[nbot[nlft[nc]]]){
+ if (j[nc] == lev_jbegin[level[nc]]){
+ nbot[nc] = nc;
+ } else {
+ nbot[nc] = nrht[nbot[nlft[nc]]];
+ }
+ if (j[nc] == lev_jend[level[nc]]){
+ ntop[nc] = nc;
+ } else {
+ ntop[nc] = nrht[ntop[nlft[nc]]];
+ }
+ // calculation is a little different if going through a
+ // finer zoned region
+ } else {
+ nbot[nc] = nrht[nrht[nbot[nlft[nc]]]];
+ ntop[nc] = nrht[nrht[ntop[nlft[nc]]]];
+ }
+ }
+ if (j[nc] == lev_jbegin[level[nc]]-1) {
+ if (i[nc] == lev_ibegin[level[nc]]){
+ nlft[nc] = nc;
+ } else {
+ nlft[nc] = nbot[nlft[ntop[nc]]];
+ }
+ if (i[nc] == lev_iend[level[nc]]){
+ nrht[nc] = nc;
+ } else {
+ nrht[nc] = nbot[nrht[ntop[nc]]];
+ }
+ }
+ if (j[nc] == lev_jend[level[nc]]+1) {
+ if (level[nc] <= level[nlft[nbot[nc]]]){
+ if (i[nc] == lev_ibegin[level[nc]]){
+ nlft[nc] = nc;
+ } else {
+ nlft[nc] = ntop[nlft[nbot[nc]]];
+ }
+ if (i[nc] == lev_iend[level[nc]]){
+ nrht[nc] = nc;
+ } else {
+ nrht[nc] = ntop[nrht[nbot[nc]]];
+ }
+ } else {
+ nlft[nc] = ntop[ntop[nlft[nbot[nc]]]];
+ nrht[nc] = ntop[ntop[nrht[nbot[nc]]]];
+ }
+ }
+ }
+ // Remember the original size so remove_boundary_cells() can undo this
+ save_ncells = ncells;
+ ncells = new_ncells;
+
+ cpu_timers[STATE_TIMER_APPLY_BCS] += cpu_timer_stop(tstart_cpu);
+}
+
+// Reflective boundary update restricted to LOCAL neighbours: for every
+// boundary cell in this caller's [lowerBound, upperBound) range, copy H from
+// the adjacent cell and mirror the velocity component normal to the boundary,
+// but only when that neighbour's index is below mesh->ncells.
+// apply_boundary_conditions_ghost() handles the remaining neighbours.
+void State::apply_boundary_conditions_local(void)
+{
+   // Plain automatic pointers, as in apply_boundary_conditions(). These used
+   // to be function-local statics, i.e. shared storage that every caller
+   // rewrote on entry -- an unsynchronized write if several threads call this.
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   size_t &ncells = mesh->ncells;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         if (nr < (int)ncells) {
+            H[ic] =  H[nr];
+            U[ic] = -U[nr];
+            V[ic] =  V[nr];
+         }
+      }
+      if (mesh->is_right_boundary(ic)) {
+         int nl = nlft[ic];
+         if (nl < (int)ncells) {
+            H[ic] =  H[nl];
+            U[ic] = -U[nl];
+            V[ic] =  V[nl];
+         }
+      }
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         if (nt < (int)ncells) {
+            H[ic] =  H[nt];
+            U[ic] =  U[nt];
+            V[ic] = -V[nt];
+         }
+      }
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         if (nb < (int)ncells) {
+            H[ic] =  H[nb];
+            U[ic] =  U[nb];
+            V[ic] = -V[nb];
+         }
+      }
+   }
+}
+
+// Reflective boundary update restricted to GHOST neighbours: for every
+// boundary cell in this caller's [lowerBound, upperBound) range, copy H from
+// the adjacent cell and mirror the velocity component normal to the boundary,
+// but only when that neighbour's index is at or beyond mesh->ncells.
+// apply_boundary_conditions_local() handles the remaining neighbours.
+void State::apply_boundary_conditions_ghost(void)
+{
+   // Plain automatic pointers, as in apply_boundary_conditions(). These used
+   // to be function-local statics, i.e. shared storage that every caller
+   // rewrote on entry -- an unsynchronized write if several threads call this.
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   size_t &ncells = mesh->ncells;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         if (nr >= (int)ncells) {
+            H[ic] =  H[nr];
+            U[ic] = -U[nr];
+            V[ic] =  V[nr];
+         }
+      }
+      if (mesh->is_right_boundary(ic)) {
+         int nl = nlft[ic];
+         if (nl >= (int)ncells) {
+            H[ic] =  H[nl];
+            U[ic] = -U[nl];
+            V[ic] =  V[nl];
+         }
+      }
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         if (nt >= (int)ncells) {
+            H[ic] =  H[nt];
+            U[ic] =  U[nt];
+            V[ic] = -V[nt];
+         }
+      }
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         if (nb >= (int)ncells) {
+            H[ic] =  H[nb];
+            U[ic] =  U[nb];
+            V[ic] = -V[nb];
+         }
+      }
+   }
+}
+
+// Reflective boundary update for every boundary cell in this caller's
+// [lowerBound, upperBound) range: H is copied from the adjacent cell and the
+// velocity component normal to the boundary is mirrored (U on left/right
+// boundaries, V on bottom/top boundaries). No test is made on where the
+// neighbour lives, unlike the _local and _ghost variants.
+void State::apply_boundary_conditions(void)
+{
+   int *nlft = mesh->nlft;
+   int *nrht = mesh->nrht;
+   int *nbot = mesh->nbot;
+   int *ntop = mesh->ntop;
+
+   // This is for a mesh with boundary cells
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+   for (uint ic=lowerBound; ic<upperBound; ic++) {
+      if (mesh->is_left_boundary(ic)) {
+         int nr = nrht[ic];
+         H[ic] =  H[nr];
+         U[ic] = -U[nr];
+         V[ic] =  V[nr];
+      }
+      if (mesh->is_right_boundary(ic)) {
+         int nl = nlft[ic];
+         H[ic] =  H[nl];
+         U[ic] = -U[nl];
+         V[ic] =  V[nl];
+      }
+      if (mesh->is_bottom_boundary(ic)) {
+         int nt = ntop[ic];
+         H[ic] =  H[nt];
+         U[ic] =  U[nt];
+         V[ic] = -V[nt];
+      }
+      if (mesh->is_top_boundary(ic)) {
+         int nb = nbot[ic];
+         H[ic] =  H[nb];
+         U[ic] =  U[nb];
+         V[ic] = -V[nb];
+      }
+   }
+}
+
+// Undo add_boundary_cells(): when the mesh has no permanent boundary cells
+// (mesh->have_boundary is false), shrink the state and mesh arrays back to
+// save_ncells entries and make every cell on a level boundary its own
+// neighbour in that direction. Does nothing when have_boundary is set.
+void State::remove_boundary_cells(void)
+{
+ if(! mesh->have_boundary) {
+
+ // Under OpenMP only the master thread resizes the shared arrays; the
+ // barriers keep the other threads out until the resize is complete.
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ size_t &ncells = mesh->ncells;
+
+ // Resize to drop all the boundary cells
+ ncells = save_ncells;
+ H=(state_t *)state_memory.memory_realloc(save_ncells, H);
+ U=(state_t *)state_memory.memory_realloc(save_ncells, U);
+ V=(state_t *)state_memory.memory_realloc(save_ncells, V);
+ //printf("\nDEBUG remove_boundary cells\n");
+ //state_memory.memory_report();
+ //printf("DEBUG end remove_boundary cells\n\n");
+
+ mesh->i = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->i);
+ mesh->j = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->j);
+ mesh->level = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->level);
+ mesh->celltype = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->celltype);
+ mesh->nlft = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nlft);
+ mesh->nrht = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nrht);
+ mesh->nbot = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->nbot);
+ mesh->ntop = (int *)mesh->mesh_memory.memory_realloc(save_ncells, mesh->ntop);
+
+ // Shrink the per-cell mesh vectors as well
+ mesh->index.resize(save_ncells);
+ mesh->x.resize(save_ncells);
+ mesh->dx.resize(save_ncells);
+ mesh->y.resize(save_ncells);
+ mesh->dy.resize(save_ncells);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ // Recompute the loop bounds for the reduced cell count
+ mesh->set_bounds(mesh->ncells);
+
+ // Reset the neighbors due to the dropped boundary cells: a cell on a
+ // level boundary becomes its own neighbor on that side
+ int lowerBound, upperBound;
+ mesh->get_bounds(lowerBound, upperBound);
+ for (uint ic=lowerBound; ic<upperBound; ic++) {
+ if (mesh->i[ic] == mesh->lev_ibegin[mesh->level[ic]]) mesh->nlft[ic] = ic;
+ if (mesh->i[ic] == mesh->lev_iend[mesh->level[ic]]) mesh->nrht[ic] = ic;
+ if (mesh->j[ic] == mesh->lev_jbegin[mesh->level[ic]]) mesh->nbot[ic] = ic;
+ if (mesh->j[ic] == mesh->lev_jend[mesh->level[ic]]) mesh->ntop[ic] = ic;
+ }
+
+ } // if !have_boundary
+}
+
+// Compute the largest stable timestep over all real cells.
+//
+//   g     - gravitational constant used for the wave speed sqrt(g*H)
+//   sigma - safety factor multiplying the per-cell limit
+//
+// For each REAL_CELL in this caller's bounds the limit is
+// sigma / (xspeed + yspeed), where each speed is (|velocity| + wavespeed)
+// divided by the cell size of its level. The minimum is reduced over OpenMP
+// threads and, for a parallel mesh, over all MPI ranks. Returns that global
+// minimum; the value is capped at 1000, the initial value of the reduction.
+double State::set_timestep(double g, double sigma)
+{
+   // Both reduction results live in shared (static) storage. The global
+   // minimum is written only by the master thread below; as an automatic
+   // variable every other thread would return an uninitialized value.
+   static double globalmindeltaT;
+   static double mindeltaT;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   int lowerBounds, upperBounds;
+   mesh->set_bounds(mesh->ncells);
+   mesh->get_bounds(lowerBounds, upperBounds);
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+      mindeltaT = 1000;
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   double mymindeltaT = 1000.0; // private for each thread
+
+   for (int ic=lowerBounds; ic<upperBounds; ic++) {
+      if (mesh->celltype[ic] == REAL_CELL) {
+         int lev = mesh->level[ic];
+         double wavespeed = sqrt(g*H[ic]);
+         double xspeed = (fabs(U[ic])+wavespeed)/mesh->lev_deltax[lev];
+         double yspeed = (fabs(V[ic])+wavespeed)/mesh->lev_deltay[lev];
+         double deltaT=sigma/(xspeed+yspeed);
+         if (deltaT < mymindeltaT) mymindeltaT = deltaT;
+      }
+   }
+
+   // Fold the per-thread minimum into the shared one
+#ifdef _OPENMP
+#pragma omp critical
+   {
+#endif
+      if (mymindeltaT < mindeltaT) mindeltaT = mymindeltaT;
+#ifdef _OPENMP
+   } // End critical region
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+
+      globalmindeltaT = mindeltaT;
+#ifdef HAVE_MPI
+      if (mesh->parallel) MPI_Allreduce(&mindeltaT, &globalmindeltaT, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+      cpu_timers[STATE_TIMER_SET_TIMESTEP] += cpu_timer_stop(tstart_cpu);
+#ifdef _OPENMP
+   } // End master region
+#pragma omp barrier
+#endif
+
+   // After the barrier every thread sees the master's result
+   return(globalmindeltaT);
+}
+
+#ifdef HAVE_OPENCL
+double State::gpu_set_timestep(double sigma)
+{
+ double deltaT, globalmindeltaT;
+
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ cl_command_queue command_queue = ezcl_get_command_queue();
+
+ size_t &ncells = mesh->ncells;
+#ifdef HAVE_MPI
+ int &parallel = mesh->parallel;
+#endif
+ cl_mem &dev_level = mesh->dev_level;
+ cl_mem &dev_celltype = mesh->dev_celltype;
+ cl_mem &dev_levdx = mesh->dev_levdx;
+ cl_mem &dev_levdy = mesh->dev_levdy;
+
+ assert(dev_H);
+ assert(dev_U);
+ assert(dev_V);
+ assert(dev_level);
+ assert(dev_celltype);
+ assert(dev_levdx);
+ assert(dev_levdy);
+
+ size_t local_work_size = 128;
+ size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+ size_t block_size = global_work_size/local_work_size;
+
+ cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+ /*
+ __kernel void set_timestep_cl(
+ const int ncells, // 0 Total number of cells.
+ const real_t sigma, // 1
+ __global const state_t *H, // 2
+ __global const state_t *U, // 3
+ __global const state_t *V, // 4
+ __global const int *level, // 5 Array of level information.
+ __global const int *celltype, // 6
+ __global const real_t *lev_dx, // 7
+ __global const real_t *lev_dy, // 8
+ __global real_t *redscratch, // 9
+ __global real_t *deltaT, // 10
+ __local real_t *tile) // 11
+ */
+
+ real_t sigma_local = sigma;
+ ezcl_set_kernel_arg(kernel_set_timestep, 0, sizeof(cl_int), (void *)&ncells);
+ ezcl_set_kernel_arg(kernel_set_timestep, 1, sizeof(cl_real_t), (void *)&sigma_local);
+ ezcl_set_kernel_arg(kernel_set_timestep, 2, sizeof(cl_mem), (void *)&dev_H);
+ ezcl_set_kernel_arg(kernel_set_timestep, 3, sizeof(cl_mem), (void *)&dev_U);
+ ezcl_set_kernel_arg(kernel_set_timestep, 4, sizeof(cl_mem), (void *)&dev_V);
+ ezcl_set_kernel_arg(kernel_set_timestep, 5, sizeof(cl_mem), (void *)&dev_level);
+ ezcl_set_kernel_arg(kernel_set_timestep, 6, sizeof(cl_mem), (void *)&dev_celltype);
+ ezcl_set_kernel_arg(kernel_set_timestep, 7, sizeof(cl_mem), (void *)&dev_levdx);
+ ezcl_set_kernel_arg(kernel_set_timestep, 8, sizeof(cl_mem), (void *)&dev_levdy);
+ ezcl_set_kernel_arg(kernel_set_timestep, 9, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_set_timestep, 10, sizeof(cl_mem), (void *)&dev_deltaT);
+ ezcl_set_kernel_arg(kernel_set_timestep, 11, local_work_size*sizeof(cl_real_t), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_set_timestep, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+ if (block_size > 1){
+ /*
+ __kernel void finish_reduction_min_cl(
+ const int isize,
+ __global real_t *redscratch,
+ __global real_t *deltaT,
+ __local real_t *tile)
+ */
+ ezcl_set_kernel_arg(kernel_reduction_min, 0, sizeof(cl_int), (void *)&block_size);
+ ezcl_set_kernel_arg(kernel_reduction_min, 1, sizeof(cl_mem), (void *)&dev_redscratch);
+ ezcl_set_kernel_arg(kernel_reduction_min, 2, sizeof(cl_mem), (void *)&dev_deltaT);
+ ezcl_set_kernel_arg(kernel_reduction_min, 3, local_work_size*sizeof(cl_real_t), NULL);
+
+ ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduction_min, 1, NULL, &local_work_size, &local_work_size, NULL);
+ }
+
+ real_t deltaT_local;
+ ezcl_enqueue_read_buffer(command_queue, dev_deltaT, CL_TRUE, 0, sizeof(cl_real_t), &deltaT_local, NULL);
+ deltaT = deltaT_local;
+
+ globalmindeltaT = deltaT;
+#ifdef HAVE_MPI
+ if (parallel) MPI_Allreduce(&deltaT, &globalmindeltaT, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
+#endif
+
+ ezcl_device_memory_delete(dev_redscratch);
+
+ gpu_timers[STATE_TIMER_SET_TIMESTEP] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+ return(globalmindeltaT);
+}
+#endif
+
+/**
+ * Initialize the state arrays with a circular region of raised height.
+ *
+ * Every cell is first set to the background height with zero U and V. Cells
+ * reported by the k-D tree interior query receive fill_value, and cells
+ * reported by the weighted intersection query receive a blend of background
+ * and fill_value according to the returned weight.
+ *
+ * @param circ_radius  Radius of circle in grid units.
+ * @param fill_value   Circle height for shallow water.
+ * @param background   Background height for shallow water.
+ */
+void State::fill_circle(double circ_radius,
+                        double fill_value,
+                        double background)
+{
+   size_t &ncells = mesh->ncells;
+   vector<spatial_t> &x  = mesh->x;
+   vector<spatial_t> &dx = mesh->dx;
+   vector<spatial_t> &y  = mesh->y;
+   vector<spatial_t> &dy = mesh->dy;
+
+   // Flat background everywhere, with zero U and V.
+   for (uint cell = 0; cell < ncells; cell++) {
+      H[cell] = background;
+      V[cell] = 0.0;
+      U[cell] = V[cell];
+   }
+
+   // Build a fresh k-D tree for the queries below (slow but necessary here).
+   mesh->kdtree_setup();
+
+   int nfound;
+   vector<int>    found(ncells);
+   vector<double> fraction(ncells);
+
+   // Pass 1: cells returned by the interior query take the full fill value.
+#ifdef FULL_PRECISION
+   KDTree_QueryCircleInterior_Double(&mesh->tree, &nfound, &(found[0]), circ_radius, ncells,
+                                     &x[0], &dx[0],
+                                     &y[0], &dy[0]);
+#else
+   KDTree_QueryCircleInterior_Float(&mesh->tree, &nfound, &(found[0]), circ_radius, ncells,
+                                    &x[0], &dx[0],
+                                    &y[0], &dy[0]);
+#endif
+   for (int k = 0; k < nfound; ++k) {
+      H[found[k]] = fill_value;
+   }
+
+   // Pass 2: cells returned by the weighted query are blended by their weight.
+#ifdef FULL_PRECISION
+   KDTree_QueryCircleIntersectWeighted_Double(&mesh->tree, &nfound, &(found[0]), &(fraction[0]),
+                                              circ_radius, ncells,
+                                              &x[0], &dx[0],
+                                              &y[0], &dy[0]);
+#else
+   KDTree_QueryCircleIntersectWeighted_Float(&mesh->tree, &nfound, &(found[0]), &(fraction[0]),
+                                             circ_radius, ncells,
+                                             &x[0], &dx[0],
+                                             &y[0], &dy[0]);
+#endif
+
+   for (int k = 0; k < nfound; ++k) {
+      H[found[k]] = background + (fill_value - background) * fraction[k];
+   }
+
+   KDTree_Destroy(&mesh->tree);
+}
+
+/**
+ * Reorder the H, U and V arrays through state_memory.memory_reorder() and
+ * store the pointers it returns.
+ *
+ * @param iorder  Index array passed to each memory_reorder() call.
+ */
+void State::state_reorder(vector<int> iorder)
+{
+   int *order = &iorder[0];
+
+   H = state_memory.memory_reorder(H, order);
+   U = state_memory.memory_reorder(U, order);
+   V = state_memory.memory_reorder(V, order);
+}
+
+/**
+ * Rezone the mesh together with the state memory, then refresh the state
+ * pointers and charge the elapsed time to STATE_TIMER_REZONE_ALL.
+ *
+ * @param icount  Forwarded unchanged to mesh->rezone_all().
+ * @param jcount  Forwarded unchanged to mesh->rezone_all().
+ * @param mpot    Forwarded unchanged to mesh->rezone_all().
+ */
+void State::rezone_all(int icount, int jcount, vector<int> mpot)
+{
+   struct timeval rezone_start;
+   cpu_timer_start(&rezone_start);
+
+   mesh->rezone_all(icount, jcount, mpot, 1, state_memory);
+
+   // Under OpenMP only the master thread resets the pointers and updates the timer.
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   {
+      memory_reset_ptrs();
+
+      cpu_timers[STATE_TIMER_REZONE_ALL] += cpu_timer_stop(rezone_start);
+   } // end master region
+}
+
+
+#ifdef HAVE_OPENCL
+/**
+ * GPU rezone: delegates to mesh->gpu_rezone_all(), then re-fetches the
+ * dev_H, dev_U and dev_V handles from gpu_state_memory and charges the
+ * elapsed time to STATE_TIMER_REZONE_ALL.
+ *
+ * @param icount        Forwarded to mesh->gpu_rezone_all().
+ * @param jcount        Forwarded to mesh->gpu_rezone_all().
+ * @param localStencil  Not used by this routine.
+ */
+void State::gpu_rezone_all(int icount, int jcount, bool localStencil)
+{
+   struct timeval rezone_start;
+   cpu_timer_start(&rezone_start);
+
+   // Never executes; it only references localStencil to silence compiler warnings.
+   if (1 == 2) {
+      printf("DEBUG -- localStencil is %d\n",localStencil);
+   }
+
+   mesh->gpu_rezone_all(icount, jcount, dev_mpot, gpu_state_memory);
+
+   // Look the device buffers up again by name after the rezone.
+   dev_H = (cl_mem)gpu_state_memory.get_memory_ptr("dev_H");
+   dev_U = (cl_mem)gpu_state_memory.get_memory_ptr("dev_U");
+   dev_V = (cl_mem)gpu_state_memory.get_memory_ptr("dev_V");
+
+   const double elapsed = cpu_timer_stop(rezone_start);
+   gpu_timers[STATE_TIMER_REZONE_ALL] += (long)(elapsed*1.0e9);
+}
+#endif
+
+// SQ(x): square of x. The argument is expanded twice, so pass only side-effect-free expressions.
+#define SQ(x) ((x)*(x))
+// MIN3(a,b,c): minimum of three values; currently disabled.
+//#define MIN3(a,b,c) (min(min((a),(b)),(c)))
+// Flux terms for cell ic; they read the H, U, V arrays and ghalf from the scope where they expand.
+#define HXFLUX(ic) ( U[ic] )
+#define UXFLUX(ic) ( SQ(U[ic])/H[ic] + ghalf*SQ(H[ic]) )
+#define UVFLUX(ic) ( U[ic]*V[ic]/H[ic] )
+// The same x-direction terms on scalar locals: IC uses Hic/Uic/Vic; NL, NR, NB, NT use the l, r, b, t suffixed locals.
+#define HXFLUXIC ( Uic )
+#define HXFLUXNL ( Ul )
+#define HXFLUXNR ( Ur )
+#define HXFLUXNB ( Ub )
+#define HXFLUXNT ( Ut )
+
+#define UXFLUXIC ( SQ(Uic)/Hic + ghalf*SQ(Hic) )
+#define UXFLUXNL ( SQ(Ul)/Hl + ghalf*SQ(Hl) )
+#define UXFLUXNR ( SQ(Ur)/Hr + ghalf*SQ(Hr) )
+#define UXFLUXNB ( SQ(Ub)/Hb + ghalf*SQ(Hb) )
+#define UXFLUXNT ( SQ(Ut)/Ht + ghalf*SQ(Ht) )
+
+#define UVFLUXIC ( Uic*Vic/Hic )
+#define UVFLUXNL ( Ul*Vl/Hl )
+#define UVFLUXNR ( Ur*Vr/Hr )
+#define UVFLUXNB ( Ub*Vb/Hb )
+#define UVFLUXNT ( Ut*Vt/Ht )
+// y-direction flux terms for cell ic (array form).
+#define HYFLUX(ic) ( V[ic] )
+#define VUFLUX(ic) ( V[ic]*U[ic]/H[ic] )
+#define VYFLUX(ic) ( SQ(V[ic])/H[ic] + ghalf*SQ(H[ic]) )
+// y-direction terms on the same scalar locals as the x-direction set.
+#define HYFLUXIC ( Vic )
+#define HYFLUXNL ( Vl )
+#define HYFLUXNR ( Vr )
+#define HYFLUXNB ( Vb )
+#define HYFLUXNT ( Vt )
+
+#define VUFLUXIC ( Vic*Uic/Hic )
+#define VUFLUXNL ( Vl*Ul/Hl )
+#define VUFLUXNR ( Vr*Ur/Hr )
+#define VUFLUXNB ( Vb*Ub/Hb )
+#define VUFLUXNT ( Vt*Ut/Ht )
+
+#define VYFLUXIC ( SQ(Vic)/Hic + ghalf*SQ(Hic) )
+#define VYFLUXNL ( SQ(Vl)/Hl + ghalf*SQ(Hl) )
+#define VYFLUXNR ( SQ(Vr)/Hr + ghalf*SQ(Hr) )
+#define VYFLUXNB ( SQ(Vb)/Hb + ghalf*SQ(Hb) )
+#define VYFLUXNT ( SQ(Vt)/Ht + ghalf*SQ(Ht) )
+
+// Flux terms built from locals named H/U/V + xminus, xplus, yminus, yplus.
+#define HNEWXFLUXMINUS ( Uxminus )
+#define HNEWXFLUXPLUS ( Uxplus )
+#define UNEWXFLUXMINUS ( SQ(Uxminus)/Hxminus + ghalf*SQ(Hxminus) )
+#define UNEWXFLUXPLUS ( SQ(Uxplus) /Hxplus + ghalf*SQ(Hxplus) )
+#define UVNEWFLUXMINUS ( Uxminus*Vxminus/Hxminus )
+#define UVNEWFLUXPLUS ( Uxplus *Vxplus /Hxplus )
+
+#define HNEWYFLUXMINUS ( Vyminus )
+#define HNEWYFLUXPLUS ( Vyplus )
+#define VNEWYFLUXMINUS ( SQ(Vyminus)/Hyminus + ghalf*SQ(Hyminus) )
+#define VNEWYFLUXPLUS ( SQ(Vyplus) /Hyplus + ghalf*SQ(Hyplus) )
+#define VUNEWFLUXMINUS ( Vyminus*Uyminus/Hyminus )
+#define VUNEWFLUXPLUS ( Vyplus *Uyplus /Hyplus )
+
+// Added later: variants on the lt/rt/br/tr locals and on the "2" suffixed locals. NOTE(review): these use SQR, which is not defined in this section (only SQ is) -- confirm the two are equivalent.
+#define HXFLUXNLT ( Ult )
+#define HXFLUXNRT ( Urt )
+#define UXFLUXNLT ( SQR(Ult)/Hlt + ghalf*SQR(Hlt) )
+#define UXFLUXNRT ( SQR(Urt)/Hrt + ghalf*SQR(Hrt) )
+#define UVFLUXNLT ( Ult*Vlt/Hlt )
+#define UVFLUXNRT ( Urt*Vrt/Hrt )
+#define HYFLUXNBR ( Vbr )
+#define HYFLUXNTR ( Vtr )
+#define VUFLUXNBR ( Vbr*Ubr/Hbr )
+#define VUFLUXNTR ( Vtr*Utr/Htr )
+#define VYFLUXNBR ( SQR(Vbr)/Hbr + ghalf*SQR(Hbr) )
+#define VYFLUXNTR ( SQR(Vtr)/Htr + ghalf*SQR(Htr) )
+#define HNEWXFLUXMINUS2 ( Uxminus2 )
+#define HNEWXFLUXPLUS2 ( Uxplus2 )
+#define UNEWXFLUXMINUS2 ( SQR(Uxminus2)/Hxminus2 + ghalf*SQR(Hxminus2) )
+#define UNEWXFLUXPLUS2 ( SQR(Uxplus2) /Hxplus2 + ghalf*SQR(Hxplus2) )
+#define UVNEWFLUXMINUS2 ( Uxminus2*Vxminus2/Hxminus2 )
+#define UVNEWFLUXPLUS2 ( Uxplus2 *Vxplus2 /Hxplus2 )
+#define HNEWYFLUXMINUS2 ( Vyminus2 )
+#define HNEWYFLUXPLUS2 ( Vyplus2 )
+#define VNEWYFLUXMINUS2 ( SQR(Vyminus2)/Hyminus2 + ghalf*SQR(Hyminus2) )
+#define VNEWYFLUXPLUS2 ( SQR(Vyplus2) /Hyplus2 + ghalf*SQR(Hyplus2) )
+#define VUNEWFLUXMINUS2 ( Vyminus2*Uyminus2/Hyminus2 )
+#define VUNEWFLUXPLUS2 ( Vyplus2 *Uyplus2 /Hyplus2 )
+
+/**
+ * Advance H, U and V by one timestep of length deltaT on the CPU.
+ *
+ * After the boundary conditions (and, with MPI on more than one rank, the
+ * ghost-cell update) the routine walks the cell range returned by
+ * mesh->get_bounds(). For each cell it
+ *   1. gathers the left/right/bottom/top neighbors and their neighbors,
+ *      plus a second neighbor across any face whose neighbor has a higher
+ *      level than the cell itself,
+ *   2. computes half-step face values with U_halfstep() and turns them
+ *      into face fluxes (averaging the two fluxes on a face that has two
+ *      neighbors),
+ *   3. computes the w_corrector() terms for H, U (x only) and V (y only),
+ *   4. stores U_fullstep() plus the corrections in H_new/U_new/V_new.
+ * The new arrays then replace H, U and V via state_memory.memory_replace().
+ *
+ * Under OpenMP, allocation and replacement happen in master regions that
+ * are separated from the cell loop by barriers.
+ *
+ * @param deltaT  Timestep passed to U_halfstep, U_fullstep and w_corrector.
+ */
+void State::calc_finite_difference(double deltaT){
+   real_t   g     = 9.80;   // gravitational constant
+   real_t   ghalf = 0.5*g;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   size_t ncells     = mesh->ncells;
+   size_t &ncells_ghost = mesh->ncells_ghost;
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   if (ncells_ghost < ncells) ncells_ghost = ncells;
+
+   //printf("\nDEBUG finite diff\n");
+
+#ifdef HAVE_MPI
+   // We need to populate the ghost regions since the calc neighbors has just been
+   // established for the mesh shortly before
+   if (mesh->numpe > 1) {
+      apply_boundary_conditions_local();
+
+#ifdef _OPENMP
+#pragma omp master
+      {
+#endif
+      H=(state_t *)state_memory.memory_realloc(ncells_ghost, H);
+      U=(state_t *)state_memory.memory_realloc(ncells_ghost, U);
+      V=(state_t *)state_memory.memory_realloc(ncells_ghost, V);
+
+      L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+
+      apply_boundary_conditions_ghost();
+   } else {
+      apply_boundary_conditions();
+   }
+#else
+   apply_boundary_conditions();
+#endif
+
+   // Static locals are shared by all threads, so the pointers assigned in the
+   // master region below are the ones every thread writes through.
+   static state_t *H_new, *U_new, *V_new;
+   int *nlft, *nrht, *nbot, *ntop, *level;
+
+   nlft  = mesh->nlft;
+   nrht  = mesh->nrht;
+   nbot  = mesh->nbot;
+   ntop  = mesh->ntop;
+   level = mesh->level;
+
+   vector<real_t> &lev_deltax = mesh->lev_deltax;
+   vector<real_t> &lev_deltay = mesh->lev_deltay;
+
+   int flags = 0;
+   flags = RESTART_DATA;
+#if defined (HAVE_J7)
+   if (mesh->parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   {
+      H_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "H_new", flags);
+      U_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "U_new", flags);
+      V_new = (state_t *)state_memory.memory_malloc(ncells_ghost,
+                                                    sizeof(state_t),
+                                                    "V_new", flags);
+   }
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+   int lowerBound, upperBound;
+   mesh->get_bounds(lowerBound, upperBound);
+
+   for(int gix = lowerBound; gix < upperBound; gix++) {
+#if DEBUG >= 3
+      printf("%d: DEBUG gix is %d at line %d in file %s\n",mesh->mype,gix,__LINE__,__FILE__);
+#endif
+
+      int lvl     = level[gix];
+      int nl      = nlft[gix];
+      int nr      = nrht[gix];
+      int nt      = ntop[gix];
+      int nb      = nbot[gix];
+
+      real_t Hic     = H[gix];
+      real_t Uic     = U[gix];
+      real_t Vic     = V[gix];
+
+      // The DEBUG >= 3 checks below report any neighbor index outside
+      // [0, ncells_ghost). Indices are int, so they are printed with %d.
+#if DEBUG >= 3
+      if (nl < 0 || (size_t)nl >= ncells_ghost ) printf("%d: Problem at file %s line %d with nl %d\n",mesh->mype,__FILE__,__LINE__,nl);
+#endif
+      int nll     = nlft[nl];
+      real_t Hl      = H[nl];
+      real_t Ul      = U[nl];
+      real_t Vl      = V[nl];
+
+#if DEBUG >= 3
+      if (nr < 0 || (size_t)nr >= ncells_ghost ) printf("%d: Problem at file %s line %d with nr %d\n",mesh->mype,__FILE__,__LINE__,nr);
+#endif
+      int nrr     = nrht[nr];
+      real_t Hr      = H[nr];
+      real_t Ur      = U[nr];
+      real_t Vr      = V[nr];
+
+#if DEBUG >= 3
+      if (nt < 0 || (size_t)nt >= ncells_ghost ) printf("%d: Problem at file %s line %d with nt %d\n",mesh->mype,__FILE__,__LINE__,nt);
+#endif
+      int ntt     = ntop[nt];
+      real_t Ht      = H[nt];
+      real_t Ut      = U[nt];
+      real_t Vt      = V[nt];
+
+#if DEBUG >= 3
+      if (nb < 0 || (size_t)nb >= ncells_ghost ) printf("%d: Problem at file %s line %d with nb %d\n",mesh->mype,__FILE__,__LINE__,nb);
+#endif
+      int nbb     = nbot[nb];
+      real_t Hb      = H[nb];
+      real_t Ub      = U[nb];
+      real_t Vb      = V[nb];
+
+      int nlt     = ntop[nl];
+      int nrt     = ntop[nr];
+      int ntr     = nrht[nt];
+      int nbr     = nrht[nb];
+
+#if DEBUG >= 3
+      if (nll < 0 || (size_t)nll >= ncells_ghost ) printf("%d: Problem at file %s line %d with nll %d\n",mesh->mype,__FILE__,__LINE__,nll);
+#endif
+      real_t Hll     = H[nll];
+      real_t Ull     = U[nll];
+      //real_t Vll     = V[nll];
+
+#if DEBUG >= 3
+      if (nrr < 0 || (size_t)nrr >= ncells_ghost ) printf("%d: Problem at file %s line %d with nrr %d\n",mesh->mype,__FILE__,__LINE__,nrr);
+#endif
+      real_t Hrr     = H[nrr];
+      real_t Urr     = U[nrr];
+      //real_t Vrr     = V[nrr];
+
+#if DEBUG >= 3
+      if (ntt < 0 || (size_t)ntt >= ncells_ghost ) printf("%d: Problem at file %s line %d with ntt %d\n",mesh->mype,__FILE__,__LINE__,ntt);
+#endif
+      real_t Htt     = H[ntt];
+      //real_t Utt     = U[ntt];
+      real_t Vtt     = V[ntt];
+
+#if DEBUG >= 3
+      if (nbb < 0 || (size_t)nbb >= ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with nbb %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbb); sleep(15); }
+#endif
+      real_t Hbb     = H[nbb];
+      //real_t Ubb     = U[nbb];
+      real_t Vbb     = V[nbb];
+
+#if DEBUG >= 3
+      if (lvl < 0 || lvl >= (int)lev_deltax.size() ) printf("%d: Problem at file %s line %d with lvl %d\n",mesh->mype,__FILE__,__LINE__,lvl);
+#endif
+      real_t dxic    = lev_deltax[lvl];
+      real_t dyic    = lev_deltay[lvl];
+
+      real_t dxl     = lev_deltax[level[nl]];
+      real_t dxr     = lev_deltax[level[nr]];
+
+      real_t dyt     = lev_deltay[level[nt]];
+      real_t dyb     = lev_deltay[level[nb]];
+
+      real_t drl     = dxl;
+      real_t drr     = dxr;
+      real_t drt     = dyt;
+      real_t drb     = dyb;
+
+      real_t dric    = dxic;
+
+      // For each face whose neighbor has a higher level than this cell,
+      // also load the second neighbor on that face (lt, rt, br, tr) and the
+      // cell beyond it. Otherwise these values stay at zero.
+      int nltl = 0;
+      real_t Hlt = 0.0, Ult = 0.0, Vlt = 0.0;
+      real_t Hll2 = 0.0;
+      real_t Ull2 = 0.0;
+      if(lvl < level[nl]) {
+#if DEBUG >= 3
+         if (nlt < 0 || (size_t)nlt >= ncells_ghost ) printf("%d: Problem at file %s line %d with nlt %d\n",mesh->mype,__FILE__,__LINE__,nlt);
+#endif
+         Hlt  = H[ ntop[nl] ];
+         Ult  = U[ ntop[nl] ];
+         Vlt  = V[ ntop[nl] ];
+         nltl = nlft[nlt];
+#if DEBUG >= 3
+         if (nltl < 0 || (size_t)nltl >= ncells_ghost ) printf("%d: Problem at file %s line %d with nltl %d\n",mesh->mype,__FILE__,__LINE__,nltl);
+#endif
+         Hll2 = H[nltl];
+         Ull2 = U[nltl];
+      }
+
+      int nrtr = 0;
+      real_t Hrt = 0.0, Urt = 0.0, Vrt = 0.0;
+      real_t Hrr2 = 0.0;
+      real_t Urr2 = 0.0;
+      if(lvl < level[nr]) {
+#if DEBUG >= 3
+         if (nrt < 0 || (size_t)nrt >= ncells_ghost ) printf("%d: Problem at file %s line %d with nrt %d\n",mesh->mype,__FILE__,__LINE__,nrt);
+#endif
+         Hrt  = H[ ntop[nr] ];
+         Urt  = U[ ntop[nr] ];
+         Vrt  = V[ ntop[nr] ];
+         nrtr = nrht[nrt];
+#if DEBUG >= 3
+         if (nrtr < 0 || (size_t)nrtr >= ncells_ghost ) printf("%d: Problem at file %s line %d with nrtr %d\n",mesh->mype,__FILE__,__LINE__,nrtr);
+#endif
+         Hrr2 = H[nrtr];
+         Urr2 = U[nrtr];
+      }
+
+      int nbrb = 0;
+      real_t Hbr = 0.0, Ubr = 0.0, Vbr = 0.0;
+      real_t Hbb2 = 0.0;
+      real_t Vbb2 = 0.0;
+      if(lvl < level[nb]) {
+#if DEBUG >= 3
+         if (nbr < 0 || (size_t)nbr >= ncells_ghost ) printf("%d: Problem at file %s line %d with nbr %d\n",mesh->mype,__FILE__,__LINE__,nbr);
+#endif
+         Hbr  = H[ nrht[nb] ];
+         Ubr  = U[ nrht[nb] ];
+         Vbr  = V[ nrht[nb] ];
+         nbrb = nbot[nbr];
+#if DEBUG >= 3
+         if (nbrb < 0 || (size_t)nbrb >= ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with nbrb %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbrb); sleep(20);}
+#endif
+         Hbb2 = H[nbrb];
+         Vbb2 = V[nbrb];
+      }
+
+      int ntrt = 0;
+      real_t Htr = 0.0, Utr = 0.0, Vtr = 0.0;
+      real_t Htt2 = 0.0;
+      real_t Vtt2 = 0.0;
+      if(lvl < level[nt]) {
+#if DEBUG >= 3
+         if (ntr < 0 || (size_t)ntr >= ncells_ghost ) printf("%d: Problem at file %s line %d with ntr %d\n",mesh->mype,__FILE__,__LINE__,ntr);
+#endif
+         Htr  = H[ nrht[nt] ];
+         Utr  = U[ nrht[nt] ];
+         Vtr  = V[ nrht[nt] ];
+         ntrt = ntop[ntr];
+#if DEBUG >= 3
+         if (ntrt < 0 || (size_t)ntrt >= ncells_ghost ) {printf("%d: Problem at file %s line %d ic %d %d with ntrt %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,ntrt); sleep(20); }
+#endif
+         Htt2 = H[ntrt];
+         Vtt2 = V[ntrt];
+      }
+
+
+      // Half-step values on the four faces of the cell.
+      real_t Hxminus = U_halfstep(deltaT, Hl, Hic, HXFLUXNL, HXFLUXIC,
+                                  dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+      real_t Uxminus = U_halfstep(deltaT, Ul, Uic, UXFLUXNL, UXFLUXIC,
+                                  dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+      real_t Vxminus = U_halfstep(deltaT, Vl, Vic, UVFLUXNL, UVFLUXIC,
+                                  dxl, dxic, dxl, dxic, SQR(dxl), SQR(dxic));
+
+      real_t Hxplus  = U_halfstep(deltaT, Hic, Hr, HXFLUXIC, HXFLUXNR,
+                                  dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+      real_t Uxplus  = U_halfstep(deltaT, Uic, Ur, UXFLUXIC, UXFLUXNR,
+                                  dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+      real_t Vxplus  = U_halfstep(deltaT, Vic, Vr, UVFLUXIC, UVFLUXNR,
+                                  dxic, dxr, dxic, dxr, SQR(dxic), SQR(dxr));
+
+      real_t Hyminus = U_halfstep(deltaT, Hb, Hic, HYFLUXNB, HYFLUXIC,
+                                  dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+      real_t Uyminus = U_halfstep(deltaT, Ub, Uic, VUFLUXNB, VUFLUXIC,
+                                  dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+      real_t Vyminus = U_halfstep(deltaT, Vb, Vic, VYFLUXNB, VYFLUXIC,
+                                  dyb, dyic, dyb, dyic, SQR(dyb), SQR(dyic));
+
+      real_t Hyplus  = U_halfstep(deltaT, Hic, Ht, HYFLUXIC, HYFLUXNT,
+                                  dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+      real_t Uyplus  = U_halfstep(deltaT, Uic, Ut, VUFLUXIC, VUFLUXNT,
+                                  dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+      real_t Vyplus  = U_halfstep(deltaT, Vic, Vt, VYFLUXIC, VYFLUXNT,
+                                  dyic, dyt, dyic, dyt, SQR(dyic), SQR(dyt));
+
+      // Fluxes evaluated from the half-step values.
+      real_t Hxfluxminus = HNEWXFLUXMINUS;
+      real_t Uxfluxminus = UNEWXFLUXMINUS;
+      real_t Vxfluxminus = UVNEWFLUXMINUS;
+
+      real_t Hxfluxplus  = HNEWXFLUXPLUS;
+      real_t Uxfluxplus  = UNEWXFLUXPLUS;
+      real_t Vxfluxplus  = UVNEWFLUXPLUS;
+
+      real_t Hyfluxminus = HNEWYFLUXMINUS;
+      real_t Uyfluxminus = VUNEWFLUXMINUS;
+      real_t Vyfluxminus = VNEWYFLUXMINUS;
+
+      real_t Hyfluxplus  = HNEWYFLUXPLUS;
+      real_t Uyfluxplus  = VUNEWFLUXPLUS;
+      real_t Vyfluxplus  = VNEWYFLUXPLUS;
+
+      // Where a face has a second neighbor, compute its half-step values too
+      // and average the two fluxes on that face.
+      real_t Hxminus2 = 0.0;
+      real_t Uxminus2 = 0.0;
+      real_t Vxminus2 = 0.0;
+      if(lvl < level[nl]) {
+
+         Hxminus2 = U_halfstep(deltaT, Hlt, Hic, HXFLUXNLT, HXFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+         Uxminus2 = U_halfstep(deltaT, Ult, Uic, UXFLUXNLT, UXFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+         Vxminus2 = U_halfstep(deltaT, Vlt, Vic, UVFLUXNLT, UVFLUXIC,
+                               drl, dric, drl, dric, SQR(drl), SQR(dric));
+
+         Hxfluxminus = (Hxfluxminus + HNEWXFLUXMINUS2) * HALF;
+         Uxfluxminus = (Uxfluxminus + UNEWXFLUXMINUS2) * HALF;
+         Vxfluxminus = (Vxfluxminus + UVNEWFLUXMINUS2) * HALF;
+
+      }
+
+      real_t Hxplus2 = 0.0;
+      real_t Uxplus2 = 0.0;
+      real_t Vxplus2 = 0.0;
+      if(lvl < level[nr]) {
+
+         Hxplus2  = U_halfstep(deltaT, Hic, Hrt, HXFLUXIC, HXFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+         Uxplus2  = U_halfstep(deltaT, Uic, Urt, UXFLUXIC, UXFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+         Vxplus2  = U_halfstep(deltaT, Vic, Vrt, UVFLUXIC, UVFLUXNRT,
+                               dric, drr, dric, drr, SQR(dric), SQR(drr));
+
+         Hxfluxplus  = (Hxfluxplus + HNEWXFLUXPLUS2) * HALF;
+         Uxfluxplus  = (Uxfluxplus + UNEWXFLUXPLUS2) * HALF;
+         Vxfluxplus  = (Vxfluxplus + UVNEWFLUXPLUS2) * HALF;
+
+      }
+
+      real_t Hyminus2 = 0.0;
+      real_t Uyminus2 = 0.0;
+      real_t Vyminus2 = 0.0;
+      if(lvl < level[nb]) {
+
+         Hyminus2 = U_halfstep(deltaT, Hbr, Hic, HYFLUXNBR, HYFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+         Uyminus2 = U_halfstep(deltaT, Ubr, Uic, VUFLUXNBR, VUFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+         Vyminus2 = U_halfstep(deltaT, Vbr, Vic, VYFLUXNBR, VYFLUXIC,
+                               drb, dric, drb, dric, SQR(drb), SQR(dric));
+
+         Hyfluxminus = (Hyfluxminus + HNEWYFLUXMINUS2) * HALF;
+         Uyfluxminus = (Uyfluxminus + VUNEWFLUXMINUS2) * HALF;
+         Vyfluxminus = (Vyfluxminus + VNEWYFLUXMINUS2) * HALF;
+
+      }
+
+      real_t Hyplus2 = 0.0;
+      real_t Uyplus2 = 0.0;
+      real_t Vyplus2 = 0.0;
+      if(lvl < level[nt]) {
+
+         Hyplus2  = U_halfstep(deltaT, Hic, Htr, HYFLUXIC, HYFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+         Uyplus2  = U_halfstep(deltaT, Uic, Utr, VUFLUXIC, VUFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+         Vyplus2  = U_halfstep(deltaT, Vic, Vtr, VYFLUXIC, VYFLUXNTR,
+                               dric, drt, dric, drt, SQR(dric), SQR(drt));
+
+         Hyfluxplus  = (Hyfluxplus + HNEWYFLUXPLUS2) * HALF;
+         Uyfluxplus  = (Uyfluxplus + VUNEWFLUXPLUS2) * HALF;
+         Vyfluxplus  = (Vyfluxplus + VNEWYFLUXPLUS2) * HALF;
+
+      }
+
+      //if (DEBUG >= 2) {
+      //   printf("1st pass x direction nz %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf\n",
+      //      gix, nl, nr,
+      //      Hxplus,Hxplus2,Uxplus,Uxplus2,Vxplus,Vxplus2);
+      //      //H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+      //}
+
+      ////////////////////////////////////////
+      /// Artificial Viscosity corrections ///
+      ////////////////////////////////////////
+
+
+      if(level[nl] < level[nll]) {
+#if DEBUG >= 3
+         int nllt = ntop[nll];
+         if (nllt < 0 || (size_t)nllt >= ncells_ghost ) printf("%d: Problem at file %s line %d with nllt %d\n",mesh->mype,__FILE__,__LINE__,nllt);
+#endif
+         Hll = (Hll + H[ ntop[nll] ]) * HALF;
+         Ull = (Ull + U[ ntop[nll] ]) * HALF;
+      }
+
+      real_t Hr2 = Hr;
+      real_t Ur2 = Ur;
+      if(lvl < level[nr]) {
+         Hr2 = (Hr2 + Hrt) * HALF;
+         Ur2 = (Ur2 + Urt) * HALF;
+      }
+
+      real_t wminusx_H = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Hic-Hl, Hl-Hll, Hr2-Hic);
+
+      wminusx_H *= Hic - Hl;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Hll2 = (Hll2 + H[ ntop[nltl] ]) * HALF;
+         wminusx_H = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Hic-Hlt, Hlt-Hll2, Hr2-Hic) *
+                      (Hic - Hlt)) + wminusx_H)*HALF*HALF;
+      }
+
+
+      if(level[nr] < level[nrr]) {
+#if DEBUG >= 3
+         int nrrt = ntop[nrr];
+         if (nrrt < 0 || (size_t)nrrt >= ncells_ghost ) printf("%d: Problem at file %s line %d with nrrt %d\n",mesh->mype,__FILE__,__LINE__,nrrt);
+#endif
+         Hrr = (Hrr + H[ ntop[nrr] ]) * HALF;
+         Urr = (Urr + U[ ntop[nrr] ]) * HALF;
+      }
+
+      real_t Hl2 = Hl;
+      real_t Ul2 = Ul;
+      if(lvl < level[nl]) {
+         Hl2 = (Hl2 + Hlt) * HALF;
+         Ul2 = (Ul2 + Ult) * HALF;
+      }
+
+      real_t wplusx_H = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                           Hr-Hic, Hic-Hl2, Hrr-Hr);
+
+      wplusx_H *= Hr - Hic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Hrr2 = (Hrr2 + H[ ntop[nrtr] ]) * HALF;
+         wplusx_H = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Hrt-Hic, Hic-Hl2, Hrr2-Hrt) *
+                      (Hrt - Hic))+wplusx_H)*HALF*HALF;
+      }
+
+
+      real_t wminusx_U = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+                              Uic-Ul, Ul-Ull, Ur2-Uic);
+
+      wminusx_U *= Uic - Ul;
+
+      if(lvl < level[nl]) {
+         if(level[nlt] < level[nltl])
+            Ull2 = (Ull2 + U[ ntop[nltl] ]) * HALF;
+         wminusx_U = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+                                  sqrt(g*Hxminus2), Uic-Ult, Ult-Ull2, Ur2-Uic) *
+                      (Uic - Ult))+wminusx_U)*HALF*HALF;
+      }
+
+
+      real_t wplusx_U = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+                              Ur-Uic, Uic-Ul2, Urr-Ur);
+
+      wplusx_U *= Ur - Uic;
+
+      if(lvl < level[nr]) {
+         if(level[nrt] < level[nrtr])
+            Urr2 = (Urr2 + U[ ntop[nrtr] ]) * HALF;
+         wplusx_U = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+                                  sqrt(g*Hxplus2), Urt-Uic, Uic-Ul2, Urr2-Urt) *
+                      (Urt - Uic))+wplusx_U)*HALF*HALF;
+      }
+
+
+      if(level[nb] < level[nbb]) {
+#if DEBUG >= 3
+         int nbbr = nrht[nbb];
+         if (nbbr < 0 || (size_t)nbbr >= ncells_ghost ) printf("%d: Problem at file %s line %d gix %d %d with nbbr %d\n",mesh->mype,__FILE__,__LINE__,gix,gix+mesh->noffset,nbbr);
+#endif
+         Hbb = (Hbb + H[ nrht[nbb] ]) * HALF;
+         Vbb = (Vbb + V[ nrht[nbb] ]) * HALF;
+      }
+
+      real_t Ht2 = Ht;
+      real_t Vt2 = Vt;
+      if(lvl < level[nt]) {
+         Ht2 = (Ht2 + Htr) * HALF;
+         Vt2 = (Vt2 + Vtr) * HALF;
+      }
+
+      real_t wminusy_H = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Hic-Hb, Hb-Hbb, Ht2-Hic);
+
+      wminusy_H *= Hic - Hb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Hbb2 = (Hbb2 + H[ nrht[nbrb] ]) * HALF;
+         wminusy_H = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Hic-Hbr, Hbr-Hbb2, Ht2-Hic) *
+                      (Hic - Hbr))+wminusy_H)*HALF*HALF;
+      }
+
+
+      if(level[nt] < level[ntt]) {
+#if DEBUG >= 3
+         int nttr = nrht[ntt];
+         if (nttr < 0 || (size_t)nttr >= ncells_ghost ) printf("%d: Problem at file %s line %d with nttr %d\n",mesh->mype,__FILE__,__LINE__,nttr);
+#endif
+         Htt = (Htt + H[ nrht[ntt] ]) * HALF;
+         Vtt = (Vtt + V[ nrht[ntt] ]) * HALF;
+      }
+
+      real_t Hb2 = Hb;
+      real_t Vb2 = Vb;
+      if(lvl < level[nb]) {
+         Hb2 = (Hb2 + Hbr) * HALF;
+         Vb2 = (Vb2 + Vbr) * HALF;
+      }
+
+      real_t wplusy_H = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                             Ht-Hic, Hic-Hb2, Htt-Ht);
+
+      wplusy_H *= Ht - Hic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Htt2 = (Htt2 + H[ nrht[ntrt] ]) * HALF;
+         wplusy_H = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Htr-Hic, Hic-Hb2, Htt2-Htr) *
+                      (Htr - Hic))+wplusy_H)*HALF*HALF;
+      }
+
+      real_t wminusy_V = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+                              Vic-Vb, Vb-Vbb, Vt2-Vic);
+
+      wminusy_V *= Vic - Vb;
+
+      if(lvl < level[nb]) {
+         if(level[nbr] < level[nbrb])
+            Vbb2 = (Vbb2 + V[ nrht[nbrb] ]) * HALF;
+         wminusy_V = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+                                  sqrt(g*Hyminus2), Vic-Vbr, Vbr-Vbb2, Vt2-Vic) *
+                      (Vic - Vbr))+wminusy_V)*HALF*HALF;
+      }
+
+      real_t wplusy_V = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+                             Vt-Vic, Vic-Vb2, Vtt-Vt);
+
+      wplusy_V *= Vt - Vic;
+
+      if(lvl < level[nt]) {
+         if(level[ntr] < level[ntrt])
+            Vtt2 = (Vtt2 + V[ nrht[ntrt] ]) * HALF;
+         wplusy_V = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+                                  sqrt(g*Hyplus2), Vtr-Vic, Vic-Vb2, Vtt2-Vtr) *
+                      (Vtr - Vic))+wplusy_V)*HALF*HALF;
+      }
+
+      // Full step. H takes corrections in both directions, U only the x
+      // corrections and V only the y corrections.
+      H_new[gix] = U_fullstep(deltaT, dxic, Hic,
+                       Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus)
+                  - wminusx_H + wplusx_H - wminusy_H + wplusy_H;
+      U_new[gix] = U_fullstep(deltaT, dxic, Uic,
+                       Uxfluxplus, Uxfluxminus, Uyfluxplus, Uyfluxminus)
+                  - wminusx_U + wplusx_U;
+      V_new[gix] = U_fullstep(deltaT, dxic, Vic,
+                       Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus)
+                  - wminusy_V + wplusy_V;
+
+#if DEBUG >= 1
+      if (DEBUG >= 1) {
+         real_t U_tmp = U_new[gix];
+         real_t V_tmp = V_new[gix];
+         // Rewrites a negative zero as positive zero before printing.
+         if (U_tmp == 0.0) U_tmp = 0.0;
+         if (V_tmp == 0.0) V_tmp = 0.0;
+         printf("DEBUG ic %d H_new %lf U_new %lf V_new %lf\n",gix,H_new[gix],U_tmp,V_tmp);
+      }
+#endif
+
+/*
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Hic, %lf Hxfluxplus, %lf Hxfluxminus, %lf Hyfluxplus, %lf Hyfluxminus %lf\n",
+         gix, deltaT, dxic, Hic, Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus);
+      printf("DEBUG ic %d wminusx_H %lf wplusx_H %lf wminusy_H %lf wplusy_H %lf\n",gix, wminusx_H, wplusx_H, wminusy_H, wplusy_H);
+      printf("DEBUG ic %d deltaT, %lf dxic, %lf Vic, %lf Vxfluxplus, %lf Vxfluxminus, %lf Vyfluxplus, %lf Vyfluxminus %lf\n",
+         gix, deltaT, dxic, Vic, Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus);
+      printf("DEBUG ic %d wminusy_V %lf wplusy_V %lf\n",gix, wminusy_V, wplusy_V);
+*/
+   } // cell loop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+   {
+#endif
+   // Replace H with H_new and deallocate H. New memory will have the characteristics
+   // of the new memory and the name of the old. Both return and arg1 will be reset to new memory
+   H = (state_t *)state_memory.memory_replace(H, H_new);
+   U = (state_t *)state_memory.memory_replace(U, U_new);
+   V = (state_t *)state_memory.memory_replace(V, V_new);
+
+   //state_memory.memory_report();
+   //printf("DEBUG end finite diff\n\n");
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+   cpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += cpu_timer_stop(tstart_cpu);
+}
+
+void State::calc_finite_difference_via_faces(double deltaT){
+ real_t g = 9.80; // gravitational constant
+ real_t ghalf = HALF*g;
+
+ struct timeval tstart_cpu;
+ cpu_timer_start(&tstart_cpu);
+
+ size_t ncells = mesh->ncells;
+ size_t &ncells_ghost = mesh->ncells_ghost;
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ if (ncells_ghost < ncells) ncells_ghost = ncells;
+
+ //printf("\nDEBUG finite diff\n");
+
+#ifdef HAVE_MPI
+ // We need to populate the ghost regions since the calc neighbors has just been
+ // established for the mesh shortly before
+ if (mesh->numpe > 1) {
+ apply_boundary_conditions_local();
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ H=(state_t *)state_memory.memory_realloc(ncells_ghost, H);
+ U=(state_t *)state_memory.memory_realloc(ncells_ghost, U);
+ V=(state_t *)state_memory.memory_realloc(ncells_ghost, V);
+
+ L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+ L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+ L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ apply_boundary_conditions_ghost();
+ } else {
+ apply_boundary_conditions();
+ }
+#else
+ apply_boundary_conditions();
+#endif
+
+ int *nlft, *nrht, *nbot, *ntop, *level;
+
+ nlft = mesh->nlft;
+ nrht = mesh->nrht;
+ nbot = mesh->nbot;
+ ntop = mesh->ntop;
+ level = mesh->level;
+
+ vector<real_t> &lev_deltax = mesh->lev_deltax;
+ vector<real_t> &lev_deltay = mesh->lev_deltay;
+
+ int flags = 0;
+ flags = RESTART_DATA;
+#if defined (HAVE_J7)
+ if (mesh->parallel) flags = LOAD_BALANCE_MEMORY;
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ mesh->calc_face_list_wbidirmap();
+#ifdef _OPENMP
+ }
+#endif
+
+ static vector<state_t> Hx, Ux, Vx;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ Hx.resize(mesh->nxface);
+ Ux.resize(mesh->nxface);
+ Vx.resize(mesh->nxface);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int iface = 0; iface < mesh->nxface; iface++){
+ int cell_lower = mesh->map_xface2cell_lower[iface];
+ int cell_upper = mesh->map_xface2cell_upper[iface];
+ int level_lower = level[cell_lower];
+ int level_upper = level[cell_upper];
+ if (level_lower == level_upper) {
+ int lev = level_upper;
+ real_t Cxhalf = 0.5*deltaT/mesh->lev_deltax[lev];
+ Hx[iface]=HALF*(H[cell_upper]+H[cell_lower]) - Cxhalf*( HXFLUX(cell_upper)-HXFLUX(cell_lower) );
+ Ux[iface]=HALF*(U[cell_upper]+U[cell_lower]) - Cxhalf*( UXFLUX(cell_upper)-UXFLUX(cell_lower) );
+ Vx[iface]=HALF*(V[cell_upper]+V[cell_lower]) - Cxhalf*( UVFLUX(cell_upper)-UVFLUX(cell_lower) );
+ } else {
+ real_t dx_lower = mesh->lev_deltax[level[cell_lower]];
+ real_t dx_upper = mesh->lev_deltax[level[cell_upper]];
+
+ real_t FA_lower = dx_lower;
+ real_t FA_upper = dx_upper;
+ real_t FA_lolim = FA_lower*min(ONE, FA_upper/FA_lower);
+ real_t FA_uplim = FA_upper*min(ONE, FA_lower/FA_upper);
+
+ real_t CV_lower = SQ(dx_lower);
+ real_t CV_upper = SQ(dx_upper);
+ real_t CV_lolim = CV_lower*min(HALF, CV_upper/CV_lower);
+ real_t CV_uplim = CV_upper*min(HALF, CV_lower/CV_upper);
+
+ // Weighted half-step calculation
+ //
+ // (dx_lower*H[cell_upper]+dx_upper*H[cell_lower])
+ // ----------------------------------------------- -
+ // (dx_lower+dx_upper)
+ //
+ // ( (FA_uplim*HXFLUX(cell_upper))-(FA_lolim*HXFLUX(cell_lower)) )
+ // 0.5*deltaT * ----------------------------------------------------------------
+ // (CV_uplim+CV_lolim)
+ //
+
+ Hx[iface]=(dx_lower*H[cell_upper]+dx_upper*H[cell_lower])/(dx_lower+dx_upper) -
+ HALF*deltaT*( (FA_uplim*HXFLUX(cell_upper))-(FA_lolim*HXFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+ Ux[iface]=(dx_lower*U[cell_upper]+dx_upper*U[cell_lower])/(dx_lower+dx_upper) -
+ HALF*deltaT*( (FA_uplim*UXFLUX(cell_upper))-(FA_lolim*UXFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+ Vx[iface]=(dx_lower*V[cell_upper]+dx_upper*V[cell_lower])/(dx_lower+dx_upper) -
+ HALF*deltaT*( (FA_uplim*UVFLUX(cell_upper))-(FA_lolim*UVFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+ }
+#if DEBUG >= 2
+ if (DEBUG >= 2) {
+ printf("1st pass x direction iface %d i %d j %d lev %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
+ iface, mesh->xface_i[iface], mesh->xface_j[iface], mesh->xface_level[iface],
+ mesh->map_xface2cell_lower[iface], mesh->map_xface2cell_upper[iface],
+ Hx[iface],Ux[iface],Vx[iface],
+ H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+ }
+#endif
+ }
+#if DEBUG >= 2
+ if (DEBUG >= 2) {
+ printf("\n");
+ }
+#endif
+
+ static vector<state_t> Hy, Uy, Vy;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ Hy.resize(mesh->nyface);
+ Uy.resize(mesh->nyface);
+ Vy.resize(mesh->nyface);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp for
+#endif
+ for (int iface = 0; iface < mesh->nyface; iface++){
+ int cell_lower = mesh->map_yface2cell_lower[iface];
+ int cell_upper = mesh->map_yface2cell_upper[iface];
+ int level_lower = level[cell_lower];
+ int level_upper = level[cell_upper];
+ if (level_lower == level_upper) {
+ int lev = level_upper;
+ real_t Cyhalf = 0.5*deltaT/mesh->lev_deltay[lev];
+ Hy[iface]=HALF*(H[cell_upper]+H[cell_lower]) - Cyhalf*( HYFLUX(cell_upper)-HYFLUX(cell_lower) );
+ Uy[iface]=HALF*(U[cell_upper]+U[cell_lower]) - Cyhalf*( UVFLUX(cell_upper)-UVFLUX(cell_lower) );
+ Vy[iface]=HALF*(V[cell_upper]+V[cell_lower]) - Cyhalf*( VYFLUX(cell_upper)-VYFLUX(cell_lower) );
+ } else {
+ real_t dy_lower = mesh->lev_deltay[level[cell_lower]];
+ real_t dy_upper = mesh->lev_deltay[level[cell_upper]];
+
+ real_t FA_lower = dy_lower;
+ real_t FA_upper = dy_upper;
+ real_t FA_lolim = FA_lower*min(ONE, FA_upper/FA_lower);
+ real_t FA_uplim = FA_upper*min(ONE, FA_lower/FA_upper);
+
+ real_t CV_lower = SQ(dy_lower);
+ real_t CV_upper = SQ(dy_upper);
+ real_t CV_lolim = CV_lower*min(HALF, CV_upper/CV_lower);
+ real_t CV_uplim = CV_upper*min(HALF, CV_lower/CV_upper);
+
+ // Weighted half-step calculation
+ //
+ // (dy_lower*H[cell_upper]+dy_upper*H[cell_lower])
+ // ----------------------------------------------- -
+ // (dy_lower+dy_upper)
+ //
+ // ( (FA_uplim*HYFLUX(cell_upper))-(FA_lolim*HYFLUX(cell_lower)) )
+ // 0.5*deltaT * ----------------------------------------------------------------
+ // (CV_uplim+CV_lolim)
+ //
+
+ Hy[iface]=(dy_lower*H[cell_upper]+dy_upper*H[cell_lower])/(dy_lower+dy_upper) -
+ HALF*deltaT*( (FA_uplim*HYFLUX(cell_upper))-(FA_lolim*HYFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+ Uy[iface]=(dy_lower*U[cell_upper]+dy_upper*U[cell_lower])/(dy_lower+dy_upper) -
+ HALF*deltaT*( (FA_uplim*UVFLUX(cell_upper))-(FA_lolim*UVFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+ Vy[iface]=(dy_lower*V[cell_upper]+dy_upper*V[cell_lower])/(dy_lower+dy_upper) -
+ HALF*deltaT*( (FA_uplim*VYFLUX(cell_upper))-(FA_lolim*VYFLUX(cell_lower)) )/
+ (CV_uplim+CV_lolim);
+
+ }
+
+#if DEBUG >= 2
+ if (DEBUG >= 2) {
+ printf("1st pass y direction iface %d i %d j %d lev %d nzlower %d nzupper %d %lf %lf %lf %lf %lf %lf %lf %lf %lf\n",
+ iface, mesh->yface_i[iface], mesh->yface_j[iface], mesh->yface_level[iface],
+ mesh->map_yface2cell_lower[iface], mesh->map_yface2cell_upper[iface],
+ Hy[iface],Uy[iface],Vy[iface],
+ H[cell_upper],H[cell_lower],U[cell_upper],U[cell_lower],V[cell_upper],V[cell_lower]);
+ }
+#endif
+ }
+#if DEBUG >= 2
+ if (DEBUG >= 2) {
+ printf("\n");
+ }
+#endif
+
+ static state_t *H_new, *U_new, *V_new;
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ H_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "H_new", flags);
+ U_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "U_new", flags);
+ V_new = (state_t *)state_memory.memory_malloc(mesh->ncells_ghost, sizeof(state_t), "V_new", flags);
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+ int lowerBound, upperBound;
+
+ mesh->get_bounds(lowerBound, upperBound);
+ for (int ic = lowerBound; ic < upperBound; ic++){
+
+ int lvl = level[ic];
+ int nl = nlft[ic];
+ int nr = nrht[ic];
+ int nt = ntop[ic];
+ int nb = nbot[ic];
+
+ real_t Hic = H[ic];
+ real_t Uic = U[ic];
+ real_t Vic = V[ic];
+
+ int nll = nlft[nl];
+ real_t Hl = H[nl];
+ real_t Ul = U[nl];
+ //real_t Vl = V[nl];
+
+ int nrr = nrht[nr];
+ real_t Hr = H[nr];
+ real_t Ur = U[nr];
+ //real_t Vr = V[nr];
+
+ int ntt = ntop[nt];
+ real_t Ht = H[nt];
+ //real_t Ut = U[nt];
+ real_t Vt = V[nt];
+
+ int nbb = nbot[nb];
+ real_t Hb = H[nb];
+ //real_t Ub = U[nb];
+ real_t Vb = V[nb];
+
+ int nlt = ntop[nl];
+ int nrt = ntop[nr];
+ int ntr = nrht[nt];
+ int nbr = nrht[nb];
+
+ real_t Hll = H[nll];
+ real_t Ull = U[nll];
+ //real_t Vll = V[nll];
+
+ real_t Hrr = H[nrr];
+ real_t Urr = U[nrr];
+ //real_t Vrr = V[nrr];
+
+ real_t Htt = H[ntt];
+ //real_t Utt = U[ntt];
+ real_t Vtt = V[ntt];
+
+ real_t Hbb = H[nbb];
+ //real_t Ubb = U[nbb];
+ real_t Vbb = V[nbb];
+
+ real_t dxic = lev_deltax[lvl];
+ //real_t dyic = lev_deltay[lvl];
+
+ real_t dxl = lev_deltax[level[nl]];
+ real_t dxr = lev_deltax[level[nr]];
+
+ real_t dyt = lev_deltay[level[nt]];
+ real_t dyb = lev_deltay[level[nb]];
+
+ //real_t drl = dxl;
+ //real_t drr = dxr;
+ //real_t drt = dyt;
+ //real_t drb = dyb;
+
+ real_t dric = dxic;
+
+ int nltl = 0;
+ real_t Hlt = 0.0, Ult = 0.0; // Vlt = 0.0;
+ real_t Hll2 = 0.0;
+ real_t Ull2 = 0.0;
+ if(lvl < level[nl]) {
+ Hlt = H[ ntop[nl] ];
+ Ult = U[ ntop[nl] ];
+ //Vlt = V[ ntop[nl] ];
+
+ nltl = nlft[nlt];
+ Hll2 = H[nltl];
+ Ull2 = U[nltl];
+ }
+
+ int nrtr = 0;
+ real_t Hrt = 0.0, Urt = 0.0; // Vrt = 0.0;
+ real_t Hrr2 = 0.0;
+ real_t Urr2 = 0.0;
+ if(lvl < level[nr]) {
+ Hrt = H[ ntop[nr] ];
+ Urt = U[ ntop[nr] ];
+ //Vrt = V[ ntop[nr] ];
+
+ nrtr = nrht[nrt];
+ Hrr2 = H[nrtr];
+ Urr2 = U[nrtr];
+ }
+
+ int nbrb = 0;
+ real_t Hbr = 0.0, Vbr = 0.0; // Ubr = 0.0
+ real_t Hbb2 = 0.0;
+ real_t Vbb2 = 0.0;
+ if(lvl < level[nb]) {
+ Hbr = H[ nrht[nb] ];
+ //Ubr = U[ nrht[nb] ];
+ Vbr = V[ nrht[nb] ];
+
+ nbrb = nbot[nbr];
+ Hbb2 = H[nbrb];
+ Vbb2 = V[nbrb];
+ }
+
+ int ntrt = 0;
+ real_t Htr = 0.0, Vtr = 0.0; // Utr = 0.0
+ real_t Htt2 = 0.0;
+ real_t Vtt2 = 0.0;
+ if(lvl < level[nt]) {
+ Htr = H[ nrht[nt] ];
+ //Utr = U[ nrht[nt] ];
+ Vtr = V[ nrht[nt] ];
+
+ ntrt = ntop[ntr];
+ Htt2 = H[ntrt];
+ Vtt2 = V[ntrt];
+ }
+
+ ////////////////////////////////////////
+ /// Artificial Viscosity corrections ///
+ ////////////////////////////////////////
+
+ real_t Hxminus = H[ic];
+ real_t Uxminus = 0.0;
+ real_t Vxminus = 0.0;
+ if (mesh->map_xcell2face_left1[ic] >= 0){
+ Hxminus = Hx[mesh->map_xcell2face_left1[ic]];
+ Uxminus = Ux[mesh->map_xcell2face_left1[ic]];
+ Vxminus = Vx[mesh->map_xcell2face_left1[ic]];
+ }
+
+ real_t Hxminus2 = 0.0;
+ if(lvl < level[nl]) Hxminus2 = H[ic];
+ real_t Uxminus2 = 0.0;
+ real_t Vxminus2 = 0.0;
+ if (mesh->map_xcell2face_left2[ic] >= 0) {
+ Hxminus2 = Hx[mesh->map_xcell2face_left2[ic]];
+ Uxminus2 = Ux[mesh->map_xcell2face_left2[ic]];
+ Vxminus2 = Vx[mesh->map_xcell2face_left2[ic]];
+ }
+
+ real_t Hxplus = H[ic];
+ real_t Uxplus = 0.0;
+ real_t Vxplus = 0.0;
+ if (mesh->map_xcell2face_right1[ic] >= 0){
+ Hxplus = Hx[mesh->map_xcell2face_right1[ic]];
+ Uxplus = Ux[mesh->map_xcell2face_right1[ic]];
+ Vxplus = Vx[mesh->map_xcell2face_right1[ic]];
+ }
+
+ real_t Hxplus2 = 0.0;
+ if(lvl < level[nr]) Hxplus2 = H[ic];
+ real_t Uxplus2 = 0.0;
+ real_t Vxplus2 = 0.0;
+ if (mesh->map_xcell2face_right2[ic] >= 0){
+ Hxplus2 = Hx[mesh->map_xcell2face_right2[ic]];
+ Uxplus2 = Ux[mesh->map_xcell2face_right2[ic]];
+ Vxplus2 = Vx[mesh->map_xcell2face_right2[ic]];
+ }
+
+ if(level[nl] < level[nll]) {
+ Hll = (Hll + H[ ntop[nll] ]) * HALF;
+ Ull = (Ull + U[ ntop[nll] ]) * HALF;
+ }
+
+ real_t Hr2 = Hr;
+ real_t Ur2 = Ur;
+ if(lvl < level[nr]) {
+ Hr2 = (Hr2 + Hrt) * HALF;
+ Ur2 = (Ur2 + Urt) * HALF;
+ }
+
+ real_t wminusx_H = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+ Hic-Hl, Hl-Hll, Hr2-Hic);
+
+ wminusx_H *= Hic - Hl;
+
+ if(lvl < level[nl]) {
+ if(level[nlt] < level[nltl])
+ Hll2 = (Hll2 + H[ ntop[nltl] ]) * HALF;
+ wminusx_H = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+ sqrt(g*Hxminus2), Hic-Hlt, Hlt-Hll2, Hr2-Hic) *
+ (Hic - Hlt)) + wminusx_H)*HALF*HALF;
+ }
+
+ if(level[nr] < level[nrr]) {
+ Hrr = (Hrr + H[ ntop[nrr] ]) * HALF;
+ Urr = (Urr + U[ ntop[nrr] ]) * HALF;
+ }
+
+ real_t Hl2 = Hl;
+ real_t Ul2 = Ul;
+ if(lvl < level[nl]) {
+ Hl2 = (Hl2 + Hlt) * HALF;
+ Ul2 = (Ul2 + Ult) * HALF;
+ }
+
+ real_t wplusx_H = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+ Hr-Hic, Hic-Hl2, Hrr-Hr);
+
+ wplusx_H *= Hr - Hic;
+
+ if(lvl < level[nr]) {
+ if(level[nrt] < level[nrtr])
+ Hrr2 = (Hrr2 + H[ ntop[nrtr] ]) * HALF;
+ wplusx_H = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+ sqrt(g*Hxplus2), Hrt-Hic, Hic-Hl2, Hrr2-Hrt) *
+ (Hrt - Hic))+wplusx_H)*HALF*HALF;
+ }
+
+
+ real_t wminusx_U = w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus/Hxminus) + sqrt(g*Hxminus),
+ Uic-Ul, Ul-Ull, Ur2-Uic);
+
+ wminusx_U *= Uic - Ul;
+
+ if(lvl < level[nl]) {
+ if(level[nlt] < level[nltl])
+ Ull2 = (Ull2 + U[ ntop[nltl] ]) * HALF;
+ wminusx_U = ((w_corrector(deltaT, (dric+dxl)*HALF, fabs(Uxminus2/Hxminus2) +
+ sqrt(g*Hxminus2), Uic-Ult, Ult-Ull2, Ur2-Uic) *
+ (Uic - Ult))+wminusx_U)*HALF*HALF;
+ }
+
+
+ real_t wplusx_U = w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus/Hxplus) + sqrt(g*Hxplus),
+ Ur-Uic, Uic-Ul2, Urr-Ur);
+
+ wplusx_U *= Ur - Uic;
+
+ if(lvl < level[nr]) {
+ if(level[nrt] < level[nrtr])
+ Urr2 = (Urr2 + U[ ntop[nrtr] ]) * HALF;
+ wplusx_U = ((w_corrector(deltaT, (dric+dxr)*HALF, fabs(Uxplus2/Hxplus2) +
+ sqrt(g*Hxplus2), Urt-Uic, Uic-Ul2, Urr2-Urt) *
+ (Urt - Uic))+wplusx_U)*HALF*HALF;
+ }
+
+
+ if(level[nb] < level[nbb]) {
+ Hbb = (Hbb + H[ nrht[nbb] ]) * HALF;
+ Vbb = (Vbb + V[ nrht[nbb] ]) * HALF;
+ }
+
+ real_t Ht2 = Ht;
+ real_t Vt2 = Vt;
+ if(lvl < level[nt]) {
+ Ht2 = (Ht2 + Htr) * HALF;
+ Vt2 = (Vt2 + Vtr) * HALF;
+ }
+
+ real_t Hyminus = H[ic];
+ real_t Uyminus = 0.0;
+ real_t Vyminus = 0.0;
+ if (mesh->map_ycell2face_bot1[ic] >= 0){
+ Hyminus = Hy[mesh->map_ycell2face_bot1[ic]];
+ Uyminus = Uy[mesh->map_ycell2face_bot1[ic]];
+ Vyminus = Vy[mesh->map_ycell2face_bot1[ic]];
+ }
+
+ real_t Hyminus2 = 0.0;
+ if(lvl < level[nb]) Hyminus2 = H[ic];
+ real_t Uyminus2 = 0.0;
+ real_t Vyminus2 = 0.0;
+ if (mesh->map_ycell2face_bot2[ic] >= 0){
+ Hyminus2 = Hy[mesh->map_ycell2face_bot2[ic]];
+ Uyminus2 = Uy[mesh->map_ycell2face_bot2[ic]];
+ Vyminus2 = Vy[mesh->map_ycell2face_bot2[ic]];
+ }
+
+ real_t Hyplus = H[ic];
+ real_t Uyplus = 0.0;
+ real_t Vyplus = 0.0;
+ if (mesh->map_ycell2face_top1[ic] >= 0){
+ Hyplus = Hy[mesh->map_ycell2face_top1[ic]];
+ Uyplus = Uy[mesh->map_ycell2face_top1[ic]];
+ Vyplus = Vy[mesh->map_ycell2face_top1[ic]];
+ }
+
+ real_t Hyplus2 = 0.0;
+ if(lvl < level[nt]) Hyplus2 = H[ic];
+ real_t Uyplus2 = 0.0;
+ real_t Vyplus2 = 0.0;
+ if (mesh->map_ycell2face_top2[ic] >= 0){
+ Hyplus2 = Hy[mesh->map_ycell2face_top2[ic]];
+ Uyplus2 = Uy[mesh->map_ycell2face_top2[ic]];
+ Vyplus2 = Vy[mesh->map_ycell2face_top2[ic]];
+ }
+
+ real_t wminusy_H = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+ Hic-Hb, Hb-Hbb, Ht2-Hic);
+
+ wminusy_H *= Hic - Hb;
+
+ if(lvl < level[nb]) {
+ if(level[nbr] < level[nbrb])
+ Hbb2 = (Hbb2 + H[ nrht[nbrb] ]) * HALF;
+ wminusy_H = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+ sqrt(g*Hyminus2), Hic-Hbr, Hbr-Hbb2, Ht2-Hic) *
+ (Hic - Hbr))+wminusy_H)*HALF*HALF;
+ }
+
+
+ if(level[nt] < level[ntt]) {
+ Htt = (Htt + H[ nrht[ntt] ]) * HALF;
+ Vtt = (Vtt + V[ nrht[ntt] ]) * HALF;
+ }
+
+ real_t Hb2 = Hb;
+ real_t Vb2 = Vb;
+ if(lvl < level[nb]) {
+ Hb2 = (Hb2 + Hbr) * HALF;
+ Vb2 = (Vb2 + Vbr) * HALF;
+ }
+
+ real_t wplusy_H = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+ Ht-Hic, Hic-Hb2, Htt-Ht);
+
+ wplusy_H *= Ht - Hic;
+
+ if(lvl < level[nt]) {
+ if(level[ntr] < level[ntrt])
+ Htt2 = (Htt2 + H[ nrht[ntrt] ]) * HALF;
+ wplusy_H = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+ sqrt(g*Hyplus2), Htr-Hic, Hic-Hb2, Htt2-Htr) *
+ (Htr - Hic))+wplusy_H)*HALF*HALF;
+ }
+
+ real_t wminusy_V = w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus/Hyminus) + sqrt(g*Hyminus),
+ Vic-Vb, Vb-Vbb, Vt2-Vic);
+
+ wminusy_V *= Vic - Vb;
+
+ if(lvl < level[nb]) {
+ if(level[nbr] < level[nbrb])
+ Vbb2 = (Vbb2 + V[ nrht[nbrb] ]) * HALF;
+ wminusy_V = ((w_corrector(deltaT, (dric+dyb)*HALF, fabs(Vyminus2/Hyminus2) +
+ sqrt(g*Hyminus2), Vic-Vbr, Vbr-Vbb2, Vt2-Vic) *
+ (Vic - Vbr))+wminusy_V)*HALF*HALF;
+ }
+
+ real_t wplusy_V = w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus/Hyplus) + sqrt(g*Hyplus),
+ Vt-Vic, Vic-Vb2, Vtt-Vt);
+
+ wplusy_V *= Vt - Vic;
+
+ if(lvl < level[nt]) {
+ if(level[ntr] < level[ntrt])
+ Vtt2 = (Vtt2 + V[ nrht[ntrt] ]) * HALF;
+ wplusy_V = ((w_corrector(deltaT, (dric+dyt)*HALF, fabs(Vyplus2/Hyplus2) +
+ sqrt(g*Hyplus2), Vtr-Vic, Vic-Vb2, Vtt2-Vtr) *
+ (Vtr - Vic))+wplusy_V)*HALF*HALF;
+ }
+
+ real_t Hxfluxminus = HNEWXFLUXMINUS;
+ real_t Uxfluxminus = UNEWXFLUXMINUS;
+ real_t Vxfluxminus = UVNEWFLUXMINUS;
+
+ real_t Hxfluxplus = HNEWXFLUXPLUS;
+ real_t Uxfluxplus = UNEWXFLUXPLUS;
+ real_t Vxfluxplus = UVNEWFLUXPLUS;
+
+ real_t Hyfluxminus = HNEWYFLUXMINUS;
+ real_t Uyfluxminus = VUNEWFLUXMINUS;
+ real_t Vyfluxminus = VNEWYFLUXMINUS;
+
+ real_t Hyfluxplus = HNEWYFLUXPLUS;
+ real_t Uyfluxplus = VUNEWFLUXPLUS;
+ real_t Vyfluxplus = VNEWYFLUXPLUS;
+
+ if(lvl < level[nl]) {
+ Hxfluxminus = (Hxfluxminus + HNEWXFLUXMINUS2) * HALF;
+ Uxfluxminus = (Uxfluxminus + UNEWXFLUXMINUS2) * HALF;
+ Vxfluxminus = (Vxfluxminus + UVNEWFLUXMINUS2) * HALF;
+ }
+
+ if(lvl < level[nr]) {
+ Hxfluxplus = (Hxfluxplus + HNEWXFLUXPLUS2) * HALF;
+ Uxfluxplus = (Uxfluxplus + UNEWXFLUXPLUS2) * HALF;
+ Vxfluxplus = (Vxfluxplus + UVNEWFLUXPLUS2) * HALF;
+ }
+
+ if(lvl < level[nb]) {
+ Hyfluxminus = (Hyfluxminus + HNEWYFLUXMINUS2) * HALF;
+ Uyfluxminus = (Uyfluxminus + VUNEWFLUXMINUS2) * HALF;
+ Vyfluxminus = (Vyfluxminus + VNEWYFLUXMINUS2) * HALF;
+ }
+
+ if(lvl < level[nt]) {
+ Hyfluxplus = (Hyfluxplus + HNEWYFLUXPLUS2) * HALF;
+ Uyfluxplus = (Uyfluxplus + VUNEWFLUXPLUS2) * HALF;
+ Vyfluxplus = (Vyfluxplus + VNEWYFLUXPLUS2) * HALF;
+ }
+
+ H_new[ic] = U_fullstep(deltaT, dxic, Hic,
+ Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus)
+ - wminusx_H + wplusx_H - wminusy_H + wplusy_H;
+ U_new[ic] = U_fullstep(deltaT, dxic, Uic,
+ Uxfluxplus, Uxfluxminus, Uyfluxplus, Uyfluxminus)
+ - wminusx_U + wplusx_U;
+ V_new[ic] = U_fullstep(deltaT, dxic, Vic,
+ Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus)
+ - wminusy_V + wplusy_V;
+
+#if DEBUG >= 1
+ if (DEBUG >= 1) {
+ real_t U_tmp = U_new[ic];
+ real_t V_tmp = V_new[ic];
+ if (U_tmp == 0.0) U_tmp = 0.0;
+ if (V_tmp == 0.0) V_tmp = 0.0;
+ printf("DEBUG ic %d H_new %lf U_new %lf V_new %lf\n",ic,H_new[ic],U_tmp,V_tmp);
+ }
+#endif
+
+/*
+ printf("DEBUG ic %d deltaT, %lf dxic, %lf Hic, %lf Hxfluxplus, %lf Hxfluxminus, %lf Hyfluxplus, %lf Hyfluxminus %lf\n",
+ ic, deltaT, dxic, Hic, Hxfluxplus, Hxfluxminus, Hyfluxplus, Hyfluxminus);
+ printf("DEBUG ic %d wminusx_H %lf wplusx_H %lf wminusy_H %lf wplusy_H %lf\n",ic, wminusx_H, wplusx_H, wminusy_H, wplusy_H);
+ printf("DEBUG ic %d deltaT, %lf dxic, %lf Vic, %lf Vxfluxplus, %lf Vxfluxminus, %lf Vyfluxplus, %lf Vyfluxminus %lf\n",
+ ic, deltaT, dxic, Vic, Vxfluxplus, Vxfluxminus, Vyfluxplus, Vyfluxminus);
+ printf("DEBUG ic %d wminusy_V %lf wplusy_V %lf\n",ic, wminusy_V, wplusy_V);
+*/
+ }//end forloop
+
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+ {
+#endif
+ // Replace H with H_new and deallocate H. New memory will have the characteristics
+ // of the new memory and the name of the old. Both return and arg1 will be reset to new memory
+ H = (state_t *)state_memory.memory_replace(H, H_new);
+ U = (state_t *)state_memory.memory_replace(U, U_new);
+ V = (state_t *)state_memory.memory_replace(V, V_new);
+
+ //state_memory.memory_report();
+ //printf("DEBUG end finite diff\n\n");
+#ifdef _OPENMP
+ }
+#pragma omp barrier
+#endif
+
+#ifdef _OPENMP
+#pragma omp master
+#endif
+ cpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += cpu_timer_stop(tstart_cpu);
+}
+
+#ifdef HAVE_OPENCL
+// Sets the nine arguments shared by the apply_boundary_conditions,
+// apply_boundary_conditions_local and apply_boundary_conditions_ghost kernels
+// (cell count, celltype, the four neighbor arrays, then H, U, V) and enqueues
+// the kernel on command_queue without requesting a completion event.
+// Everything is taken by value, so the caller chooses which device buffers the
+// kernel sees at the moment of the call.
+static void enqueue_boundary_conditions_kernel(cl_command_queue command_queue, cl_kernel kernel,
+      cl_int ncells, cl_mem dev_celltype,
+      cl_mem dev_nlft, cl_mem dev_nrht, cl_mem dev_ntop, cl_mem dev_nbot,
+      cl_mem dev_H, cl_mem dev_U, cl_mem dev_V,
+      size_t global_work_size, size_t local_work_size)
+{
+   ezcl_set_kernel_arg(kernel, 0, sizeof(cl_int), &ncells);
+   ezcl_set_kernel_arg(kernel, 1, sizeof(cl_mem), &dev_celltype);
+   ezcl_set_kernel_arg(kernel, 2, sizeof(cl_mem), &dev_nlft);
+   ezcl_set_kernel_arg(kernel, 3, sizeof(cl_mem), &dev_nrht);
+   ezcl_set_kernel_arg(kernel, 4, sizeof(cl_mem), &dev_ntop);
+   ezcl_set_kernel_arg(kernel, 5, sizeof(cl_mem), &dev_nbot);
+   ezcl_set_kernel_arg(kernel, 6, sizeof(cl_mem), &dev_H);
+   ezcl_set_kernel_arg(kernel, 7, sizeof(cl_mem), &dev_U);
+   ezcl_set_kernel_arg(kernel, 8, sizeof(cl_mem), &dev_V);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, NULL);
+}
+
+// Advances the device-resident state (dev_H, dev_U, dev_V) by one time step of
+// size deltaT.
+//
+// Steps:
+//   1. apply boundary conditions on the device (with more than one MPI rank:
+//      local boundaries, state copy, L7_Dev_Update exchange, ghost boundaries),
+//   2. run kernel_calc_finite_difference from dev_H/U/V into freshly allocated
+//      dev_H_new/U_new/V_new and wait for it to finish,
+//   3. swap the new buffers in through gpu_state_memory.memory_replace.
+//
+// The elapsed time is added to gpu_timers[STATE_TIMER_FINITE_DIFFERENCE].
+void State::gpu_calc_finite_difference(double deltaT)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+   size_t &ncells_ghost = mesh->ncells_ghost;
+   // ncells_ghost is a reference, so this also raises the value stored in the mesh
+   if (ncells_ghost < ncells) ncells_ghost = ncells;
+   int    &levmx        = mesh->levmx;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_nlft     = mesh->dev_nlft;
+   cl_mem &dev_nrht     = mesh->dev_nrht;
+   cl_mem &dev_nbot     = mesh->dev_nbot;
+   cl_mem &dev_ntop     = mesh->dev_ntop;
+   cl_mem &dev_level    = mesh->dev_level;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+
+   assert(dev_H);
+   assert(dev_U);
+   assert(dev_V);
+   assert(dev_nlft);
+   assert(dev_nrht);
+   assert(dev_nbot);
+   assert(dev_ntop);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   cl_mem dev_H_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_H_new"), DEVICE_REGULAR_MEMORY);
+   cl_mem dev_U_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_U_new"), DEVICE_REGULAR_MEMORY);
+   cl_mem dev_V_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_V_new"), DEVICE_REGULAR_MEMORY);
+
+   // One work item per cell, rounded up to a whole number of work groups
+   size_t local_work_size  = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+
+   // The kernels declare the cell count as an int. Hand them a real cl_int
+   // instead of the address of the size_t: the first sizeof(cl_int) bytes of a
+   // size_t hold the count only on little-endian hosts.
+   cl_int ncells_arg = (cl_int)ncells;
+
+#ifdef HAVE_MPI
+   if (mesh->numpe > 1) {
+      enqueue_boundary_conditions_kernel(command_queue, kernel_apply_boundary_conditions_local,
+            ncells_arg, dev_celltype, dev_nlft, dev_nrht, dev_ntop, dev_nbot,
+            dev_H, dev_U, dev_V, global_work_size, local_work_size);
+
+      /*
+      __kernel void copy_state_data_cl(
+                       const int  isize,     // 0
+              __global state_t   *H,         // 1
+              __global state_t   *U,         // 2
+              __global state_t   *V,         // 3
+              __global state_t   *H_new,     // 4
+              __global state_t   *U_new,     // 5
+              __global state_t   *V_new)     // 6
+      */
+
+      ezcl_set_kernel_arg(kernel_copy_state_data, 0, sizeof(cl_int), (void *)&ncells_arg);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 2, sizeof(cl_mem), (void *)&dev_U);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 3, sizeof(cl_mem), (void *)&dev_V);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 4, sizeof(cl_mem), (void *)&dev_H_new);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 5, sizeof(cl_mem), (void *)&dev_U_new);
+      ezcl_set_kernel_arg(kernel_copy_state_data, 6, sizeof(cl_mem), (void *)&dev_V_new);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_copy_state_data, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+      // The copies become the current state; they were allocated with
+      // ncells_ghost entries.
+      dev_H = (cl_mem)gpu_state_memory.memory_replace(dev_H, dev_H_new);
+      dev_U = (cl_mem)gpu_state_memory.memory_replace(dev_U, dev_U_new);
+      dev_V = (cl_mem)gpu_state_memory.memory_replace(dev_V, dev_V_new);
+
+      L7_Dev_Update(dev_H, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_U, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_V, L7_STATE_T, mesh->cell_handle);
+
+      // The first set of *_new buffers was consumed by memory_replace above,
+      // so allocate the output buffers for the finite difference kernel again.
+      dev_H_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_H_new"), DEVICE_REGULAR_MEMORY);
+      dev_U_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_U_new"), DEVICE_REGULAR_MEMORY);
+      dev_V_new = (cl_mem)gpu_state_memory.memory_malloc(ncells_ghost, sizeof(cl_state_t), const_cast<char *>("dev_V_new"), DEVICE_REGULAR_MEMORY);
+
+      enqueue_boundary_conditions_kernel(command_queue, kernel_apply_boundary_conditions_ghost,
+            ncells_arg, dev_celltype, dev_nlft, dev_nrht, dev_ntop, dev_nbot,
+            dev_H, dev_U, dev_V, global_work_size, local_work_size);
+   } else {
+      enqueue_boundary_conditions_kernel(command_queue, kernel_apply_boundary_conditions,
+            ncells_arg, dev_celltype, dev_nlft, dev_nrht, dev_ntop, dev_nbot,
+            dev_H, dev_U, dev_V, global_work_size, local_work_size);
+   }
+#else
+   enqueue_boundary_conditions_kernel(command_queue, kernel_apply_boundary_conditions,
+         ncells_arg, dev_celltype, dev_nlft, dev_nrht, dev_ntop, dev_nbot,
+         dev_H, dev_U, dev_V, global_work_size, local_work_size);
+#endif
+
+   /*
+   __kernel void calc_finite_difference_cl(
+                    const int       ncells,    // 0  Total number of cells.
+                    const int       lvmax,     // 1  Maximum level
+           __global       state_t  *H,         // 2
+           __global       state_t  *U,         // 3
+           __global       state_t  *V,         // 4
+           __global       state_t  *H_new,     // 5
+           __global       state_t  *U_new,     // 6
+           __global       state_t  *V_new,     // 7
+           __global const int      *nlft,      // 8  Array of left neighbors.
+           __global const int      *nrht,      // 9  Array of right neighbors.
+           __global const int      *ntop,      // 10 Array of top neighbors.
+           __global const int      *nbot,      // 11 Array of bottom neighbors.
+           __global const int      *level,     // 12 Array of level information.
+                    const real_t    deltaT,    // 13 Size of time step.
+           __global const real_t   *lev_dx,    // 14
+           __global const real_t   *lev_dy,    // 15
+           __local        state4_t *tile,      // 16 Tile size in state4.
+           __local        int8     *itile)     // 17 Tile size in int8.
+   */
+   cl_event calc_finite_difference_event;
+
+   // deltaT arrives as a double; the kernel argument is sized as cl_real_t
+   real_t deltaT_local = deltaT;
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 0, sizeof(cl_int),    (void *)&ncells_arg);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 1, sizeof(cl_int),    (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 2, sizeof(cl_mem),    (void *)&dev_H);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 3, sizeof(cl_mem),    (void *)&dev_U);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 4, sizeof(cl_mem),    (void *)&dev_V);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 5, sizeof(cl_mem),    (void *)&dev_H_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 6, sizeof(cl_mem),    (void *)&dev_U_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 7, sizeof(cl_mem),    (void *)&dev_V_new);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 8, sizeof(cl_mem),    (void *)&dev_nlft);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference, 9, sizeof(cl_mem),    (void *)&dev_nrht);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,10, sizeof(cl_mem),    (void *)&dev_ntop);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,11, sizeof(cl_mem),    (void *)&dev_nbot);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,12, sizeof(cl_mem),    (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,13, sizeof(cl_real_t), (void *)&deltaT_local);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,14, sizeof(cl_mem),    (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,15, sizeof(cl_mem),    (void *)&dev_levdy);
+   // Arguments 16 and 17 are __local scratch tiles: a size with a NULL pointer
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,16, local_work_size*sizeof(cl_state4_t), NULL);
+   ezcl_set_kernel_arg(kernel_calc_finite_difference,17, local_work_size*sizeof(cl_int8),     NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_calc_finite_difference, 1, NULL, &global_work_size, &local_work_size, &calc_finite_difference_event);
+
+   // Block until the kernel has finished so the timer below covers its run
+   ezcl_wait_for_events(1, &calc_finite_difference_event);
+   ezcl_event_release(calc_finite_difference_event);
+
+   dev_H = (cl_mem)gpu_state_memory.memory_replace(dev_H, dev_H_new);
+   dev_U = (cl_mem)gpu_state_memory.memory_replace(dev_U, dev_U_new);
+   dev_V = (cl_mem)gpu_state_memory.memory_replace(dev_V, dev_V_new);
+
+   gpu_timers[STATE_TIMER_FINITE_DIFFERENCE] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+}
+#endif
+
+// Compares the state of every cell ic in [0, mesh->ncells) with the state of
+// its partner cell sym_index[ic] and prints one line per field (H, U, V) whose
+// difference exceeds eps.
+//
+//   string     label printed at the start of every report line
+//   sym_index  partner cell index for each cell; must have at least ncells
+//              entries (not checked here)
+//   eps        largest absolute difference that still counts as symmetric
+//   sign_rule  DIAG_RULE and X_RULE compare U against the negated partner
+//              value; DIAG_RULE and Y_RULE do the same for V
+//   flag       incremented once for every reported mismatch
+//
+// sym_index is still taken by value because the matching declaration is not
+// part of this block.
+void State::symmetry_check(const char *string, vector<int> sym_index, double eps,
+                           SIGN_RULE sign_rule, int &flag)
+{
+   size_t &ncells = mesh->ncells;
+
+   const double xsign = (sign_rule == DIAG_RULE || sign_rule == X_RULE) ? -1.0 : 1.0;
+   const double ysign = (sign_rule == DIAG_RULE || sign_rule == Y_RULE) ? -1.0 : 1.0;
+
+   for (uint ic=0; ic<ncells; ic++) {
+      const int isym = sym_index[ic];
+
+      // Each difference is evaluated once and used for both the test and the report.
+      // ic is unsigned, so it is printed with %u rather than %d.
+      const double Hdiff = fabs(H[ic] - H[isym]);
+      if (Hdiff > eps) {
+         printf("%s ic %u sym %d H[ic] %lf Hsym %lf diff %lf\n",
+                string,ic,isym,H[ic],H[isym],Hdiff);
+         flag++;
+      }
+
+      const double Udiff = fabs(U[ic] - xsign*U[isym]);
+      if (Udiff > eps) {
+         printf("%s ic %u sym %d U[ic] %lf Usym %lf diff %lf\n",
+                string,ic,isym,U[ic],U[isym],Udiff);
+         flag++;
+      }
+
+      const double Vdiff = fabs(V[ic] - ysign*V[isym]);
+      if (Vdiff > eps) {
+         printf("%s ic %u sym %d V[ic] %lf Vsym %lf diff %lf\n",
+                string,ic,isym,V[ic],V[isym],Vdiff);
+         flag++;
+      }
+   }
+
+}
+
+// Fills mpot with a refinement decision for every real cell in this thread's
+// range and returns the count produced by mesh->refine_smooth.
+//
+// For each cell the relative jump in H towards its left/right and bottom/top
+// neighbors is measured. Where a neighbor sits on a finer level, its H is
+// averaged with the adjacent fine cell (ntop of a left/right neighbor, nrht of
+// a bottom/top neighbor). mpot[ic] becomes
+//    1  if the largest jump exceeds REFINE_GRADIENT and the cell is below mesh->levmx,
+//   -1  if it is below COARSEN_GRADIENT and the cell is above level 0,
+//    0  otherwise.
+// Cells whose celltype is not REAL_CELL are left untouched.
+//
+// icount and jcount are reset to zero and then handed to mesh->refine_smooth.
+// Time is accumulated in cpu_timers[STATE_TIMER_REFINE_POTENTIAL] and, when
+// TIMING_LEVEL >= 2, in cpu_timers[STATE_TIMER_CALC_MPOT].
+size_t State::calc_refine_potential(vector<int> &mpot,int &icount, int &jcount)
+{
+   struct timeval tstart_cpu;
+#ifdef _OPENMP
+#pragma omp parallel
+{
+#endif
+
+   struct timeval tstart_lev2;
+
+   // Only the master thread starts the timers and clears the shared counters;
+   // the barrier makes the cleared counters visible before anyone proceeds.
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   cpu_timer_start(&tstart_cpu);
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+   icount = 0;
+   jcount = 0;
+#ifdef _OPENMP
+   }
+#pragma omp barrier
+#endif
+
+   int *nlft  = mesh->nlft;
+   int *nrht  = mesh->nrht;
+   int *nbot  = mesh->nbot;
+   int *ntop  = mesh->ntop;
+   int *level = mesh->level;
+
+#ifdef HAVE_MPI
+   // The finite difference routine changed the state variables, so refresh the
+   // boundary and ghost regions before basing refinement decisions on them.
+   if (mesh->numpe > 1) {
+      apply_boundary_conditions_local();
+#ifdef _OPENMP
+#pragma omp barrier
+#pragma omp master
+      {
+#endif
+      L7_Update(&H[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&U[0], L7_STATE_T, mesh->cell_handle);
+      L7_Update(&V[0], L7_STATE_T, mesh->cell_handle);
+#ifdef _OPENMP
+      }
+#pragma omp barrier
+#endif
+      apply_boundary_conditions_ghost();
+   } else {
+      apply_boundary_conditions();
+   }
+#else
+   apply_boundary_conditions();
+#endif
+
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
+
+   // Each thread handles the [first, last) slice handed out by the mesh
+   int first, last;
+   mesh->get_bounds(first, last);
+   for (int ic = first; ic < last; ic++) {
+
+      if (mesh->celltype[ic] != REAL_CELL) continue;
+
+      const int lev = level[ic];
+      state_t Hic = H[ic];
+
+      // Left neighbor
+      int nl = nlft[ic];
+      state_t Hl = H[nl];
+      if (level[nl] > lev) {
+         Hl = REFINE_HALF * (Hl + H[ntop[nl]]);
+      }
+
+      // Right neighbor
+      int nr = nrht[ic];
+      state_t Hr = H[nr];
+      if (level[nr] > lev) {
+         Hr = REFINE_HALF * (Hr + H[ntop[nr]]);
+      }
+
+      // Bottom neighbor
+      int nb = nbot[ic];
+      state_t Hb = H[nb];
+      if (level[nb] > lev) {
+         Hb = REFINE_HALF * (Hb + H[nrht[nb]]);
+      }
+
+      // Top neighbor
+      int nt = ntop[ic];
+      state_t Ht = H[nt];
+      if (level[nt] > lev) {
+         Ht = REFINE_HALF * (Ht + H[nrht[nt]]);
+      }
+
+      // One-sided differences in H around the cell
+      state_t dH_right = Hr - Hic;
+      state_t dH_left  = Hic - Hl;
+      state_t dH_top   = Ht - Hic;
+      state_t dH_bot   = Hic - Hb;
+
+      state_t qmax = REFINE_NEG_THOUSAND;
+      state_t qpot;
+
+      // Each pair is evaluated in both argument orders, exactly as before.
+      // NOTE(review): the two orders can only differ if max() is asymmetric
+      // for NaN operands -- confirm before collapsing them into one call.
+      qpot = max(fabs(dH_right/Hic), fabs(dH_left/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      qpot = max(fabs(dH_left/Hic), fabs(dH_right/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      qpot = max(fabs(dH_top/Hic), fabs(dH_bot/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      qpot = max(fabs(dH_bot/Hic), fabs(dH_top/Hic));
+      if (qpot > qmax) qmax = qpot;
+
+      int potential = 0;
+      if (qmax > REFINE_GRADIENT && lev < mesh->levmx) {
+         potential = 1;
+      } else if (qmax < COARSEN_GRADIENT && lev > 0) {
+         potential = -1;
+      }
+      mpot[ic] = potential;
+   }
+
+#ifdef _OPENMP
+#pragma omp master
+   {
+#endif
+   if (TIMING_LEVEL >= 2) {
+      cpu_timers[STATE_TIMER_CALC_MPOT] += cpu_timer_stop(tstart_lev2);
+   }
+#ifdef _OPENMP
+   }
+#endif
+
+#ifdef _OPENMP
+}
+#pragma omp barrier
+#endif
+   // Smoothing and the overall timer run after the parallel region has closed
+   int newcount = mesh->refine_smooth(mpot, icount, jcount);
+
+   cpu_timers[STATE_TIMER_REFINE_POTENTIAL] += cpu_timer_stop(tstart_cpu);
+
+   return(newcount);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * GPU version of the refinement-potential calculation.
+ *
+ * Steps, all issued on the ezcl command queue:
+ *   1. Enqueue the boundary-condition kernel(s) on dev_H/dev_U/dev_V.  When
+ *      built with HAVE_MPI and mesh->numpe > 1 this is the "local" kernel,
+ *      three L7_Dev_Update calls, then the "ghost" kernel.
+ *   2. With BOUNDS_CHECK defined, read the neighbor, level and H arrays back
+ *      to the host and print a warning for every out-of-range neighbor
+ *      index and every H value below 1.0.
+ *   3. Allocate dev_mpot plus two scratch buffers, enqueue
+ *      kernel_refine_potential, call mesh->gpu_rezone_count2() and read the
+ *      two resulting ints back into icount and jcount.
+ *   4. Pass dev_mpot, icount and jcount to mesh->gpu_refine_smooth().
+ *
+ * icount, jcount: outputs; zeroed on entry and later set from dev_result.
+ * Returns the value of mesh->gpu_refine_smooth() cast to size_t.
+ *
+ * dev_mpot is allocated here and is not released by this function.
+ */
+size_t State::gpu_calc_refine_potential(int &icount, int &jcount)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   struct timeval tstart_lev2;
+   if (TIMING_LEVEL >= 2) cpu_timer_start(&tstart_lev2);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+   int &levmx           = mesh->levmx;
+   cl_mem &dev_nlft     = mesh->dev_nlft;
+   cl_mem &dev_nrht     = mesh->dev_nrht;
+   cl_mem &dev_nbot     = mesh->dev_nbot;
+   cl_mem &dev_ntop     = mesh->dev_ntop;
+   cl_mem &dev_i        = mesh->dev_i;
+   cl_mem &dev_j        = mesh->dev_j;
+   cl_mem &dev_level    = mesh->dev_level;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+
+   assert(dev_H);
+   assert(dev_U);
+   assert(dev_V);
+   assert(dev_nlft);
+   assert(dev_nrht);
+   assert(dev_nbot);
+   assert(dev_ntop);
+   assert(dev_i);
+   assert(dev_j);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+
+   icount = 0;
+   jcount = 0;
+
+   // Global size is ncells rounded up to a whole number of work groups.
+   size_t local_work_size  = 128;
+   size_t global_work_size = ((ncells+local_work_size - 1) /local_work_size) * local_work_size;
+   size_t block_size       = global_work_size/local_work_size;
+
+   // NOTE(review): ncells is a size_t reference but is handed to the kernels
+   // below with sizeof(cl_int), i.e. only its first four bytes are passed.
+   // That matches the low-order word only on little-endian hosts -- confirm
+   // whether a cl_int temporary should be used instead.
+
+#ifdef HAVE_MPI
+   if (mesh->numpe > 1) {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_local, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_local, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+      // Ghost update of the state arrays between the local and ghost kernels.
+      L7_Dev_Update(dev_H, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_U, L7_STATE_T, mesh->cell_handle);
+      L7_Dev_Update(dev_V, L7_STATE_T, mesh->cell_handle);
+
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions_ghost, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions_ghost, 1, NULL, &global_work_size, &local_work_size, NULL);
+   } else {
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+      ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions, 1, NULL, &global_work_size, &local_work_size, NULL);
+   }
+#else
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 0, sizeof(cl_int), &ncells);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 1, sizeof(cl_mem), &dev_celltype);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 2, sizeof(cl_mem), &dev_nlft);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 3, sizeof(cl_mem), &dev_nrht);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 4, sizeof(cl_mem), &dev_ntop);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 5, sizeof(cl_mem), &dev_nbot);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 6, sizeof(cl_mem), &dev_H);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 7, sizeof(cl_mem), &dev_U);
+   ezcl_set_kernel_arg(kernel_apply_boundary_conditions, 8, sizeof(cl_mem), &dev_V);
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_apply_boundary_conditions, 1, NULL, &global_work_size, &local_work_size, NULL);
+#endif
+
+#ifdef BOUNDS_CHECK
+   {
+      vector<int> nlft_tmp(mesh->ncells_ghost);
+      vector<int> nrht_tmp(mesh->ncells_ghost);
+      vector<int> nbot_tmp(mesh->ncells_ghost);
+      vector<int> ntop_tmp(mesh->ncells_ghost);
+      vector<int> level_tmp(mesh->ncells_ghost);
+      vector<state_t> H_tmp(mesh->ncells_ghost);
+      ezcl_enqueue_read_buffer(command_queue, dev_nlft,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nlft_tmp[0],  NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_nrht,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nrht_tmp[0],  NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_nbot,  CL_FALSE, 0, mesh->ncells_ghost*sizeof(cl_int), &nbot_tmp[0],  NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_ntop,  CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_int), &ntop_tmp[0],  NULL);
+      ezcl_enqueue_read_buffer(command_queue, dev_level, CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_int), &level_tmp[0], NULL);
+      // H holds state_t elements, so the byte count must use the state element
+      // size (as every other dev_H transfer in this file does), not cl_int.
+      ezcl_enqueue_read_buffer(command_queue, dev_H,     CL_TRUE,  0, mesh->ncells_ghost*sizeof(cl_state_t), &H_tmp[0], NULL);
+
+      // An out-of-range neighbor index is only reported; it must not be used
+      // to index the host copies, hence the else-if on each follow-up check.
+      for (uint ic=0; ic<ncells; ic++){
+         int nl = nlft_tmp[ic];
+         if (nl<0 || nl>= (int)mesh->ncells_ghost) {
+            printf("%d: Warning at line %d cell %d nlft %d\n",mesh->mype,__LINE__,ic,nl);
+         } else if (level_tmp[nl] > level_tmp[ic]){
+            int ntl = ntop_tmp[nl];
+            if (ntl<0 || ntl>= (int)mesh->ncells_ghost) printf("%d: Warning at line %d cell %d global %d nlft %d ntop of nlft %d\n",mesh->mype,__LINE__,ic,ic+mesh->noffset,nl,ntl);
+         }
+         int nr = nrht_tmp[ic];
+         if (nr<0 || nr>= (int)mesh->ncells_ghost) {
+            printf("%d: Warning at line %d cell %d nrht %d\n",mesh->mype,__LINE__,ic,nr);
+         } else if (level_tmp[nr] > level_tmp[ic]){
+            int ntr = ntop_tmp[nr];
+            if (ntr<0 || ntr>= (int)mesh->ncells_ghost) printf("%d: Warning at line %d cell %d ntop of nrht %d\n",mesh->mype,__LINE__,ic,ntr);
+         }
+         int nb = nbot_tmp[ic];
+         if (nb<0 || nb>= (int)mesh->ncells_ghost) {
+            printf("%d: Warning at line %d cell %d nbot %d\n",mesh->mype,__LINE__,ic,nb);
+         } else if (level_tmp[nb] > level_tmp[ic]){
+            int nrb = nrht_tmp[nb];
+            if (nrb<0 || nrb>= (int)mesh->ncells_ghost) printf("%d: Warning at line %d cell %d nrht of nbot %d\n",mesh->mype,__LINE__,ic,nrb);
+         }
+         int nt = ntop_tmp[ic];
+         if (nt<0 || nt>= (int)mesh->ncells_ghost) {
+            printf("%d: Warning at line %d cell %d ntop %d\n",mesh->mype,__LINE__,ic,nt);
+         } else if (level_tmp[nt] > level_tmp[ic]){
+            int nrt = nrht_tmp[nt];
+            if (nrt<0 || nrt>= (int)mesh->ncells_ghost) printf("%d: Warning at line %d cell %d nrht of ntop %d\n",mesh->mype,__LINE__,ic,nrt);
+         }
+      }
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++){
+         if (H_tmp[ic] < 1.0) printf("%d: Warning at line %d cell %d H %lf\n",mesh->mype,__LINE__,ic,H_tmp[ic]);
+      }
+   }
+#endif
+
+   size_t result_size = 1;
+   cl_mem dev_result     = ezcl_malloc(NULL, const_cast<char *>("dev_result"), &result_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+   cl_mem dev_redscratch = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &block_size, sizeof(cl_int2), CL_MEM_READ_WRITE, 0);
+
+   dev_mpot = ezcl_malloc(NULL, const_cast<char *>("dev_mpot"), &mesh->ncells_ghost, sizeof(cl_int), CL_MEM_READ_WRITE, 0);
+
+   /*
+    * Arguments bound to kernel_refine_potential below, by index:
+    *    0 ncells        1 levmx         2 dev_H       3 dev_U
+    *    4 dev_V         5 dev_nlft      6 dev_nrht    7 dev_ntop
+    *    8 dev_nbot      9 dev_i        10 dev_j      11 dev_level
+    *   12 dev_celltype 13 dev_levdx    14 dev_levdy  15 dev_mpot
+    *   16 dev_redscratch               17 dev_result
+    *   18 size-only (NULL pointer) argument, local_work_size*sizeof(cl_state_t)
+    *   19 size-only (NULL pointer) argument, local_work_size*sizeof(cl_int8)
+    * NOTE(review): the kernel source is not visible from this file -- confirm
+    * this order against the kernel's actual signature.
+    */
+   ezcl_set_kernel_arg(kernel_refine_potential, 0, sizeof(cl_int), (void *)&ncells);
+   ezcl_set_kernel_arg(kernel_refine_potential, 1, sizeof(cl_int), (void *)&levmx);
+   ezcl_set_kernel_arg(kernel_refine_potential, 2, sizeof(cl_mem), (void *)&dev_H);
+   ezcl_set_kernel_arg(kernel_refine_potential, 3, sizeof(cl_mem), (void *)&dev_U);
+   ezcl_set_kernel_arg(kernel_refine_potential, 4, sizeof(cl_mem), (void *)&dev_V);
+   ezcl_set_kernel_arg(kernel_refine_potential, 5, sizeof(cl_mem), (void *)&dev_nlft);
+   ezcl_set_kernel_arg(kernel_refine_potential, 6, sizeof(cl_mem), (void *)&dev_nrht);
+   ezcl_set_kernel_arg(kernel_refine_potential, 7, sizeof(cl_mem), (void *)&dev_ntop);
+   ezcl_set_kernel_arg(kernel_refine_potential, 8, sizeof(cl_mem), (void *)&dev_nbot);
+   ezcl_set_kernel_arg(kernel_refine_potential, 9, sizeof(cl_mem), (void *)&dev_i);
+   ezcl_set_kernel_arg(kernel_refine_potential,10, sizeof(cl_mem), (void *)&dev_j);
+   ezcl_set_kernel_arg(kernel_refine_potential,11, sizeof(cl_mem), (void *)&dev_level);
+   ezcl_set_kernel_arg(kernel_refine_potential,12, sizeof(cl_mem), (void *)&dev_celltype);
+   ezcl_set_kernel_arg(kernel_refine_potential,13, sizeof(cl_mem), (void *)&dev_levdx);
+   ezcl_set_kernel_arg(kernel_refine_potential,14, sizeof(cl_mem), (void *)&dev_levdy);
+   ezcl_set_kernel_arg(kernel_refine_potential,15, sizeof(cl_mem), (void *)&dev_mpot);
+   ezcl_set_kernel_arg(kernel_refine_potential,16, sizeof(cl_mem), (void *)&dev_redscratch);
+   ezcl_set_kernel_arg(kernel_refine_potential,17, sizeof(cl_mem), (void *)&dev_result);
+   ezcl_set_kernel_arg(kernel_refine_potential,18, local_work_size*sizeof(cl_state_t), NULL);
+   ezcl_set_kernel_arg(kernel_refine_potential,19, local_work_size*sizeof(cl_int8), NULL);
+
+   ezcl_enqueue_ndrange_kernel(command_queue, kernel_refine_potential, 1, NULL, &global_work_size, &local_work_size, NULL);
+
+   mesh->gpu_rezone_count2(block_size, local_work_size, dev_redscratch, dev_result);
+
+   // Blocking read of the two counts held in dev_result.
+   int count[2] = {0, 0};
+   ezcl_enqueue_read_buffer(command_queue, dev_result, CL_TRUE, 0, sizeof(cl_int2), count, NULL);
+   icount = count[0];
+   jcount = count[1];
+
+   ezcl_device_memory_delete(dev_redscratch);
+   ezcl_device_memory_delete(dev_result);
+
+   if (TIMING_LEVEL >= 2) {
+      gpu_timers[STATE_TIMER_CALC_MPOT] += (long)(cpu_timer_stop(tstart_lev2)*1.0e9);
+   }
+
+   int my_result = mesh->gpu_refine_smooth(dev_mpot, icount, jcount);
+
+   gpu_timers[STATE_TIMER_REFINE_POTENTIAL] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+   return((size_t)my_result);
+}
+#endif
+
+/*
+ * Sums H * lev_deltax * lev_deltay over every cell whose celltype is
+ * REAL_CELL and returns the total.
+ *
+ * enhanced_precision_sum selects the accumulation scheme:
+ *   SUM_KAHAN   - running sum with a carried correction term
+ *   SUM_REGULAR - plain double accumulation
+ * Any other value leaves the result at 0.0.
+ *
+ * When built with HAVE_MPI and mesh->parallel is set, the local result is
+ * combined across MPI_COMM_WORLD with MPI_Allreduce.  The time spent is
+ * added to cpu_timers[STATE_TIMER_MASS_SUM].
+ */
+double State::mass_sum(int enhanced_precision_sum)
+{
+   size_t &ncells = mesh->ncells;
+   int *celltype  = mesh->celltype;
+   int *level     = mesh->level;
+
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   double total_sum = 0.0;
+
+   if (enhanced_precision_sum == SUM_KAHAN) {
+      struct esum_type local;
+      local.sum        = 0.0;
+      local.correction = 0.0;
+
+      for (int ic = 0; ic < (int)ncells; ic++) {
+         // Boundary cells do not contribute.
+         if (celltype[ic] != REAL_CELL) continue;
+
+         // The order of these four operations is significant; do not reassociate.
+         double corrected_next_term = H[ic]*mesh->lev_deltax[level[ic]]*mesh->lev_deltay[level[ic]] + local.correction;
+         double new_sum             = local.sum + local.correction;
+         local.correction           = corrected_next_term - (new_sum - local.sum);
+         local.sum                  = new_sum;
+      }
+
+      // Serial answer; replaced below by the reduced value for parallel runs.
+      total_sum = local.sum + local.correction;
+#ifdef HAVE_MPI
+      if (mesh->parallel) {
+         struct esum_type global;
+         MPI_Allreduce(&local, &global, 1, MPI_TWO_DOUBLES, KNUTH_SUM, MPI_COMM_WORLD);
+         total_sum = global.sum + global.correction;
+      }
+#endif
+
+   } else if (enhanced_precision_sum == SUM_REGULAR) {
+      double summer = 0.0;
+      for (uint ic = 0; ic < ncells; ic++) {
+         if (celltype[ic] != REAL_CELL) continue;
+         summer += H[ic]*mesh->lev_deltax[level[ic]]*mesh->lev_deltay[level[ic]];
+      }
+
+      total_sum = summer;
+#ifdef HAVE_MPI
+      if (mesh->parallel) {
+         MPI_Allreduce(&summer, &total_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      }
+#endif
+   }
+
+   cpu_timers[STATE_TIMER_MASS_SUM] += cpu_timer_stop(tstart_cpu);
+
+   return(total_sum);
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * GPU counterpart of mass_sum(): runs a two-stage reduction over dev_H and
+ * returns the total as a double.
+ *
+ * A non-zero enhanced_precision_sum selects the "epsum" kernels, which work
+ * on cl_real2_t values that are read back as a sum/correction pair;
+ * zero selects the plain kernels working on cl_real_t.  The second-stage
+ * kernel is only enqueued when more than one work group was launched.
+ * With HAVE_MPI the per-rank result is combined with MPI_Allreduce.
+ * Elapsed time is added to gpu_timers[STATE_TIMER_MASS_SUM].
+ */
+double State::gpu_mass_sum(int enhanced_precision_sum)
+{
+   struct timeval tstart_cpu;
+   cpu_timer_start(&tstart_cpu);
+
+   cl_command_queue command_queue = ezcl_get_command_queue();
+
+   size_t &ncells       = mesh->ncells;
+   cl_mem &dev_levdx    = mesh->dev_levdx;
+   cl_mem &dev_levdy    = mesh->dev_levdy;
+   cl_mem &dev_celltype = mesh->dev_celltype;
+   cl_mem &dev_level    = mesh->dev_level;
+
+   assert(dev_H);
+   assert(dev_level);
+   assert(dev_levdx);
+   assert(dev_levdy);
+   assert(dev_celltype);
+
+   size_t single_elem = 1;
+   cl_mem dev_total, dev_partials;
+   double grand_total;
+
+   // Launch geometry: ncells rounded up to whole work groups of 128.
+   size_t wg_size     = 128;
+   size_t padded_size = ((ncells+wg_size - 1) /wg_size) * wg_size;
+   size_t nblocks     = padded_size/wg_size;
+
+   if (enhanced_precision_sum) {
+      dev_total    = ezcl_malloc(NULL, const_cast<char *>("dev_mass_sum"), &single_elem, sizeof(cl_real2_t), CL_MEM_READ_WRITE, 0);
+      dev_partials = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &nblocks, sizeof(cl_real2_t), CL_MEM_READ_WRITE, 0);
+
+      // Stage 1 arguments: 0 ncells, 1 H, 2 level, 3 levdx, 4 levdy,
+      // 5 celltype, 6 result buffer, 7 per-block scratch, 8 size-only local arg.
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 0, sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 2, sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 3, sizeof(cl_mem), (void *)&dev_levdx);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 4, sizeof(cl_mem), (void *)&dev_levdy);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 5, sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 6, sizeof(cl_mem), (void *)&dev_total);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 7, sizeof(cl_mem), (void *)&dev_partials);
+      ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage1of2, 8, wg_size*sizeof(cl_real2_t), NULL);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_epsum_mass_stage1of2, 1, NULL, &padded_size, &wg_size, NULL);
+
+      if (nblocks > 1) {
+         // Stage 2 arguments: 0 block count, 1 result buffer,
+         // 2 per-block scratch, 3 size-only local arg.  One work group only.
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 0, sizeof(cl_int), (void *)&nblocks);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 1, sizeof(cl_mem), (void *)&dev_total);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 2, sizeof(cl_mem), (void *)&dev_partials);
+         ezcl_set_kernel_arg(kernel_reduce_epsum_mass_stage2of2, 3, wg_size*sizeof(cl_real2_t), NULL);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_epsum_mass_stage2of2, 1, NULL, &wg_size, &wg_size, NULL);
+      }
+
+      struct esum_type rank_part, all_ranks;
+      real2_t host_pair;
+
+      // Blocking read of the sum/correction pair.
+      ezcl_enqueue_read_buffer(command_queue, dev_total, CL_TRUE, 0, 1*sizeof(cl_real2_t), &host_pair, NULL);
+
+      rank_part.sum        = host_pair.s0;
+      rank_part.correction = host_pair.s1;
+      all_ranks.sum        = rank_part.sum;
+      all_ranks.correction = rank_part.correction;
+#ifdef HAVE_MPI
+      MPI_Allreduce(&rank_part, &all_ranks, 1, MPI_TWO_DOUBLES, KNUTH_SUM, MPI_COMM_WORLD);
+#endif
+      grand_total = all_ranks.sum + all_ranks.correction;
+   } else {
+      dev_total    = ezcl_malloc(NULL, const_cast<char *>("dev_mass_sum"), &single_elem, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+      dev_partials = ezcl_malloc(NULL, const_cast<char *>("dev_redscratch"), &nblocks, sizeof(cl_real_t), CL_MEM_READ_WRITE, 0);
+
+      // Same argument layout as the enhanced-precision path, with
+      // cl_real_t-sized elements instead of cl_real2_t.
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 0, sizeof(cl_int), (void *)&ncells);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 1, sizeof(cl_mem), (void *)&dev_H);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 2, sizeof(cl_mem), (void *)&dev_level);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 3, sizeof(cl_mem), (void *)&dev_levdx);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 4, sizeof(cl_mem), (void *)&dev_levdy);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 5, sizeof(cl_mem), (void *)&dev_celltype);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 6, sizeof(cl_mem), (void *)&dev_total);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 7, sizeof(cl_mem), (void *)&dev_partials);
+      ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage1of2, 8, wg_size*sizeof(cl_real_t), NULL);
+
+      ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_sum_mass_stage1of2, 1, NULL, &padded_size, &wg_size, NULL);
+
+      if (nblocks > 1) {
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 0, sizeof(cl_int), (void *)&nblocks);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 1, sizeof(cl_mem), (void *)&dev_total);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 2, sizeof(cl_mem), (void *)&dev_partials);
+         ezcl_set_kernel_arg(kernel_reduce_sum_mass_stage2of2, 3, wg_size*sizeof(cl_real_t), NULL);
+
+         ezcl_enqueue_ndrange_kernel(command_queue, kernel_reduce_sum_mass_stage2of2, 1, NULL, &wg_size, &wg_size, NULL);
+      }
+
+      double rank_sum, all_ranks_sum;
+      real_t host_value;
+
+      // Blocking read of the single reduced value.
+      ezcl_enqueue_read_buffer(command_queue, dev_total, CL_TRUE, 0, 1*sizeof(cl_real_t), &host_value, NULL);
+
+      rank_sum      = host_value;
+      all_ranks_sum = rank_sum;
+#ifdef HAVE_MPI
+      MPI_Allreduce(&rank_sum, &all_ranks_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#endif
+      grand_total = all_ranks_sum;
+   }
+
+   ezcl_device_memory_delete(dev_partials);
+   ezcl_device_memory_delete(dev_total);
+
+   gpu_timers[STATE_TIMER_MASS_SUM] += (long)(cpu_timer_stop(tstart_cpu)*1.0e9);
+
+   return(grand_total);
+}
+#endif
+
+#ifdef HAVE_OPENCL
+/*
+ * Requests three device buffers of ncells cl_state_t elements each from
+ * gpu_state_memory, registered under the names "dev_H", "dev_U" and "dev_V",
+ * and stores the handles in the matching members.
+ */
+void State::allocate_device_memory(size_t ncells)
+{
+   dev_H = (cl_mem)gpu_state_memory.memory_malloc(
+              ncells, sizeof(cl_state_t), const_cast<char *>("dev_H"), DEVICE_REGULAR_MEMORY);
+   dev_U = (cl_mem)gpu_state_memory.memory_malloc(
+              ncells, sizeof(cl_state_t), const_cast<char *>("dev_U"), DEVICE_REGULAR_MEMORY);
+   dev_V = (cl_mem)gpu_state_memory.memory_malloc(
+              ncells, sizeof(cl_state_t), const_cast<char *>("dev_V"), DEVICE_REGULAR_MEMORY);
+}
+#endif
+
+/*
+ * Discards the current dev_H/dev_U/dev_V buffers and allocates fresh ones
+ * sized for ncells elements.  The old contents are not copied.
+ * Without HAVE_OPENCL this is a no-op.
+ */
+void State::resize_old_device_memory(size_t ncells)
+{
+#ifdef HAVE_OPENCL
+   gpu_state_memory.memory_delete(dev_H);
+   gpu_state_memory.memory_delete(dev_U);
+   gpu_state_memory.memory_delete(dev_V);
+   // Same three allocations as the initial setup, so share the code.
+   allocate_device_memory(ncells);
+#else
+   // Parameter is unused in this configuration; silence the warning without
+   // a dead printf whose %ld did not match size_t.
+   (void)ncells;
+#endif
+}
+
+#ifdef HAVE_MPI
+// Forwards the load-balance request to the mesh, handing it this object's
+// state_memory, and then calls memory_reset_ptrs().
+// numcells is passed through by reference to mesh->do_load_balance_local().
+// NOTE(review): presumably the reset re-fetches the H/U/V pointers because the
+// balance step may have moved the state arrays -- confirm in memory_reset_ptrs().
+void State::do_load_balance_local(size_t &numcells){
+ mesh->do_load_balance_local(numcells, NULL, state_memory);
+ memory_reset_ptrs();
+}
+#endif
+#ifdef HAVE_OPENCL
+#ifdef HAVE_MPI
+// GPU load balance: delegates to the mesh, passing gpu_state_memory.  When the
+// mesh call returns non-zero, the dev_H/dev_U/dev_V handles are looked up
+// again by name from gpu_state_memory; otherwise they are left untouched.
+void State::gpu_do_load_balance_local(size_t &numcells){
+   if (! mesh->gpu_do_load_balance_local(numcells, NULL, gpu_state_memory)) {
+      return;
+   }
+
+   dev_H = (cl_mem)gpu_state_memory.get_memory_ptr("dev_H");
+   dev_U = (cl_mem)gpu_state_memory.get_memory_ptr("dev_U");
+   dev_V = (cl_mem)gpu_state_memory.get_memory_ptr("dev_V");
+}
+#endif
+#endif
+
+// CPU elapsed time recorded by State::output_timing_info() whenever it is
+// called for a non-parallel CPU run; used there as the numerator of the
+// reported speed-up ratio.  Stays 0.0 until such a call has happened.
+static double reference_time = 0.0;
+
+/*
+ * Collects the accumulated CPU and/or GPU timers into three figures per
+ * device (compute time, elapsed time, mesh-operation time) and prints one
+ * timer block per requested device via output_timer_block().
+ *
+ * do_cpu_calc / do_gpu_calc: non-zero to report that device.
+ * total_elapsed_time:        passed through unchanged to output_timer_block().
+ *
+ * A non-parallel CPU call also stores its elapsed time in reference_time.
+ * The speed-up ratio handed to output_timer_block() is reference_time divided
+ * by the GPU elapsed time when do_gpu_calc is set, otherwise by the CPU
+ * elapsed time for a parallel CPU run, and 0.0 when no reference exists.
+ */
+void State::output_timing_info(int do_cpu_calc, int do_gpu_calc, double total_elapsed_time)
+{
+   int is_parallel = mesh->parallel;
+
+   double cpu_time_compute = 0.0;
+   double cpu_elapsed_time = 0.0;
+   double cpu_mesh_time    = 0.0;
+
+   double gpu_time_compute = 0.0;
+   double gpu_elapsed_time = 0.0;
+   double gpu_mesh_time    = 0.0;
+
+   if (do_cpu_calc) {
+      cpu_mesh_time    = mesh->get_cpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                         get_cpu_timer(STATE_TIMER_REZONE_ALL) +
+                         mesh->get_cpu_timer(MESH_TIMER_REFINE_SMOOTH) +
+                         mesh->get_cpu_timer(MESH_TIMER_LOAD_BALANCE);
+      cpu_time_compute = get_cpu_timer(STATE_TIMER_SET_TIMESTEP) +
+                         get_cpu_timer(STATE_TIMER_FINITE_DIFFERENCE) +
+                         get_cpu_timer(STATE_TIMER_REFINE_POTENTIAL) +
+                         get_cpu_timer(STATE_TIMER_REZONE_ALL) +
+                         mesh->get_cpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                         mesh->get_cpu_timer(MESH_TIMER_LOAD_BALANCE) +
+                         get_cpu_timer(STATE_TIMER_MASS_SUM) +
+                         mesh->get_cpu_timer(MESH_TIMER_CALC_SPATIAL_COORDINATES) +
+                         mesh->get_cpu_timer(MESH_TIMER_PARTITION);
+      // No separate transfer cost on the CPU: elapsed equals compute.
+      cpu_elapsed_time = cpu_time_compute;
+   }
+
+   if (do_gpu_calc) {
+      gpu_mesh_time    = mesh->get_gpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                         get_gpu_timer(STATE_TIMER_REZONE_ALL) +
+                         mesh->get_gpu_timer(MESH_TIMER_REFINE_SMOOTH) +
+                         mesh->get_gpu_timer(MESH_TIMER_LOAD_BALANCE);
+      gpu_time_compute = get_gpu_timer(STATE_TIMER_APPLY_BCS) +
+                         get_gpu_timer(STATE_TIMER_SET_TIMESTEP) +
+                         get_gpu_timer(STATE_TIMER_FINITE_DIFFERENCE) +
+                         get_gpu_timer(STATE_TIMER_REFINE_POTENTIAL) +
+                         get_gpu_timer(STATE_TIMER_REZONE_ALL) +
+                         mesh->get_gpu_timer(MESH_TIMER_CALC_NEIGHBORS) +
+                         mesh->get_gpu_timer(MESH_TIMER_LOAD_BALANCE) +
+                         get_gpu_timer(STATE_TIMER_MASS_SUM) +
+                         mesh->get_gpu_timer(MESH_TIMER_CALC_SPATIAL_COORDINATES) +
+                         mesh->get_gpu_timer(MESH_TIMER_COUNT_BCS);
+      // GPU elapsed time additionally counts the write and read timers.
+      gpu_elapsed_time = get_gpu_timer(STATE_TIMER_WRITE) + gpu_time_compute + get_gpu_timer(STATE_TIMER_READ);
+   }
+
+   if (do_cpu_calc && ! is_parallel) {
+      reference_time = cpu_elapsed_time;
+   }
+
+   // The GPU ratio takes precedence when both devices are reported.
+   double speedup_ratio = 0.0;
+   if (reference_time > 0.0) {
+      if (do_gpu_calc) {
+         speedup_ratio = reference_time/gpu_elapsed_time;
+      } else if (do_cpu_calc && is_parallel) {
+         speedup_ratio = reference_time/cpu_elapsed_time;
+      }
+   }
+
+   if (do_cpu_calc) {
+      output_timer_block(MESH_DEVICE_CPU, cpu_elapsed_time, cpu_mesh_time, cpu_time_compute, total_elapsed_time, speedup_ratio);
+   }
+   if (do_gpu_calc) {
+      output_timer_block(MESH_DEVICE_GPU, gpu_elapsed_time, gpu_mesh_time, gpu_time_compute, total_elapsed_time, speedup_ratio);
+   }
+}
+
+/*
+ * Prints the timing report for one device (only when built with TIMING).
+ *
+ * device_type:        MESH_DEVICE_CPU or MESH_DEVICE_GPU; also used to index
+ *                     the two-entry label tables below.
+ * elapsed_time:       total time attributed to this device.
+ * mesh_time:          portion spent in mesh operations; also reported as a
+ *                     percentage of elapsed_time.
+ * compute_time:       device compute time.
+ * total_elapsed_time: overall run time, reported after the device figures.
+ * speedup_ratio:      printed only when greater than zero.
+ *
+ * For a non-parallel mesh, only rank 0 produces output; every other rank
+ * returns early.  The rank is taken from mesh->mype, or from MPI_Comm_rank
+ * when the mesh is not parallel and HAVE_MPI is defined.
+ */
+void State::output_timer_block(mesh_device_types device_type, double elapsed_time,
+   double mesh_time, double compute_time, double total_elapsed_time, double speedup_ratio)
+{
+   int mype     = mesh->mype;
+   int parallel = mesh->parallel;
+
+   int rank = mype;
+   if (! parallel) {
+      // We need to get rank info for check routines
+#ifdef HAVE_MPI
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#endif
+   }
+
+   if (! parallel && rank) return;
+
+   char device_string[10];
+   if (device_type == MESH_DEVICE_CPU) {
+      sprintf(device_string,"CPU");
+   } else {
+      sprintf(device_string,"GPU");
+   }
+
+#ifdef TIMING
+   if (rank == 0) {
+      printf("\n");
+      printf("~~~~~~~~~~~~~~~~ Device timing information ~~~~~~~~~~~~~~~~~~\n");
+   }
+
+   if (rank == 0 && parallel) {
+      printf("\n%3s: Parallel timings\n\n",device_string);
+   }
+
+   if (device_type == MESH_DEVICE_GPU) {
+      mesh->parallel_output("GPU: Write to device time was", get_gpu_timer(STATE_TIMER_WRITE), 0, "s");
+      mesh->parallel_output("GPU: Read from device time was", get_gpu_timer(STATE_TIMER_READ), 0, "s");
+   }
+
+   const char *device_compute_string[2] = {
+      "CPU: Device compute time was",
+      "GPU: Device compute time was"
+   };
+   mesh->parallel_output(device_compute_string[device_type], compute_time, 0, "s");
+
+   // Per-category timers; the last argument is the nesting level of the line.
+   timer_output(STATE_TIMER_SET_TIMESTEP, device_type, 1);
+   timer_output(STATE_TIMER_FINITE_DIFFERENCE, device_type, 1);
+   timer_output(STATE_TIMER_REFINE_POTENTIAL, device_type, 1);
+   timer_output(STATE_TIMER_CALC_MPOT, device_type, 2);
+   mesh->timer_output(MESH_TIMER_REFINE_SMOOTH, device_type, 2);
+   timer_output(STATE_TIMER_REZONE_ALL, device_type, 1);
+   mesh->timer_output(MESH_TIMER_PARTITION, device_type, 1);
+   mesh->timer_output(MESH_TIMER_CALC_NEIGHBORS, device_type, 1);
+   if (mesh->get_calc_neighbor_type() == HASH_TABLE) {
+      mesh->timer_output(MESH_TIMER_HASH_SETUP, device_type, 2);
+      mesh->timer_output(MESH_TIMER_HASH_QUERY, device_type, 2);
+      if (parallel) {
+         mesh->timer_output(MESH_TIMER_FIND_BOUNDARY, device_type, 2);
+         mesh->timer_output(MESH_TIMER_PUSH_SETUP, device_type, 2);
+         mesh->timer_output(MESH_TIMER_PUSH_BOUNDARY, device_type, 2);
+         mesh->timer_output(MESH_TIMER_LOCAL_LIST, device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER1, device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER2, device_type, 2);
+         mesh->timer_output(MESH_TIMER_LAYER_LIST, device_type, 2);
+         mesh->timer_output(MESH_TIMER_COPY_MESH_DATA, device_type, 2);
+         mesh->timer_output(MESH_TIMER_FILL_MESH_GHOST, device_type, 2);
+         mesh->timer_output(MESH_TIMER_FILL_NEIGH_GHOST, device_type, 2);
+         mesh->timer_output(MESH_TIMER_SET_CORNER_NEIGH, device_type, 2);
+         mesh->timer_output(MESH_TIMER_NEIGH_ADJUST, device_type, 2);
+         mesh->timer_output(MESH_TIMER_SETUP_COMM, device_type, 2);
+      }
+   } else {
+      mesh->timer_output(MESH_TIMER_KDTREE_SETUP, device_type, 2);
+      mesh->timer_output(MESH_TIMER_KDTREE_QUERY, device_type, 2);
+   }
+   timer_output(STATE_TIMER_MASS_SUM, device_type, 1);
+   if (parallel) {
+      mesh->timer_output(MESH_TIMER_LOAD_BALANCE, device_type, 1);
+   }
+   mesh->timer_output(MESH_TIMER_CALC_SPATIAL_COORDINATES, device_type, 1);
+   if (! mesh->have_boundary) {
+      mesh->timer_output(MESH_TIMER_COUNT_BCS, device_type, 1);
+   }
+   if (rank == 0) printf("=============================================================\n");
+
+   const char *profile_string[2] = {
+      "Profiling: Total CPU time was",
+      "Profiling: Total GPU time was"
+   };
+   mesh->parallel_output(profile_string[device_type], elapsed_time, 0, "s");
+   // Above ten minutes, repeat the figure in minutes.
+   if (elapsed_time > 600.0){
+      mesh->parallel_output(" or ", elapsed_time/60.0, 0, "min");
+   }
+
+   if (rank == 0) printf("-------------------------------------------------------------\n");
+   mesh->parallel_output("Mesh Ops (Neigh+rezone+smooth+balance) ",mesh_time, 0, "s");
+   mesh->parallel_output("Mesh Ops Percentage ",mesh_time/elapsed_time*100.0, 0, "percent");
+   if (rank == 0) printf("=============================================================\n");
+
+   mesh->parallel_output("Profiling: Total time was",total_elapsed_time, 0, "s");
+   // The minutes line belongs to the total time, so test the total time
+   // (previously this tested the per-device elapsed_time).
+   if (total_elapsed_time > 600.0){
+      mesh->parallel_output(" or ",total_elapsed_time/60.0, 0, "min");
+   }
+
+   if (speedup_ratio > 0.0) {
+      mesh->parallel_output("Parallel Speed-up: ",speedup_ratio, 0, "Reference Serial CPU");
+   }
+
+   if (rank == 0) printf("=============================================================\n");
+#endif
+}
+
+/*
+ * Reports one state timer through mesh->parallel_output().
+ *
+ * category:    which timer to report; also indexes state_timer_descriptor.
+ * device_type: MESH_DEVICE_CPU reads the CPU timer, anything else the GPU
+ *              timer; also indexes the two-entry device label table.
+ * timer_level: nesting level, forwarded to parallel_output() and used to
+ *              limit how many characters of the blank prefix are printed.
+ *
+ * The label is only formatted on rank 0 (mesh->mype == 0); other ranks pass
+ * an empty string.
+ */
+void State::timer_output(state_timer_category category, mesh_device_types device_type, int timer_level)
+{
+   int mype = mesh->mype;
+
+   double local_time = 0.0;
+   if (device_type == MESH_DEVICE_CPU){
+      local_time = get_cpu_timer(category);
+   } else {
+      local_time = get_gpu_timer(category);
+   }
+
+   // Start empty.  The previous initializer "/0" was a typo for "\0" and left
+   // the two visible characters '/' and '0' in the label on non-root ranks.
+   char string[80] = "";
+
+   if (mype == 0) {
+      const char *blank=" ";
+
+      const char *device_string[2] = {
+         "CPU",
+         "GPU"
+      };
+
+      // Bounded write: the descriptor is already capped at 30 characters by
+      // the format, and snprintf guarantees the buffer cannot be overrun.
+      snprintf(string, sizeof(string), "%3s: %.*s%-30.30s\t", device_string[device_type],
+               2*timer_level, blank, state_timer_descriptor[category]);
+   }
+
+   mesh->parallel_output(string, local_time, timer_level, "s");
+}
+
+#ifdef HAVE_OPENCL
+/*
+ * Debug check: reads the first ncells elements of dev_H, dev_U and dev_V back
+ * to the host and prints a DEBUG line for every cell whose host value (H, U
+ * or V) differs from the device value by more than STATE_EPS.
+ *
+ * string and cycle are only echoed in the printed lines.
+ */
+void State::compare_state_gpu_global_to_cpu_global(const char* string, int cycle, uint ncells)
+{
+   cl_command_queue queue = ezcl_get_command_queue();
+
+   vector<state_t> H_dev(ncells);
+   vector<state_t> U_dev(ncells);
+   vector<state_t> V_dev(ncells);
+
+   // The first two reads are non-blocking; the final blocking read is the
+   // last one enqueued before the host copies are inspected.
+   ezcl_enqueue_read_buffer(queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), &H_dev[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), &U_dev[0], NULL);
+   ezcl_enqueue_read_buffer(queue, dev_V, CL_TRUE,  0, ncells*sizeof(cl_state_t), &V_dev[0], NULL);
+
+   for (uint cell = 0; cell < ncells; cell++) {
+      if (fabs(H[cell]-H_dev[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d H & H_check %d %lf %lf\n",string,cycle,cell,H[cell],H_dev[cell]);
+      }
+      if (fabs(U[cell]-U_dev[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d U & U_check %d %lf %lf\n",string,cycle,cell,U[cell],U_dev[cell]);
+      }
+      if (fabs(V[cell]-V_dev[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d V & V_check %d %lf %lf\n",string,cycle,cell,V[cell],V_dev[cell]);
+      }
+   }
+}
+#endif
+
+/*
+ * Debug check: gathers this object's H, U and V from all ranks into
+ * ncells_global-sized arrays (MPI_Allgatherv with nsizes/ndispl) and prints a
+ * DEBUG line for every cell where state_global's value differs from the
+ * gathered value by more than STATE_EPS.
+ *
+ * Without HAVE_MPI no gather takes place, so the comparison runs against
+ * the freshly constructed check vectors.
+ * string and cycle are only echoed in the printed lines.
+ */
+void State::compare_state_cpu_local_to_cpu_global(State *state_global, const char* string, int cycle, uint ncells, uint ncells_global, int *nsizes, int *ndispl)
+{
+   state_t *H_ref = state_global->H;
+   state_t *U_ref = state_global->U;
+   state_t *V_ref = state_global->V;
+
+   vector<state_t> H_gathered(ncells_global);
+   vector<state_t> U_gathered(ncells_global);
+   vector<state_t> V_gathered(ncells_global);
+
+#ifdef HAVE_MPI
+   MPI_Allgatherv(&H[0], ncells, MPI_STATE_T, &H_gathered[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&U[0], ncells, MPI_STATE_T, &U_gathered[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+   MPI_Allgatherv(&V[0], ncells, MPI_STATE_T, &V_gathered[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+#else
+   // Never executed; references the otherwise unused parameters so the
+   // compiler does not warn about them.
+   if (1 == 2) printf("DEBUG -- ncells %u nsizes %d ndispl %d\n",ncells, nsizes[0],ndispl[0]);
+#endif
+
+   for (uint cell = 0; cell < ncells_global; cell++) {
+      if (fabs(H_ref[cell]-H_gathered[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d H & H_check %d %lf %lf\n",string,cycle,cell,H_ref[cell],H_gathered[cell]);
+      }
+      if (fabs(U_ref[cell]-U_gathered[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d U & U_check %d %lf %lf\n",string,cycle,cell,U_ref[cell],U_gathered[cell]);
+      }
+      if (fabs(V_ref[cell]-V_gathered[cell]) > STATE_EPS) {
+         printf("DEBUG %s at cycle %d V & V_check %d %lf %lf\n",string,cycle,cell,V_ref[cell],V_gathered[cell]);
+      }
+   }
+}
+
+#ifdef HAVE_OPENCL
+// Cross-checks host and device copies of H, U and V and prints a DEBUG line
+// for every cell whose absolute difference exceeds STATE_EPS. With HAVE_MPI
+// four comparisons are made, in this order:
+//   1. this object's device buffers against its host arrays (ncells cells,
+//      printed by every rank)
+//   2. the device copies gathered over all ranks against the host arrays of
+//      state_global (printed by rank 0 only)
+//   3. the host arrays gathered over all ranks against the host arrays of
+//      state_global (printed by rank 0 only)
+//   4. the device buffers of state_global against its host arrays (printed
+//      by rank 0 only)
+// Without HAVE_MPI the function does nothing apart from referencing its
+// arguments to silence compiler warnings.
+// NOTE(review): the uint loop index ic (and ncells/ncells_global in the
+// non-MPI branch) are printed with %d; %u would match the argument type.
+void State::compare_state_all_to_gpu_local(State *state_global, uint ncells, uint ncells_global, int mype, int ncycle, int *nsizes, int *ndispl)
+{
+#ifdef HAVE_MPI
+ cl_command_queue command_queue = ezcl_get_command_queue();
+
+ state_t *H_global = state_global->H;
+ state_t *U_global = state_global->U;
+ state_t *V_global = state_global->V;
+ cl_mem &dev_H_global = state_global->dev_H;
+ cl_mem &dev_U_global = state_global->dev_U;
+ cl_mem &dev_V_global = state_global->dev_V;
+
+ // Need to compare dev_H to H, etc
+ vector<state_t>H_save(ncells);
+ vector<state_t>U_save(ncells);
+ vector<state_t>V_save(ncells);
+ // Only the last read of the group passes CL_TRUE -- presumably the blocking
+ // flag, so all three reads are complete before the loop (TODO confirm
+ // against the ezcl API).
+ ezcl_enqueue_read_buffer(command_queue, dev_H, CL_FALSE, 0, ncells*sizeof(cl_state_t), &H_save[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_U, CL_FALSE, 0, ncells*sizeof(cl_state_t), &U_save[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_V, CL_TRUE, 0, ncells*sizeof(cl_state_t), &V_save[0], NULL);
+ for (uint ic = 0; ic < ncells; ic++){
+ if (fabs(H[ic]-H_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d H & H_save %d %lf %lf \n",mype,ncycle,ic,H[ic],H_save[ic]);
+ if (fabs(U[ic]-U_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d U & U_save %d %lf %lf \n",mype,ncycle,ic,U[ic],U_save[ic]);
+ if (fabs(V[ic]-V_save[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 1 at cycle %d V & V_save %d %lf %lf \n",mype,ncycle,ic,V[ic],V_save[ic]);
+ }
+
+ // And compare dev_H gathered to H_global, etc
+ vector<state_t>H_save_global(ncells_global);
+ vector<state_t>U_save_global(ncells_global);
+ vector<state_t>V_save_global(ncells_global);
+ // The send count is nsizes[mype] here, not the ncells argument
+ MPI_Allgatherv(&H_save[0], nsizes[mype], MPI_STATE_T, &H_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ MPI_Allgatherv(&U_save[0], nsizes[mype], MPI_STATE_T, &U_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ MPI_Allgatherv(&V_save[0], nsizes[mype], MPI_STATE_T, &V_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ if (mype == 0) {
+ for (uint ic = 0; ic < ncells_global; ic++){
+ if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d H_global & H_save_global %d %lf %lf \n",mype,ncycle,ic,H_global[ic],H_save_global[ic]);
+ if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d U_global & U_save_global %d %lf %lf \n",mype,ncycle,ic,U_global[ic],U_save_global[ic]);
+ if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 2 at cycle %d V_global & V_save_global %d %lf %lf \n",mype,ncycle,ic,V_global[ic],V_save_global[ic]);
+ }
+ }
+
+ // And compare H gathered to H_global, etc
+ // The *_save_global vectors are reused as receive buffers from here on
+ MPI_Allgatherv(&H[0], nsizes[mype], MPI_STATE_T, &H_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ MPI_Allgatherv(&U[0], nsizes[mype], MPI_STATE_T, &U_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ MPI_Allgatherv(&V[0], nsizes[mype], MPI_STATE_T, &V_save_global[0], &nsizes[0], &ndispl[0], MPI_STATE_T, MPI_COMM_WORLD);
+ if (mype == 0) {
+ for (uint ic = 0; ic < ncells_global; ic++){
+ if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d H_global & H_save_global %d %lf %lf \n",ncycle,ic,H_global[ic],H_save_global[ic]);
+ if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d U_global & U_save_global %d %lf %lf \n",ncycle,ic,U_global[ic],U_save_global[ic]);
+ if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("DEBUG finite_difference 3 at cycle %d V_global & V_save_global %d %lf %lf \n",ncycle,ic,V_global[ic],V_save_global[ic]);
+ }
+ }
+
+ // Now the global dev_H_global to H_global, etc
+ ezcl_enqueue_read_buffer(command_queue, dev_H_global, CL_FALSE, 0, ncells_global*sizeof(cl_state_t), &H_save_global[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_U_global, CL_FALSE, 0, ncells_global*sizeof(cl_state_t), &U_save_global[0], NULL);
+ ezcl_enqueue_read_buffer(command_queue, dev_V_global, CL_TRUE, 0, ncells_global*sizeof(cl_state_t), &V_save_global[0], NULL);
+ if (mype == 0) {
+ for (uint ic = 0; ic < ncells_global; ic++){
+ if (fabs(H_global[ic]-H_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d H_global & H_save_global %d %lf %lf \n",mype,ncycle,ic,H_global[ic],H_save_global[ic]);
+ if (fabs(U_global[ic]-U_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d U_global & U_save_global %d %lf %lf \n",mype,ncycle,ic,U_global[ic],U_save_global[ic]);
+ if (fabs(V_global[ic]-V_save_global[ic]) > STATE_EPS) printf("%d: DEBUG finite_difference 4 at cycle %d V_global & V_save_global %d %lf %lf \n",mype,ncycle,ic,V_global[ic],V_save_global[ic]);
+ }
+ }
+#else
+ // Just to get rid of compiler warnings
+ if (1 == 2) printf("%d: DEBUG -- ncells %d ncells_global %d ncycle %d nsizes[0] %d ndispl %d state_global %p\n",
+ mype,ncells,ncells_global,ncycle,nsizes[0],ndispl[0],state_global);
+#endif
+}
+#endif
+
+// Prints a banner, then (with HAVE_OPENCL) the handle, element count and
+// element size of each device buffer, and finally the report produced by
+// state_memory.memory_report().
+void State::print_object_info(void)
+{
+   printf(" ---- State object info -----\n");
+
+#ifdef HAVE_OPENCL
+   {
+      const int H_count = ezcl_get_device_mem_nelements(dev_H);
+      const int H_width = ezcl_get_device_mem_elsize(dev_H);
+      printf("dev_H ptr : %p nelements %d elsize %d\n",dev_H,H_count,H_width);
+   }
+   {
+      const int U_count = ezcl_get_device_mem_nelements(dev_U);
+      const int U_width = ezcl_get_device_mem_elsize(dev_U);
+      printf("dev_U ptr : %p nelements %d elsize %d\n",dev_U,U_count,U_width);
+   }
+   {
+      const int V_count = ezcl_get_device_mem_nelements(dev_V);
+      const int V_width = ezcl_get_device_mem_elsize(dev_V);
+      printf("dev_V ptr : %p nelements %d elsize %d\n",dev_V,V_count,V_width);
+   }
+   {
+      const int mpot_count = ezcl_get_device_mem_nelements(dev_mpot);
+      const int mpot_width = ezcl_get_device_mem_elsize(dev_mpot);
+      printf("dev_mpot ptr : %p nelements %d elsize %d\n",dev_mpot,mpot_count,mpot_width);
+   }
+   // Reporting for dev_ioffset is disabled.
+#endif
+
+   // Host-side H, U and V are not printed individually here; only the
+   // memory database report is emitted.
+   state_memory.memory_report();
+}
+
+/*
+ * Write a table of cell data to the stream held in mesh->fp.  When no
+ * stream is open yet, the file "out<mype>" is created first; if that file
+ * cannot be opened a message is written to stderr and nothing is printed.
+ *
+ * If the memory registered for mesh->nlft holds at least ncells_ghost
+ * entries, each row lists the index, the index plus mesh->noffset, i, j,
+ * level and the four neighbor arrays; otherwise each row lists H, U, V,
+ * i, j and level.
+ */
+void State::print(void)
+{
+   if (mesh->fp == NULL) {
+      // Large enough for "out" plus any int value.  The former 10-byte
+      // buffer filled by sprintf overflowed once mype needed more than six
+      // characters.
+      char filename[32];
+      snprintf(filename,sizeof(filename),"out%1d",mesh->mype);
+      mesh->fp=fopen(filename,"w");
+      if (mesh->fp == NULL) {
+         // fprintf on a null stream would crash, so report and give up
+         fprintf(stderr,"State::print: unable to open %s for writing\n",filename);
+         return;
+      }
+   }
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d: index global i j lev nlft nrht nbot ntop \n",mesh->mype);
+      // Cells [0, ncells) first, then [ncells, ncells_ghost)
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d: index H U V i j lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+// Version tag written as int_vals[0] by store_checkpoint and compared against
+// the restored value in restore_checkpoint.
+const int CRUX_STATE_VERSION = 102;
+// Number of integer scalars kept in the checkpoint's int_vals array.
+const int num_int_vals = 1;
+
+/*
+ * Returns the number of bytes counted for a checkpoint: three state_t
+ * values per cell (presumably the H, U and V arrays), the integer scalars,
+ * and whatever mesh->get_checkpoint_size() reports.
+ */
+size_t State::get_checkpoint_size(void)
+{
+   // Use the element type itself rather than repeating the precision
+   // #ifdef, so the size always matches the state_t typedef.
+   size_t nsize = mesh->ncells*3*sizeof(state_t);
+   nsize += num_int_vals*sizeof(int);
+   nsize += mesh->get_checkpoint_size();
+   return(nsize);
+}
+
+// Writes a checkpoint through crux: the mesh stores its own data first, then
+// the version tag and both timer arrays are registered in state_memory, the
+// whole database is handed to crux->store_MallocPlus, and the three
+// temporary registrations are removed again.
+void State::store_checkpoint(Crux *crux)
+{
+ // Store mesh data first
+ mesh->store_checkpoint(crux);
+
+//#ifndef HAVE_MPI
+ // Load up scalar values
+ int int_vals[num_int_vals];
+ int_vals[0] = CRUX_STATE_VERSION;
+
+ // Add to memory database for storing checkpoint
+ // The literals 4 and 8 are presumably element sizes in bytes -- TODO
+ // confirm against MallocPlus::memory_add.
+ state_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "state_int_vals", RESTART_DATA | REPLICATED_DATA);
+ state_memory.memory_add(cpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_cpu_timers", RESTART_DATA);
+ state_memory.memory_add(gpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_gpu_timers", RESTART_DATA);
+
+ crux->store_MallocPlus(state_memory);
+
+ // Remove from database after checkpoint is stored
+ // (int_vals is a local array, so its entry must not outlive this call)
+ state_memory.memory_remove(int_vals);
+ state_memory.memory_remove(cpu_timers);
+ state_memory.memory_remove(gpu_timers);
+//#endif
+}
+
+/*
+ * Restores a checkpoint through crux.  The mesh is restored first, then the
+ * "storage" value is read and passed to allocate(), the version tag and
+ * both timer arrays are registered in state_memory and filled by
+ * crux->restore_MallocPlus, and the version tag is verified.  The
+ * temporary registrations are removed and memory_reset_ptrs() is called
+ * before returning.
+ *
+ * A version mismatch prints a message and terminates the process with a
+ * non-zero exit status.
+ */
+void State::restore_checkpoint(Crux *crux)
+{
+   // Restore mesh data first
+   mesh->restore_checkpoint(crux);
+
+   // Zero-initialised so a read that fills nothing cannot hand an
+   // indeterminate size to allocate()
+   int storage = 0;
+   crux->restore_named_ints("storage", 7, &storage, 1);
+
+   // Create memory for restoring data into; zeroed so the version check
+   // below never reads an indeterminate value
+   int int_vals[num_int_vals] = {0};
+
+   // allocate is a state method
+   allocate(storage);
+
+   // Add to memory database for restoring checkpoint
+   state_memory.memory_add(int_vals, (size_t)num_int_vals, 4, "state_int_vals", RESTART_DATA | REPLICATED_DATA);
+   state_memory.memory_add(cpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_cpu_timers", RESTART_DATA);
+   state_memory.memory_add(gpu_timers, (size_t)STATE_TIMER_SIZE, 8, "state_gpu_timers", RESTART_DATA);
+
+   // Restore memory database
+   crux->restore_MallocPlus(state_memory);
+
+   // Check version number
+   if (int_vals[0] != CRUX_STATE_VERSION) {
+      printf("CRUX version mismatch for state data, version on file is %d, version in code is %d\n",
+             int_vals[0], CRUX_STATE_VERSION);
+      // This is a failure, so do not report success to the caller's shell
+      exit(1);
+   }
+
+#ifdef DEBUG_RESTORE_VALS
+   if (DEBUG_RESTORE_VALS) {
+      printf("\n");
+      printf(" === Restored state cpu timers ===\n");
+      for (int i = 0; i < STATE_TIMER_SIZE; i++){
+         printf(" %-30s %lg\n",state_timer_descriptor[i], cpu_timers[i]);
+      }
+      printf(" === Restored state cpu timers ===\n");
+      printf("\n");
+   }
+#endif
+
+   // NOTE(review): this block is keyed on DEBUG_RESTORED_VALS while the one
+   // above uses DEBUG_RESTORE_VALS -- confirm which spelling is intended.
+#ifdef DEBUG_RESTORED_VALS
+   if (DEBUG_RESTORED_VALS) {
+      printf("\n");
+      printf(" === Restored state gpu timers ===\n");
+      for (int i = 0; i < STATE_TIMER_SIZE; i++){
+         printf(" %-30s %lld\n",state_timer_descriptor[i], gpu_timers[i]);
+      }
+      printf(" === Restored state gpu_timers ===\n");
+      printf("\n");
+   }
+#endif
+
+   // int_vals is local, so its registration must not outlive this call
+   state_memory.memory_remove(int_vals);
+   state_memory.memory_remove(cpu_timers);
+   state_memory.memory_remove(gpu_timers);
+
+   memory_reset_ptrs();
+}
+
+// Added overloaded print to get mesh information to print in each cycle
+// Brian Atkinson (5-29-14)
+/*
+ * Creates the file "iteration<iteration>", stores the stream in mesh->fp,
+ * and writes a summary header followed by the cell table.
+ *
+ *   iteration             number used in the file name and the header
+ *   simTime               simulation time printed in the header
+ *   initial_mass          printed in the header
+ *   iteration_mass        when exactly 0.0 the mass-difference lines are
+ *                         replaced by a shorter initial-mass line
+ *   mass_diff_percentage  printed only when iteration_mass is non-zero
+ *
+ * If the file cannot be opened a message is written to stderr and nothing
+ * is printed.
+ * NOTE(review): a stream already held in mesh->fp is replaced without being
+ * closed here -- confirm whether it should be closed first.
+ */
+void State::print(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage)
+{
+   char filename[40];
+   snprintf(filename,sizeof(filename),"iteration%d",iteration);
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      // fprintf on a null stream would crash, so report and give up
+      fprintf(stderr,"State::print: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   // These two lines are common to both header variants
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   if(iteration_mass == 0.0){
+      fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tSimulation Time: %lf\n", initial_mass, simTime);
+   }
+   else{
+      double mass_diff = iteration_mass - initial_mass;
+      fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+      fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+   }
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d: index global i j lev nlft nrht nbot ntop \n",mesh->mype);
+      // Cells [0, ncells) first, then [ncells, ncells_ghost)
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d: index H U V i j lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+/*
+ * Writes a DEBUG line carrying ncycle and then a per-cell table to the
+ * stream held in mesh->fp, creating the file "out<mype>" first when no
+ * stream is open yet.  If that file cannot be opened a message is written
+ * to stderr and nothing is printed.
+ *
+ * When mesh->nlft is non-null each row also carries the four neighbor
+ * arrays, and rows whose index is at or beyond the size registered for H
+ * in state_memory omit the H, U and V columns.
+ */
+void State::print_local(int ncycle)
+{
+   if (mesh->fp == NULL) {
+      // Large enough for "out" plus any int value.  The former 10-byte
+      // buffer filled by sprintf overflowed once mype needed more than six
+      // characters.
+      char filename[32];
+      snprintf(filename,sizeof(filename),"out%1d",mesh->mype);
+      mesh->fp=fopen(filename,"w");
+      if (mesh->fp == NULL) {
+         // fprintf on a null stream would crash, so report and give up
+         fprintf(stderr,"State::print_local: unable to open %s for writing\n",filename);
+         return;
+      }
+   }
+
+   fprintf(mesh->fp,"DEBUG in print_local ncycle is %d\n",ncycle);
+   if (mesh->nlft != NULL){
+      fprintf(mesh->fp,"%d: index H U V i j lev nlft nrht nbot ntop\n",mesh->mype);
+      uint state_size = state_memory.get_memory_size(H);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         if (ic >= state_size){
+            // No state values are available for this index
+            fprintf(mesh->fp,"%d: %6d %4d %4d %4d %4d %4d %4d %4d\n", mesh->mype,ic, mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+         } else {
+            fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d %4d %4d %4d %4d\n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+         }
+      }
+   } else {
+      fprintf(mesh->fp,"%d: index H U V i j lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d\n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+/*
+ * Creates "failure.log", stores the stream in mesh->fp, and writes the
+ * failure reason, a summary header and the cell table.
+ *
+ *   got_nan  selects the NAN message; otherwise the mass-difference
+ *            message is written
+ *   the remaining arguments are echoed in the header
+ *
+ * If the file cannot be opened a message is written to stderr and nothing
+ * is printed.
+ * NOTE(review): a stream already held in mesh->fp is replaced without being
+ * closed here -- confirm whether it should be closed first.
+ */
+void State::print_failure_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, bool got_nan){
+   const char *filename = "failure.log";
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      // fprintf on a null stream would crash, so report and give up
+      fprintf(stderr,"State::print_failure_log: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   double mass_diff = iteration_mass - initial_mass;
+   if(got_nan){
+      fprintf(mesh->fp,"Failed because of nan for H_sum was equal to NAN\n");
+   }
+   else{
+      fprintf(mesh->fp,"Failed because mass difference is outside of accepted percentage\n");
+   }
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+   fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d: index global i j lev nlft nrht nbot ntop \n",mesh->mype);
+      // Cells [0, ncells) first, then [ncells, ncells_ghost)
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d: index H U V i j lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
+
+/*
+ * Creates "rollback<backup_attempt>.log", stores the stream in mesh->fp,
+ * and writes the rollback reason, the attempt counters, a summary header
+ * and the cell table.
+ *
+ *   backup_attempt   used in the file name and the attempt line
+ *   num_of_attempts  printed together with the remaining attempts
+ *                    (num_of_attempts - backup_attempt)
+ *   error_status     STATUS_NAN selects the NAN message; any other value
+ *                    selects the mass-difference message
+ *   the remaining arguments are echoed in the header
+ *
+ * If the file cannot be opened a message is written to stderr and nothing
+ * is printed.
+ * NOTE(review): a stream already held in mesh->fp is replaced without being
+ * closed here -- confirm whether it should be closed first.
+ */
+void State::print_rollback_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, int backup_attempt, int num_of_attempts, int error_status){
+   char filename[40];
+   snprintf(filename,sizeof(filename),"rollback%d.log",backup_attempt);
+   mesh->fp=fopen(filename,"w");
+   if (mesh->fp == NULL) {
+      // fprintf on a null stream would crash, so report and give up
+      fprintf(stderr,"State::print_rollback_log: unable to open %s for writing\n",filename);
+      return;
+   }
+
+   double mass_diff = iteration_mass - initial_mass;
+   if(error_status == STATUS_NAN){
+      fprintf(mesh->fp,"Rolling back because of nan for H_sum was equal to NAN\n");
+   }
+   else{
+      fprintf(mesh->fp,"Rolling back because mass difference is outside of accepted percentage\n");
+   }
+   fprintf(mesh->fp,"Rollback attempt %d of %d ---> Number of attempts left:%d\n", backup_attempt, num_of_attempts, num_of_attempts - backup_attempt);
+   fprintf(mesh->fp,"Iteration = %d\t\tSimuation Time = %lf\n", iteration, simTime);
+   fprintf(mesh->fp,"mesh->ncells = %lu\t\tmesh->ncells_ghost = %lu\n", mesh->ncells, mesh->ncells_ghost);
+   fprintf(mesh->fp,"Initial Mass: %14.12lg\t\tIteration Mass: %14.12lg\n", initial_mass, iteration_mass);
+   fprintf(mesh->fp,"Mass Difference: %12.6lg\t\tMass Difference Percentage: %12.6lg%%\n", mass_diff, mass_diff_percentage);
+
+   if (mesh->mesh_memory.get_memory_size(mesh->nlft) >= mesh->ncells_ghost){
+      fprintf(mesh->fp,"%d: index global i j lev nlft nrht nbot ntop \n",mesh->mype);
+      // Cells [0, ncells) first, then [ncells, ncells_ghost)
+      for (uint ic=0; ic<mesh->ncells; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+      for (uint ic=mesh->ncells; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %6d %4d %4d %4d %4d %4d %4d %4d \n", mesh->mype,ic, ic+mesh->noffset,mesh->i[ic], mesh->j[ic], mesh->level[ic], mesh->nlft[ic], mesh->nrht[ic], mesh->nbot[ic], mesh->ntop[ic]);
+      }
+   } else {
+      fprintf(mesh->fp,"%d: index H U V i j lev\n",mesh->mype);
+      for (uint ic=0; ic<mesh->ncells_ghost; ic++) {
+         fprintf(mesh->fp,"%d: %6d %lf %lf %lf %4d %4d %4d \n", mesh->mype,ic, H[ic], U[ic], V[ic], mesh->i[ic], mesh->j[ic], mesh->level[ic]);
+      }
+   }
+}
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/state.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/state.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2011-2013, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef STATE_H_
+#define STATE_H_
+
+#include <list>
+#include "MallocPlus.h"
+#include "mesh.h"
+#include "crux.h"
+#ifdef HAVE_OPENCL
+#include "ezcl/ezcl.h"
+#endif
+//#include "l7/l7.h"
+
+// Status codes; State::print_rollback_log compares its error_status argument
+// against STATUS_NAN to choose the message it writes.
+#define STATUS_OK 0
+#define STATUS_NAN 1
+#define STATUS_MASS_LOSS 2
+
+// Precision selection. Exactly one of the three branches below supplies
+// state_t, real_t, real2_t and CONSERVATION_EPS, plus the matching OpenCL
+// typedefs (HAVE_OPENCL) and MPI/L7 datatype macros (HAVE_MPI).
+// FULL_PRECISION is the default when no precision macro is defined.
+#if !defined(FULL_PRECISION) && !defined(MIXED_PRECISION) && !defined(MINIMUM_PRECISION)
+#define FULL_PRECISION
+#endif
+// NO_CL_DOUBLE overrides any other choice and forces MINIMUM_PRECISION
+#ifdef NO_CL_DOUBLE
+#undef FULL_PRECISION
+#undef MIXED_PRECISION
+#define MINIMUM_PRECISION
+#endif
+
+// MINIMUM_PRECISION: float for both stored state and intermediate values
+#if defined(MINIMUM_PRECISION)
+ typedef float state_t; // this is for physics state variables ncell in size
+ typedef float real_t; // this is used for intermediate calculations
+ typedef struct
+ {
+ float s0;
+ float s1;
+ } real2_t;
+#define CONSERVATION_EPS 15.0
+#ifdef HAVE_OPENCL
+ typedef cl_float cl_state_t; // for gpu physics state variables
+ typedef cl_float4 cl_state4_t; // for gpu physics state variables
+ typedef cl_float cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_float2 cl_real2_t; // for intermediate gpu physics state variables
+ typedef cl_float4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+ #define MPI_STATE_T MPI_FLOAT // for MPI communication for physics state variables
+ #define MPI_REAL_T MPI_FLOAT // for MPI communication for physics state variables
+ #define L7_STATE_T L7_FLOAT
+ #define L7_REAL_T L7_FLOAT
+#endif
+
+// MIXED_PRECISION: float stored state, double intermediate values
+#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
+ typedef float state_t;
+ typedef double real_t;
+ typedef struct
+ {
+ double s0;
+ double s1;
+ } real2_t;
+#define CONSERVATION_EPS .02
+#ifdef HAVE_OPENCL
+ typedef cl_float cl_state_t;
+ typedef cl_float4 cl_state4_t;
+ typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
+ typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+ #define MPI_STATE_T MPI_FLOAT
+ #define MPI_REAL_T MPI_DOUBLE
+ #define L7_STATE_T L7_FLOAT
+ #define L7_REAL_T L7_DOUBLE
+#endif
+
+// FULL_PRECISION: double for both stored state and intermediate values
+#elif defined(FULL_PRECISION)
+ typedef double state_t;
+ typedef double real_t;
+ typedef struct
+ {
+ double s0;
+ double s1;
+ } real2_t;
+#define CONSERVATION_EPS .02
+#ifdef HAVE_OPENCL
+ typedef cl_double cl_state_t;
+ typedef cl_double4 cl_state4_t;
+ typedef cl_double cl_real_t; // for intermediate gpu physics state variables
+ typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
+ typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
+#endif
+#ifdef HAVE_MPI
+ #define MPI_STATE_T MPI_DOUBLE
+ #define MPI_REAL_T MPI_DOUBLE
+ #define L7_STATE_T L7_DOUBLE
+ #define L7_REAL_T L7_DOUBLE
+#endif
+#endif
+
+// Declared with C linkage; the definition is not in this header.
+extern "C" void do_calc(void);
+
+// Summation variants. Presumably selected through the
+// enhanced_precision_sum argument of mass_sum -- TODO confirm.
+enum SUM_TYPE {
+ SUM_REGULAR,
+ SUM_KAHAN
+};
+
+
+// Values accepted by the sign_rule parameter of State::symmetry_check.
+enum SIGN_RULE {
+ DIAG_RULE,
+ X_RULE,
+ Y_RULE,
+};
+
+// Indices into State::cpu_timers and State::gpu_timers. STATE_TIMER_SIZE is
+// the last enumerator, so it equals the number of timer slots and is used
+// as the length of both arrays.
+enum state_timers
+{
+ STATE_TIMER_APPLY_BCS,
+ STATE_TIMER_SET_TIMESTEP,
+ STATE_TIMER_FINITE_DIFFERENCE,
+ STATE_TIMER_REFINE_POTENTIAL,
+ STATE_TIMER_CALC_MPOT,
+ STATE_TIMER_REZONE_ALL,
+ STATE_TIMER_MASS_SUM,
+ STATE_TIMER_READ,
+ STATE_TIMER_WRITE,
+ STATE_TIMER_SIZE
+};
+
+// Type name used by the timer accessors and timer_output
+typedef enum state_timers state_timer_category;
+
+// NOTE(review): a using-directive in a header affects every file that
+// includes it; the declarations below rely on it for the unqualified vector.
+using namespace std;
+
+// Holds the H, U and V state arrays that belong to a Mesh, the memory
+// databases that track them, per-category CPU/GPU timers and, with
+// HAVE_OPENCL, the matching device buffers. Copy construction is blocked
+// by a private, declaration-only copy constructor.
+class State {
+
+public:
+ MallocPlus state_memory;
+ MallocPlus gpu_state_memory;
+ Mesh *mesh;
+ state_t *H;
+ state_t *U;
+ state_t *V;
+
+#ifdef HAVE_OPENCL
+ cl_mem dev_H;
+ cl_mem dev_U;
+ cl_mem dev_V;
+
+ cl_mem dev_mass_sum;
+ cl_mem dev_deltaT;
+
+ cl_event apply_BCs_event;
+
+ cl_mem dev_mpot;
+ //cl_mem dev_ioffset;
+ cl_mem dev_result;
+#endif
+
+ // One slot per state_timers enumerator
+ double cpu_timers[STATE_TIMER_SIZE];
+ long long gpu_timers[STATE_TIMER_SIZE];
+
+ // constructor -- allocates state arrays to size ncells
+ State(Mesh *mesh_in);
+
+ void init(int do_gpu_calc);
+ void terminate(void);
+
+ /* Memory routines for linked list of state arrays */
+ void allocate(size_t ncells);
+ void allocate_from_backup_file(FILE *fp);
+ void allocate_for_rollback(State *state_to_copy);
+ void resize(size_t ncells);
+ void memory_reset_ptrs(void);
+#ifdef HAVE_OPENCL
+ void allocate_device_memory(size_t ncells);
+#endif
+ void resize_old_device_memory(size_t ncells);
+
+ /* Accessor routines */
+ double get_cpu_timer(state_timer_category category) {return(cpu_timers[category]); };
+ /* Scales the stored value by 1.0e-9, i.e. nanoseconds to seconds (assuming
+  * gpu_timers holds nanoseconds) */
+ double get_gpu_timer(state_timer_category category) {return((double)(gpu_timers[category])*1.0e-9); };
+
+ /* Boundary routines -- not currently used */
+ void add_boundary_cells(void);
+ void apply_boundary_conditions(void);
+ void apply_boundary_conditions_local(void);
+ void apply_boundary_conditions_ghost(void);
+ void remove_boundary_cells(void);
+
+ /*******************************************************************
+ * set_timestep
+ * Input
+ * H, U, V -- from state object
+ * celltype, level, lev_delta
+ * Output
+ * mindeltaT returned
+ *******************************************************************/
+ double set_timestep(double g, double sigma);
+#ifdef HAVE_OPENCL
+ double gpu_set_timestep(double sigma);
+#endif
+
+ /*******************************************************************
+ * calc finite difference
+ * will add ghost region to H, U, V and fill at start of routine
+ * Input
+ * H, U, V -- from state object
+ * nlft, nrht, nbot, ntop, level, celltype -- from mesh object
+ * Output
+ * H, U, V
+ *******************************************************************/
+ void calc_finite_difference(double deltaT);
+ void calc_finite_difference_via_faces(double deltaT);
+#ifdef HAVE_OPENCL
+ void gpu_calc_finite_difference(double deltaT);
+#endif
+
+ /*******************************************************************
+ * calc refine potential -- state has responsibility to calc initial
+ * refinement potential array that is then passed to mesh for
+ * smoothing and enforcing refinement rules
+ * Input
+ * H, U, V -- from state object
+ * Output
+ * mpot
+ * ioffset
+ * count
+ *******************************************************************/
+ size_t calc_refine_potential(vector<int> &mpot, int &icount, int &jcount);
+#ifdef HAVE_OPENCL
+ size_t gpu_calc_refine_potential(int &icount, int &jcount);
+#endif
+
+ /*******************************************************************
+ * rezone all -- most of call is done in mesh
+ * Input
+ * Mesh and state variables
+ * Output
+ * New mesh and state variables on refined mesh
+ *******************************************************************/
+ void rezone_all(int icount, int jcount, vector<int> mpot);
+#ifdef HAVE_OPENCL
+ void gpu_rezone_all(int icount, int jcount, bool localStencil);
+#endif
+
+ /*******************************************************************
+ * load balance -- most of call is done in mesh, but pointers are
+ * reset to newly allocated state arrays
+ * Input
+ * Mesh and state variables
+ * Output
+ * New mesh and state variables on refined mesh
+ *******************************************************************/
+#ifdef HAVE_MPI
+ void do_load_balance_local(size_t &numcells);
+#ifdef HAVE_OPENCL
+ void gpu_do_load_balance_local(size_t &numcells);
+#endif
+#endif
+
+ /*******************************************************************
+ * mass sum -- Conservation of mass check
+ * Input
+ * H from state object
+ * Precision type for sum
+ * Output
+ * total mass is returned
+ *******************************************************************/
+ double mass_sum(int enhanced_precision_sum);
+#ifdef HAVE_OPENCL
+ double gpu_mass_sum(int enhanced_precision_sum);
+#endif
+
+ void fill_circle(double circ_radius, double fill_value, double background);
+ void state_reorder(vector<int> iorder);
+
+ void symmetry_check(const char *string, vector<int> sym_index, double eps,
+ SIGN_RULE sign_rule, int &flag);
+
+ void output_timing_info(int do_cpu_calc, int do_gpu_calc, double total_elapsed_time);
+
+ /* state comparison routines */
+#ifdef HAVE_OPENCL
+ void compare_state_gpu_global_to_cpu_global(const char* string, int cycle, uint ncells);
+#endif
+ void compare_state_cpu_local_to_cpu_global(State *state_global, const char* string, int cycle, uint ncells, uint ncells_global, int *nsizes, int *ndispl);
+#ifdef HAVE_OPENCL
+ void compare_state_all_to_gpu_local(State *state_global, uint ncells, uint ncells_global, int mype, int ncycle, int *nsizes, int *ndispl);
+#endif
+
+ void output_timer_block(mesh_device_types device_type, double elapsed_time,
+ double mesh_time, double compute_time, double total_elapsed_time, double speedup_ratio);
+
+ void timer_output(state_timer_category category, mesh_device_types device_type, int timer_level);
+
+ void print(void);
+
+ /* Checkpoint / restart routines */
+ size_t get_checkpoint_size(void);
+ void store_checkpoint(Crux *crux);
+ void restore_checkpoint(Crux *crux);
+ // Second print overload, added to print on every iteration: Brian Atkinson (5-29-14)
+ void print(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage);
+ void print_local(int ncycle);
+ void print_failure_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, bool got_nan);
+ void print_rollback_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, int backup_attempt, int num_of_attempts, int error_status);
+
+private:
+ State(const State&); // To block copy constructor so copies are not made inadvertently
+
+ void print_object_info(void);
+};
+
+#endif // ifndef STATE_H_
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/timer.c?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.c Mon Sep 4 07:25:36 2017
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#include <sys/time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "timer.h"
+
+/*
+ * Record the current wall-clock time in *tstart_cpu using gettimeofday().
+ *
+ * In an OpenMP build, a call made from inside a parallel region takes the
+ * reading on the master thread only; the other threads leave *tstart_cpu
+ * untouched.
+ */
+void cpu_timer_start(struct timeval *tstart_cpu){
+#ifdef _OPENMP
+   /* Serial part of an OpenMP program: every caller takes the reading. */
+   if ( ! omp_in_parallel() ) {
+      gettimeofday(tstart_cpu, NULL);
+      return;
+   }
+
+   /* Inside a parallel region: restrict the reading to the master thread. */
+#pragma omp master
+   {
+      gettimeofday(tstart_cpu, NULL);
+   }
+#else
+   gettimeofday(tstart_cpu, NULL);
+#endif
+}
+
+/*
+ * Seconds between two gettimeofday() readings.  The microsecond difference
+ * may be negative; adding it (scaled) to the whole-second difference still
+ * gives the correct elapsed time.
+ */
+static double timer_elapsed_seconds(struct timeval tbegin, struct timeval tend){
+   struct timeval tdiff;
+
+   tdiff.tv_sec  = tend.tv_sec  - tbegin.tv_sec;
+   tdiff.tv_usec = tend.tv_usec - tbegin.tv_usec;
+   return (double)tdiff.tv_sec + (double)tdiff.tv_usec*1.0e-6;
+}
+
+/*
+ * Return the wall-clock time, in seconds, that has passed since the reading
+ * stored in tstart_cpu.
+ *
+ * In an OpenMP build, a call made from inside a parallel region measures the
+ * time on the master thread only.  All other threads return 0.0; the result
+ * is initialised up front so that they never return an indeterminate value.
+ */
+double cpu_timer_stop(struct timeval tstart_cpu){
+   double result = 0.0;
+   struct timeval tstop_cpu;
+
+#ifdef _OPENMP
+   if ( omp_in_parallel() ) {
+#pragma omp master
+      {
+         gettimeofday(&tstop_cpu, NULL);
+         result = timer_elapsed_seconds(tstart_cpu, tstop_cpu);
+      }
+      return(result);
+   }
+#endif
+
+   gettimeofday(&tstop_cpu, NULL);
+   result = timer_elapsed_seconds(tstart_cpu, tstop_cpu);
+   return(result);
+}
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/timer.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/timer.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef _TIMER_H
+#define _TIMER_H
+
+/* struct timeval is used in the prototypes below, so bring in its definition
+ * here rather than relying on every includer to have done so first. */
+#include <sys/time.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Store the current wall-clock time in *tstart_cpu. */
+void cpu_timer_start(struct timeval *tstart_cpu);
+
+/* Return the seconds elapsed since the reading held in tstart_cpu. */
+double cpu_timer_stop(struct timeval tstart_cpu);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TIMER_H */
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/zorder.c?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.c Mon Sep 4 07:25:36 2017
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include "s7.h"
+#include "zorder.h"
+
+#define DEBUG 0
+
+// Compute a z-order key for every cell and sort the cells by that key.
+//
+//   size        number of entries in i, j, level, z_index and z_order
+//   i, j        x- and y-index of each cell
+//   level       level of each cell; entries with a negative level are skipped
+//   levmx       maximum level, forwarded to index_to_bit
+//   ibase       index base, forwarded to index_to_bit
+//   z_index     receives the key of each processed cell
+//   z_order     receives the cell number of each processed cell before the
+//               sort; both arrays are then handed to S7_Index_Sort
+void calc_zorder(int size, int *i, int *j, int *level, int levmx, int ibase, int *z_index, int *z_order)
+{
+   int cell;
+
+   // Convert the indices to a bitwise representation and combine them.
+   for (cell = 0; cell < size; cell++)
+   {
+      unsigned long long xbits,   // Bitwise representation of x-index.
+                         ybits;   // Bitwise representation of y-index.
+
+      if (level[cell] >= 0)
+      {
+         xbits = index_to_bit(i[cell], level[cell], levmx, ibase);
+         ybits = index_to_bit(j[cell], level[cell], levmx, ibase);
+         z_index[cell] = twobit_to_index(xbits, ybits);
+         z_order[cell] = cell;
+      }
+   }
+
+   // Sort the z-ordered indices.
+   S7_Index_Sort(z_index, size, S7_INT, z_order);
+
+   // Everything below is diagnostic output only.
+   if (!DEBUG) return;
+
+   // Output ordered mesh information.
+   printf("orig index i j lev ibit jbit ijbit z index z order\n");
+   for (cell = 0; cell < size; cell++)
+   {
+      unsigned long long xbits = index_to_bit(i[cell], level[cell], levmx, ibase);
+      unsigned long long ybits = index_to_bit(j[cell], level[cell], levmx, ibase);
+
+      printf(" %6d %4d %4d %4d ", cell + 1, j[cell], i[cell], level[cell]);
+      printbits(ybits);
+      printf(" ");
+      printbits(xbits);
+      printf(" ");
+      printbits(xbits | (ybits << 1));
+      printf(" %6d %5d\n", z_index[cell], z_order[cell]);
+   }
+}
+
+// Spread the bits of a cell index so that bit k of the scaled index ends up
+// at bit 2k of the result, leaving the odd bit positions clear.
+//
+//   index   cell index, given at level lev
+//   lev     level the index is given at
+//   levmx   maximum level; when lev < levmx the index is multiplied by
+//           2^(levmx - lev) before the bits are spread
+//   ibase   value subtracted from index before scaling
+//
+// The masks are 32 bits wide, so only the low 16 bits of the scaled index
+// are spread correctly.
+unsigned long long index_to_bit(unsigned long long index,
+                                int lev,
+                                int levmx,
+                                int ibase)
+{
+   static const unsigned long long B[] =
+      {0x55555555ULL,   /* 01010101010101010101010101010101 */
+       0x33333333ULL,   /* 00110011001100110011001100110011 */
+       0x0F0F0F0FULL,   /* 00001111000011110000111100001111 */
+       0x00FF00FFULL};  /* 00000000111111110000000011111111 */
+   static const unsigned int S[] = {1, 2, 4, 8};
+
+   // Convert the index to a bit representation.
+   unsigned long long ibit = index - ibase;
+
+   // Scale up to the maximum level.  An integer shift is the exact
+   // power-of-two multiply and avoids a round trip through double.
+   if (lev < levmx)
+   {  ibit <<= (levmx - lev); }
+
+   // Each step moves the upper half of every bit group away from the lower
+   // half, halving the group size from 8 bits down to 1.
+   ibit = (ibit | (ibit << S[3])) & B[3];
+   ibit = (ibit | (ibit << S[2])) & B[2];
+   ibit = (ibit | (ibit << S[1])) & B[1];
+   ibit = (ibit | (ibit << S[0])) & B[0];
+
+   return (ibit);
+}
+
+// Merge two bit patterns into one value: ibit is kept in place and jbit is
+// moved up by one bit position before the two are OR-ed together.
+unsigned long long twobit_to_index(unsigned long long ibit,
+                                   unsigned long long jbit)
+{
+   unsigned long long jbit_shifted = jbit << 1;
+
+   return (ibit | jbit_shifted);
+}
+
+// Print n as a binary number.
+// Print n as a binary number on stdout, padded with leading zeros to a whole
+// number of 8-bit groups.  Zero prints as a single group of eight zeros; a
+// negative n prints the full two's-complement bit pattern of the int.
+void printbits(int n)
+{
+   // All work is done on the unsigned bit pattern.  With signed operands the
+   // top-bit mask is negative, so nothing would be printed for positive n and
+   // the group-skipping loop would never end for negative n.
+   unsigned int value = (unsigned int)n;
+   unsigned int mask;    // Single bit currently being tested.
+   unsigned int limit;   // Largest value that fits once one more group is dropped.
+
+   if (0 == value)
+   {  // For simplicity's sake, treat 0 as a special case.
+      printf("00000000");
+      return;
+   }
+
+   mask  = 1u << (sizeof(value) * 8 - 1);
+   limit = ~0u >> 8;
+
+   // Only print the relevant digits: drop leading groups of eight bits for
+   // as long as the value still fits in the remaining ones.
+   while (limit >= value)
+   {  mask  >>= 8;
+      limit >>= 8;
+   }
+
+   // At this point, mask is the top bit of the highest group that is kept.
+   while (mask > 0)
+   {  if (value & mask)
+         printf("1");
+      else
+         printf("0");
+      mask >>= 1;
+   }
+}
+
Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CLAMR/zorder.h?rev=312488&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/zorder.h Mon Sep 4 07:25:36 2017
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2011-2012, Los Alamos National Security, LLC.
+ * All rights Reserved.
+ *
+ * Copyright 2011-2012. Los Alamos National Security, LLC. This software was produced
+ * under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National
+ * Laboratory (LANL), which is operated by Los Alamos National Security, LLC
+ * for the U.S. Department of Energy. The U.S. Government has rights to use,
+ * reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS
+ * ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
+ * ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified
+ * to produce derivative works, such modified software should be clearly marked,
+ * so as not to confuse it with the version available from LANL.
+ *
+ * Additionally, redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the Los Alamos National Security, LLC, Los Alamos
+ * National Laboratory, LANL, the U.S. Government, nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LOS ALAMOS NATIONAL SECURITY, LLC AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+ * NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LOS ALAMOS NATIONAL
+ * SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CLAMR -- LA-CC-11-094
+ * This research code is being developed as part of the
+ * 2011 X Division Summer Workshop for the express purpose
+ * of a collaborative code for development of ideas in
+ * the implementation of AMR codes for Exascale platforms
+ *
+ * AMR implementation of the Wave code previously developed
+ * as a demonstration code for regular grids on Exascale platforms
+ * as part of the Supercomputing Challenge and Los Alamos
+ * National Laboratory
+ *
+ * Authors: Bob Robey XCP-2 brobey at lanl.gov
+ * Neal Davis davis68 at lanl.gov, davis68 at illinois.edu
+ * David Nicholaeff dnic at lanl.gov, mtrxknight at aol.com
+ * Dennis Trujillo dptrujillo at lanl.gov, dptru10 at gmail.com
+ *
+ */
+#ifndef _ZORDER_H
+#define _ZORDER_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* For each of the size cells whose level is not negative, compute a z-order
+ * key from i/j into z_index and store the cell number in z_order, then sort
+ * via S7_Index_Sort. */
+void calc_zorder(int size, int *i, int *j, int *level, int levmx, int ibase, int *z_index, int *z_order);
+/* Subtract ibase from index, scale by 2^(levmx - lev) when lev < levmx, and
+ * spread the bits so that they occupy the even bit positions. */
+unsigned long long index_to_bit(unsigned long long index, int lev, int levmx, int ibase);
+/* Return ibit | (jbit << 1). */
+unsigned long long twobit_to_index(unsigned long long ibit, unsigned long long jbit);
+/* Print n as a binary number. */
+void printbits(int n);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZORDER_H */
+
Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/CMakeLists.txt?rev=312488&r1=312487&r2=312488&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/CMakeLists.txt Mon Sep 4 07:25:36 2017
@@ -1,3 +1,4 @@
add_subdirectory(HPCCG)
add_subdirectory(PENNANT)
add_subdirectory(miniFE)
+add_subdirectory(CLAMR)
Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C%2B%2B/Makefile?rev=312488&r1=312487&r2=312488&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C++/Makefile Mon Sep 4 07:25:36 2017
@@ -1,6 +1,6 @@
# MultiSource/DOE-ProxyApps-C++ Makefile: Build all subdirectories automatically
LEVEL = ../../..
-PARALLEL_DIRS = HPCCG PENNANT miniFE
+PARALLEL_DIRS = HPCCG PENNANT miniFE CLAMR
include $(LEVEL)/Makefile.programs
More information about the llvm-commits
mailing list