[test-suite] r312615 - [test-suite] Adding the SimpleMOC proxy app

Tue Sep 5 21:05:04 PDT 2017

Author: hfinkel
Date: Tue Sep  5 21:05:03 2017
New Revision: 312615

URL: http://llvm.org/viewvc/llvm-project?rev=312615&view=rev
Log:
[test-suite] Adding the SimpleMOC proxy app

SimpleMOC: The purpose of this mini-app is to demonstrate the performance
characterterics and viability of the Method of Characteristics (MOC) for 3D
neutron transport calculations in the context of full scale light water reactor
simulation.

https://github.com/ANL-CESAR/SimpleMOC

Patch by Pavanravikanth Kondamudi, thanks!

Includes glibc_compat_rand.{c,h} written by me.

Differential Revision: https://reviews.llvm.org/D36621

Added:
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c
Modified:
    test-suite/trunk/LICENSE.TXT
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
    test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile

Modified: test-suite/trunk/LICENSE.TXT
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/LICENSE.TXT?rev=312615&r1=312614&r2=312615&view=diff
==============================================================================

--- test-suite/trunk/LICENSE.TXT (original)
+++ test-suite/trunk/LICENSE.TXT Tue Sep  5 21:05:03 2017
@@ -88,6 +88,7 @@ Pathfinder:         llvm-test/MultiSourc
 XSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/XSBench
 miniGMG:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG
 RSBench:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/RSBench
+SimpleMOC:          llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC
 CLAMR:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR
 HPCCG:              llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/HPCCG
 PENNANT:            llvm-test/MultiSource/Benchmarks/DOE-ProxyApps-C++/PENNANT

Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt?rev=312615&r1=312614&r2=312615&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/CMakeLists.txt Tue Sep  5 21:05:03 2017
@@ -3,3 +3,4 @@ add_subdirectory(Pathfinder)
 add_subdirectory(miniAMR)
 add_subdirectory(miniGMG)
 add_subdirectory(RSBench)
+add_subdirectory(SimpleMOC)

Modified: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile?rev=312615&r1=312614&r2=312615&view=diff
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile (original)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/Makefile Tue Sep  5 21:05:03 2017
@@ -5,6 +5,7 @@ PARALLEL_DIRS = XSBench    \
                 miniAMR    \
                 Pathfinder \
                 miniGMG    \
-                RSBench
+                RSBench    \
+                SimpleMOC
 
 include $(LEVEL)/Makefile.programs

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/CMakeLists.txt Tue Sep  5 21:05:03 2017
@@ -0,0 +1,5 @@
+set(PROG SimpleMOC)
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS -std=gnu99)
+set(RUN_OPTIONS -i ${CMAKE_CURRENT_SOURCE_DIR}/default.in)
+llvm_multisource()

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/LICENSE Tue Sep  5 21:05:03 2017
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 John Tramm
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/Makefile Tue Sep  5 21:05:03 2017
@@ -0,0 +1,7 @@
+LEVEL = ../../../..
+
+PROG = SimpleMOC
+LDFLAGS = -lm
+CFLAGS = -std=gnu99
+RUN_OPTIONS = -i $(PROJ_SRC_DIR)/default.in
+include $(LEVEL)/MultiSource/Makefile.multisrc

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC.reference_output Tue Sep  5 21:05:03 2017
@@ -0,0 +1,12 @@
+                                 INITIALIZATION
+================================================================================
+Initializing 2D tracks...
+Initializing 3D tracks...
+Initializing flat source regions...
+Beginning XS Allocation...
+Beginning Source and Flux Parameter Allocation...
+================================================================================
+Starting transport sweep ...
+Renormalizing Flux...
+Renormalizing Flux Complete.
+exit 0

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/SimpleMOC_header.h Tue Sep  5 21:05:03 2017
@@ -0,0 +1,262 @@
+#ifndef __SimpleMOC_header
+#define __SimpleMOC_header
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<math.h>
+#include<string.h>
+#include<time.h>
+#include<stdbool.h>
+#include<limits.h>
+#include<assert.h>
+#include<pthread.h>
+#include<unistd.h>
+
+#ifdef MPI
+#include<mpi.h>
+#endif
+
+#ifdef OPENMP
+#include<omp.h>
+#endif
+
+#ifdef PAPI
+#include<papi.h>
+#endif
+
+#define TIMIING_INFO 0
+
+#include "glibc_compat_rand.h"
+#ifndef NO_GLIBC_COMPAT_RAND
+#define rand glibc_compat_rand
+#define srand glibc_compat_srand
+#endif
+
+// User inputs
+typedef struct{
+	int x_assemblies;          	/* Number of assemblies in the x-axis of the 
+								   reactor */
+	int y_assemblies;           /* Number of assemblies in the y-axis of the 
+								   reactor */
+	int cai;                    // Number of course axial intervals
+	int fai;                    /* Number of fine axial intervals per coarse 
+								   axial interval */
+	int axial_exp;             	// Axial source expansion order
+	float radial_ray_sep;     	// Radial ray separation
+	float axial_z_sep;        	// Axial stacked z-ray separation
+	int n_azimuthal;           	// Number of azimuthal angles
+	int n_polar_angles;        	// Number of polar angles
+	int n_egroups;             	// Number of energy groups
+	bool decompose;            	// Turn decomposition on/off
+	int decomp_assemblies_ax;  	// Number of sub-domains per assembly (axially)
+	long segments_per_track;   	// Average number of segments per track
+	float assembly_width;     	// Width of an assembly - 1.26 x 17 cm
+	float height;             	// Height of the reactor - 400 cm
+	float domain_height;      	// Z Height of a domain
+	float precision;		   	// precision for source convergence
+	long mype;                 	// MPI Rank
+	long ntracks_2D;           	// Number of 2D tracks (derived)
+	int z_stacked;             	// Number of z rays (derived)
+	long ntracks;              	/* Total number of 3D tracks per assembly
+								   (derived) */
+	int nthreads;              	// Number of OpenMP Threads
+	int papi_event_set;         // PAPI event set
+	// 0 - FLOPS   1 - Bandwidth   2 - CPU Stall reason
+	
+	// Source regions per assembly (3M estimate)
+	long n_2D_source_regions_per_assembly;
+
+	// Source regions per node (derived)	
+	long n_source_regions_per_node; 
+
+    #ifdef PAPI
+    // String for command line PAPI event
+    char event_name[PAPI_MAX_STR_LEN]; 
+    // Array to accumulate PAPI counts across all threads
+    long long *vals_accum;
+    #endif
+
+	bool load_tracks;		// Turn on/off loading 2D tracks from file
+	char* track_file;		// Name/address of tracking file to load
+
+	long segments_processed;    // Total number of segments processed per node
+
+} Input;
+
+// Localized geometrical region ID
+typedef struct{
+	long assembly;             // Assembly ID
+	long pin;                  // Pin Cell ID
+	long zone;                 // Zone (inside pin cell) ID
+} RegionID;
+
+// Segment Structure
+typedef struct{
+	float length;
+	long source_id;
+} Segment;
+
+// Track2D Structure
+typedef struct{
+	float az_weight;          	// Azimuthal Quadrature Weight (rand)
+	long n_segments;           	// Number of Segments (gaussian)
+	Segment * segments;        	// Array of Segments
+	int n_3D_segments;			// Number of intersections in 3D tracks
+} Track2D;
+
+// Track Structure
+typedef struct{
+	float p_weight;				// Polar Quadrature Weight     (rand)
+	float z_height;           	// Z-height
+	long rank_in;              	// MPI rank to receive from
+	long rank_out;             	// MPI rank to send to
+	float * f_psi;			   	// Forward angular flux along track
+	float * b_psi;				// Backward angular flux along track
+} Track;
+
+// Source Region Structure
+typedef struct{
+	float ** fine_flux;
+	float ** fine_source;
+	float vol;
+	float * sigT;
+	float ** XS;
+	float ** scattering_matrix;
+	#ifdef OPENMP
+	omp_lock_t * locks;
+	#endif
+} Source;
+
+// Table structure for computing exponential
+typedef struct{
+	float * values;
+	float dx;
+	float maxVal;
+	int N;
+} Table;
+
+// Params Structure for easier data pointer passing
+typedef struct{
+	Track2D * tracks_2D;
+	Track *** tracks;
+	Source * sources;
+   	float * polar_angles;
+	float * leakage;
+	Table expTable;
+} Params;
+
+// MPI 3D Grid info
+typedef struct{
+	#ifdef MPI
+	MPI_Comm cart_comm_3d;
+	MPI_Datatype Flux_Array;
+	#endif
+	int x_pos_src;
+	int x_pos_dest;
+	int x_neg_src;
+	int x_neg_dest;
+	int y_pos_src;
+	int y_pos_dest;
+	int y_neg_src;
+	int y_neg_dest;
+	int z_pos_src;
+	int z_pos_dest;
+	int z_neg_src;
+	int z_neg_dest;
+} CommGrid;
+
+// Attenuation Arrays
+typedef struct{
+	float * q0;
+	float * q1;
+	float * q2;
+	float * sigT;
+	float * tau;
+	float * sigT2;
+	float * expVal;
+	float * reuse;
+	float * flux_integral;
+	float * tally;
+	float * t1;
+	float * t2;
+	float * t3;
+	float * t4;
+} AttenuateVars;
+
+
+// init.c
+Input set_default_input( void );
+void set_small_input( Input * I );
+Params build_tracks( Input* I );
+CommGrid init_mpi_grid( Input I );
+void calculate_derived_inputs( Input * I );
+#ifdef OPENMP
+omp_lock_t * init_locks( Input I );
+#endif
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+void print_input_summary(Input input);
+void read_CLI( int argc, char * argv[], Input * input );
+void print_CLI_error(void);
+void read_input_file( Input * I, char * fname);
+
+// tracks.c
+Track2D * generate_2D_tracks( Input input, size_t * nbytes );
+void generate_2D_segments( Input input, Track2D * tracks,
+	   	size_t * nbytes );
+void free_2D_tracks( Track2D * tracks );
+Track *** generate_tracks(Input input, Track2D * tracks_2D, size_t * nbytes);
+void free_tracks( Track *** tracks );
+long segments_per_2D_track_distribution( Input I );
+float * generate_polar_angles( Input I );
+Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes);
+
+// utils.c
+float urand(void);
+float nrand(float mean, float sigma);
+float pairwise_sum(float * vector, long size);
+Table buildExponentialTable( float precision, float maxVal );
+float interpolateTable( Table table, float x);
+double get_time(void);
+size_t est_mem_usage( Input I );
+double time_per_intersection( Input I, double time );
+
+// source.c
+Source * initialize_sources( Input I, size_t * nbytes );
+void free_sources( Input I, Source * sources );
+
+// solver.c
+void transport_sweep( Params * params, Input * I );
+int get_pos_interval( float z, float dz);
+int get_neg_interval( float z, float dz);
+int get_alt_neg_interval( float z, float dz);
+void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I, 
+		Params * params, float ds, float mu, float az_weight, AttenuateVars * A ); 
+void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params, float ds, float mu, float az_weight, AttenuateVars * A );
+void alt_attenuate_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params, float ds, float mu, float az_weight );
+void renormalize_flux( Params params, Input I, CommGrid grid );
+float update_sources( Params params, Input I, float keff );
+float compute_keff( Params params, Input I, CommGrid grid);
+
+// test.c
+void gen_norm_pts(float mean, float sigma, int n_pts);
+void print_Input_struct( Input I );
+
+// papi.c
+void papi_serial_init(void);
+void counter_init( int *eventset, int *num_papi_events, Input * I );
+void counter_stop( int * eventset, int num_papi_events, Input * I );
+
+// comms.c
+#ifdef MPI
+void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid);
+void transfer_boundary_fluxes( Params params, Input I, CommGrid grid);
+#endif
+
+#endif

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/comms.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,199 @@
+#include"SimpleMOC_header.h"
+
+#ifdef MPI
+// Transfer information between nodes (angular fluxes)
+void fast_transfer_boundary_fluxes( Params params, Input I, CommGrid grid)
+{
+	MPI_Barrier(grid.cart_comm_3d);
+	if(I.mype==0) printf("Beginning Inter-Node Border Flux Transfer...\n");
+
+	int tracks_per_msg = 10000;
+
+	float h = I.domain_height;
+	float x = I.assembly_width;
+
+	// calculate number of tracks to each surface
+	long ntracks_per_axial_direction  = I.ntracks * x / (2*x + 4*h);
+	long ntracks_per_radial_direction = I.ntracks * h / (2*x + 4*h);
+
+	// correct so that all tracks are used and are symmetric
+	long remaining_tracks = I.ntracks - 2 * ntracks_per_axial_direction
+	   - 4 * ntracks_per_radial_direction;
+
+	long add_radial = remaining_tracks * ( 4*h / (2*x + 4*h) );
+	add_radial = 4 * (add_radial / 4);
+	ntracks_per_radial_direction += add_radial / 4;
+	
+	long add_axial = remaining_tracks - add_radial;
+	ntracks_per_axial_direction += add_axial / 2;
+
+	// Calculate all requests needed
+	long max_requests = ntracks_per_radial_direction / tracks_per_msg;
+	max_requests *= 4;
+	max_requests += 2 * (ntracks_per_axial_direction / tracks_per_msg );
+
+	// One for send, one for receive
+	max_requests *= 2;
+
+	long send_idx = 0;
+	MPI_Status stat;
+
+	// Computer Message Size
+	size_t bytes = I.n_egroups * sizeof(float) * tracks_per_msg;
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("MPI Message Size: %.2lf (MB)\n",
+			bytes / 1024. / 1024. );
+#endif
+	
+
+	// Use knowledge of underlying flux structure for efficiency
+	float * flux_array = params.tracks[0][0][0].f_psi;
+
+	// TODO: Send reverse direction as well!!!
+
+	// make an array of radial sending destinations
+	int send_dest[6] = 
+	{
+		grid.x_pos_dest,
+		grid.x_neg_dest,
+		grid.y_pos_dest,
+		grid.y_neg_dest,
+		grid.z_pos_dest,
+		grid.z_neg_dest
+	};
+	// make an array of radial receiving sources
+	int rec_sources[6] = 
+	{
+		grid.x_pos_src,
+		grid.x_neg_src,
+		grid.y_pos_src,
+		grid.y_neg_src,
+		grid.z_pos_src,
+		grid.z_neg_src
+	};
+	
+	// make an array of number of messages
+	// NOTE: There is some rounding here, should be corrected in real app
+	long num_messages[6] =
+	{
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_radial_direction / tracks_per_msg,
+		ntracks_per_axial_direction / tracks_per_msg,
+		ntracks_per_axial_direction / tracks_per_msg
+	};
+
+
+	// send_idx is now the beginning of the non-border region memory
+	// i.e., we need to actually MPI Send/Recv the rest of the data
+
+	// calculate the maximum number of messages sent in any direction
+	long max_msgs_per_dir = 0;
+	for( int i = 0; i < 6; i++ )
+		if( num_messages[i] > max_msgs_per_dir )
+		   max_msgs_per_dir = num_messages[i];
+
+	// Flux Memory is assumed to be laid out as follows:
+	// (Border Flux) --- (Send_MSG_Dir_0) (Recv_MSG_Dir_0) (Send_MSG_Dir_1) (Recv_MSG_Dir_1)....
+
+	// New Comms
+	float ** buffer = (float **) malloc(6 * sizeof(float*));
+	float * _buffer = (float *) malloc( 6 * tracks_per_msg * I.n_egroups * sizeof(float));
+	for( int i = 0; i < 6; i++ )
+		buffer[i] = &_buffer[i * tracks_per_msg * I.n_egroups];
+
+	long idx = 0;
+	for( long i = 0; i < max_msgs_per_dir; i++ )
+	{
+		MPI_Request request[12];
+		int active[6] = {0};
+		int mpi_send[6] = {0};
+		int mpi_recv[6] = {0};
+		long bookmark = idx;
+		for( int j = 0; j < 6; j++ )
+		{
+			if( i >= num_messages[j] )
+				continue;
+			
+			// check if border assembly
+			else if( send_dest[j] == -1 )
+			{
+				* params.leakage += pairwise_sum( &flux_array[idx],
+						I.n_egroups * tracks_per_msg );
+				idx += (long) I.n_egroups * tracks_per_msg;
+			}
+			else
+			{
+				MPI_Isend(
+					&flux_array[idx], 	// Send Buffer
+					tracks_per_msg,     // Number of Elements
+					grid.Flux_Array,  	/* Type of element 
+											(all energy group array) */
+					send_dest[j],	  	// Destination MPI rank
+					j,                	// Message ID
+					grid.cart_comm_3d,	// MPI Communicator
+					&request[j] );   	/* MPI Request (to monitor 
+											when call finishes) */
+				idx += (long) I.n_egroups * tracks_per_msg;
+				mpi_send[j] = 1;
+			}
+		}
+		for( int j = 0; j < 6; j++ )
+		{
+			if( i >= num_messages[j] )
+				continue;
+			
+			// Check if Border Case
+			else if( rec_sources[j] == -1)
+				for( long k =0; k < I.n_egroups * tracks_per_msg; k++)
+					buffer[j][k] = 0;
+			else
+			{
+				MPI_Irecv(
+						buffer[j],   		// Recv Buffer
+						tracks_per_msg,   	// Number of Elements
+						grid.Flux_Array,  	/* Type of element 
+												(all energy group array) */
+						rec_sources[j],		// MPI rank to Receive From
+						j,              	// Message ID
+						grid.cart_comm_3d,	// MPI Communicator
+						&request[6+j] ); 		/* MPI Request (to monitor 
+												when call finishes) */
+				mpi_recv[j] = 1;
+			}
+			active[j] = 1;
+		}
+
+		// Block for Comm Round to complete & copy received data out of buffer
+		for( int j = 0; j < 6; j++ )
+		{
+			if( mpi_send[j] == 1 )
+			{
+				MPI_Wait( &request[j], &stat );
+			}
+			if( mpi_recv[j] == 1 )
+			{
+				MPI_Wait( &request[6+j], &stat );
+			}
+			if( active[j] == 1 )
+			{
+				memcpy(&flux_array[bookmark], buffer[j],
+						I.n_egroups * tracks_per_msg * sizeof(float));
+				bookmark += (long) I.n_egroups*tracks_per_msg;
+			}
+		}
+
+
+	}
+
+	free(&buffer[0][0]);
+	free(buffer);
+
+
+	MPI_Barrier( grid.cart_comm_3d );
+	MPI_Barrier( MPI_COMM_WORLD);
+
+	if(I.mype==0) printf("Finished Inter-Node Border Flux Transfer.\n");
+}
+#endif

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/default.in Tue Sep  5 21:05:03 2017
@@ -0,0 +1,20 @@
+10 - x_assemblies - Number of assemblies in the x-axis of the reactor
+10 - y_assemblies - Number of assemblies in the y-axis of the reactor
+9 - cai - Number of coarse axial intervals
+5 - fai - Number of fine axial intervals per coarse  axial interval
+2 - axial_exp - Axial source expansion order
+0.05 - radial_ray_sep -  Radial ray separation
+0.25 - axial_z_sep -  Axial stacked z-ray separation
+6 - n_azimuthal  -  Number of azimuthal angles (should be 32)
+10 - n_polar_angles -  Number of polar angles
+10 - n_egroups -  Number of energy groups
+1 - decompose - Turn decomposition on/off (true = 1, false = 0) 
+20 - decomp_assemblies_ax - Number of assemblies per sub-domain (axially) 
+20 - segments_per_track - Average number of segments per track
+21.42 - assembly_width - Width of an assembly - 1.26 x 17 cm
+50.0 - height - Height of the reactor
+0.01 - precision - precision for source convergence
+1000 - n_2D_source_regions_per_assembly - 2D src regions per assembly 
+0 - papi_event_set - PAPI Event Set Choice
+0 - Turn on/off load tracks from OpenMOC (true = 1, false = 0)
+file_name - name of OpenMOC tracking file to load

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,60 @@
+/*===------------ glibc_compat_rand.c - glibc rand emulation --------------===*\
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#include "glibc_compat_rand.h"
+
+/**
+ * This rand implementation is designed to emulate the implementation of
+ * rand/srand in recent versions of glibc. This is used for programs which
+ * require this specific rand implementation in order to pass verification
+ * tests.
+ *
+ * For more information, see: http://www.mathstat.dal.ca/~selinger/random/
+ **/
+
+#define TABLE_SIZE 344
+static unsigned int table[TABLE_SIZE];
+static int next;
+
+int glibc_compat_rand(void) {
+  /* Calculate the indices i-3 and i-31 in the circular vector. */
+  int i3 = (next < 3) ? (TABLE_SIZE + next - 3) : (next - 3);
+  int i31 = (next < 31) ? (TABLE_SIZE + next - 31) : (next - 31);
+
+  table[next] = table[i3] + table[i31];
+  unsigned int r = table[next] >> 1;
+
+  ++next;
+  if (next >= TABLE_SIZE)
+    next = 0;
+
+  return r;
+}
+
+void glibc_compat_srand(unsigned int seed) {
+  if (seed == 0)
+    seed = 1;
+
+  table[0] = seed;
+
+  for (int i = 1; i < 31; i++) {
+    int r = (16807ll * table[i - 1]) % 2147483647;
+    if (r < 0)
+      r += 2147483647;
+
+    table[i] = r;
+  }
+
+  for (int i = 31; i < 34; i++)
+    table[i] = table[i - 31];
+  for (int i = 34; i < TABLE_SIZE; i++)
+    table[i] = table[i - 31] + table[i - 3];
+
+  next = 0;
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/glibc_compat_rand.h Tue Sep  5 21:05:03 2017
@@ -0,0 +1,16 @@
+/*===------------- glibc_compat_rand.h- glibc rand emulation --------------===*\
+ * |*
+ * |*                     The LLVM Compiler Infrastructure
+ * |*
+ * |* This file is distributed under the University of Illinois Open Source
+ * |* License. See LICENSE.TXT for details.
+ * |*
+ * \*===----------------------------------------------------------------------===*/
+
+#ifndef GLIBC_COMPAT_RAND_H
+#define GLIBC_COMPAT_RAND_H
+
+int glibc_compat_rand(void);
+void glibc_compat_srand(unsigned int seed);
+
+#endif /* GLIBC_COMPAT_RAND_H */

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/init.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,249 @@
+#include"SimpleMOC_header.h"
+
+// Calculate Derived Inputs
+void calculate_derived_inputs( Input * I )
+{
+	
+	#ifdef MPI
+	int mype;
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	I->mype = mype;
+	#endif
+
+	/* Divide number of azimuthal angles by 2 to accound for forward/backward
+	 *  tracking */
+	I->n_azimuthal /= 2;
+
+	// calculate number of 2D tracks, enforcing divisible by 2
+	I->ntracks_2D = I->n_azimuthal * 
+			(I->assembly_width * sqrt(2) / I->radial_ray_sep);
+
+	I->ntracks_2D = 2 * ( I->ntracks_2D / 2 );
+
+	I->z_stacked = (int) ( I->height / (I->axial_z_sep * I->decomp_assemblies_ax));
+	I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked;  
+	I->domain_height = I->height / I->decomp_assemblies_ax;
+
+	I->n_source_regions_per_node = I->n_2D_source_regions_per_assembly *
+		I->cai / I->decomp_assemblies_ax;
+
+}
+
+// Gets I from user and sets defaults
+Input set_default_input( void )
+{
+	Input I;
+
+	I.x_assemblies = 17;      	/* Number of assemblies in the x-axis 
+								   of the reactor */
+	I.y_assemblies = 17;        /* Number of assemblies in the y-axis 
+								   of the reactor */
+	I.cai = 27;                	// Number of coarse axial intervals
+	I.fai = 5;                  /* Number of fine axial intervals per coarse 
+								   axial interval */
+	I.axial_exp = 2;            // Axial source expansion order
+	I.radial_ray_sep = 0.05;    // Radial ray separation
+	I.axial_z_sep = 0.25;       // Axial stacked z-ray separation
+	I.n_azimuthal = 64;         // Number of azimuthal angles (should be 32)
+	I.n_polar_angles = 10;      // Number of polar angles
+	I.n_egroups = 104;        	// Number of energy groups
+	I.decompose = true;      	/* Turn decomposition on/off (true = on, 
+								   flase = off) */
+	I.decomp_assemblies_ax =20; /* Number of subdomains per assembly
+								   (axially) */
+	I.segments_per_track = 120;  // Average number of segments per track (123)
+	I.assembly_width = 21.42;   // Width of an assembly - 1.26 x 17 cm
+	I.height = 400.0;           // Height of the reactor - 400 cm
+	I.precision = 0.01;			// precision for source convergence
+	I.mype = 0;                 // MPI Rank
+
+	// source regions per assembly (estimate)
+	I.n_2D_source_regions_per_assembly = 5000; 
+
+	#ifdef PAPI
+	I.papi_event_set = 4;
+	#endif
+	
+	#ifdef OPENMP
+	I.nthreads = omp_get_max_threads();
+	#endif
+
+	I.load_tracks = false; 
+
+	return I;
+}
+
+// Changes defaults to small problem size
+void set_small_input( Input * I )
+{
+	I->x_assemblies = 15;      		/* Number of assemblies in the x-axis 
+								   		of the reactor */
+	I->y_assemblies = 15;        	/* Number of assemblies in the y-axis 
+								    	of the reactor */
+	I->cai = 5;                	 	// Number of coarse axial intervals
+	I->fai = 3;                  	/* Number of fine axial intervals per 
+									   coarse axial interval */
+	I->axial_exp = 2;            	// Axial source expansion order
+	I->radial_ray_sep = 0.5;     	// Radial ray separation
+	I->axial_z_sep = 0.2;        	// Axial stacked z-ray separation
+	I->n_azimuthal = 5;         	// Number of azimuthal angles
+	I->n_polar_angles = 5;      	// Number of polar angles
+	I->n_egroups = 104;        		// Number of energy groups
+	I->decompose = false;      	 	/* Turn decomposition on/off (true = on, 
+								   		flase = off) */
+	I->decomp_assemblies_ax = 1; 	/* Number of sub-domains per assembly
+								   		(axially) */
+	I->segments_per_track = 120;  	// Average number of segments per track
+	I->assembly_width = 1.26*17; 	// Width of an assembly - 1.26 x 17 cm
+	I->height = 400.0;           	// Height of the reactor - 400 cm
+	I->precision = 0.01;			// precision for source convergence
+
+	// source regions per assembly (estimate)
+	I->n_2D_source_regions_per_assembly = 3000; 
+}
+
+// Initializes all track data
+Params build_tracks( Input* input )
+{
+	Input I = *input;
+	size_t nbytes = 0;
+	Params params;
+
+	if(I.mype == 0)
+	{
+		center_print("INITIALIZATION", 79);
+		border_print();
+		printf("Initializing 2D tracks...\n");
+	}
+
+	if(I.load_tracks)
+	{
+		params.tracks_2D = load_OpenMOC_tracks(
+				I.track_file,false, input, &nbytes);
+		I = *input;
+	}
+	else
+    	params.tracks_2D = generate_2D_tracks(I, &nbytes); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		printf("Initializing 3D tracks...\n");
+	}
+
+	params.tracks = generate_tracks(I, params.tracks_2D, &nbytes);
+	params.polar_angles = generate_polar_angles( I ); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		printf("Initializing flat source regions...\n");
+	}
+
+	params.sources = initialize_sources(I, &nbytes); 
+
+	if(I.mype == 0)
+	{
+#ifdef PRINT_MEM_SIZES
+		printf("Memory allocated thus far (MB): %zu\n", nbytes / 1024 / 1014 );
+#endif
+		border_print();
+	}
+
+	// initialize to zero leakage
+	float * leakage = calloc( 1, sizeof(float) );
+	params.leakage = leakage;
+
+	// build exponential table for interpolation
+	params.expTable = buildExponentialTable( I.precision, 10.0 ); 
+
+	return params;
+}
+
+// Initializes 3D Cartesian Communication Grid + Shift Ranks
+CommGrid init_mpi_grid( Input I )
+{
+	CommGrid grid;
+
+	#ifdef MPI
+	MPI_Comm cart_comm_3d;
+	int ndims = 3;
+	int dims[3] = {2,2,1};
+	int period[3] = {0,0,0};
+   	int reorder = 1;
+
+	MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, period, reorder, 
+			&cart_comm_3d);
+
+	// X Shifts
+	int x_pos_src;
+	int x_pos_dest;
+	int x_neg_src;
+	int x_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 0,  1, &x_pos_src, &x_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 0, -1, &x_neg_src, &x_neg_dest );
+
+	// Y Shifts
+	int y_pos_src;
+	int y_pos_dest;
+	int y_neg_src;
+	int y_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 1,  1, &y_pos_src, &y_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 1, -1, &y_neg_src, &y_neg_dest );
+
+	// Z Shifts
+	int z_pos_src;
+	int z_pos_dest;
+	int z_neg_src;
+	int z_neg_dest;
+	MPI_Cart_shift( cart_comm_3d, 2,  1, &z_pos_src, &z_pos_dest );
+	MPI_Cart_shift( cart_comm_3d, 2, -1, &z_neg_src, &z_neg_dest );
+
+	grid.cart_comm_3d = cart_comm_3d;
+	grid.x_pos_src    = x_pos_src;
+	grid.x_pos_dest   = x_pos_dest;
+	grid.x_neg_src    = x_neg_src;
+	grid.x_neg_dest   = x_neg_dest;
+	grid.y_pos_src    = y_pos_src;
+	grid.y_pos_dest   = y_pos_dest;
+	grid.y_neg_src    = y_neg_src;
+	grid.y_neg_dest   = y_neg_dest;
+	grid.z_pos_src    = z_pos_src;
+	grid.z_pos_dest   = z_pos_dest;
+	grid.z_neg_src    = z_neg_src;
+	grid.z_neg_dest   = z_neg_dest;
+
+
+	// Init flux buffer MPI type
+	MPI_Datatype flux_array;
+	MPI_Type_contiguous(I.n_egroups, MPI_FLOAT, &flux_array);
+	MPI_Type_commit(&flux_array);
+
+	grid.Flux_Array = flux_array;
+
+	#endif
+
+	return grid;
+}
+
+#ifdef OPENMP
+// Intialized OpenMP Source Region Locks
+omp_lock_t * init_locks( Input I )
+{
+	// Allocate locks array
+	long n_locks = I.n_source_regions_per_node * I.fai; 
+	omp_lock_t * locks = (omp_lock_t *) malloc( n_locks* sizeof(omp_lock_t));
+	
+	// Initialize locks array
+	for( long i = 0; i < n_locks; i++ )
+		omp_init_lock(&locks[i]);
+
+	return locks;
+}	
+#endif
+
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/io.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,276 @@
+#include "SimpleMOC_header.h"
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+"              _____ _                 _      __  __  ____   _____ \n"
+"             / ____(_)               | |    |  \\/  |/ __ \\ / ____|\n"
+"            | (___  _ _ __ ___  _ __ | | ___| \\  / | |  | | |     \n"
+"             \\___ \\| | '_ ` _ \\| '_ \\| |/ _ \\ |\\/| | |  | | |     \n"
+"             ____) | | | | | | | |_) | |  __/ |  | | |__| | |____ \n"
+"            |_____/|_|_| |_| |_| .__/|_|\\___|_|  |_|\\____/ \\_____|\n"
+"                               | |                                \n"
+"                               |_|                                \n"
+	);
+	border_print();
+	printf("\n");
+	center_print("Developed at", 79);
+	center_print("The Massachusetts Institute of Technology", 79);
+	center_print("and", 79);
+	center_print("Argonne National Laboratory", 79);
+	printf("\n");
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	printf("\n");
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+// Prints a border
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( int a )
+{
+    if( a < 1000 )
+        printf("%d\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%d,%03d\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%d,%03d,%03d,%03d\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%d\n",a);
+}
+
+// Prints out the summary of User input
+void print_input_summary(Input I)
+{
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	#ifdef MPI
+	int nranks;
+	MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+	printf("%-35s%d\n", "MPI Ranks:", nranks); 
+	#endif
+	#ifdef OPENMP
+	printf("%-35s%d\n", "Number of Threads:", I.nthreads);
+	#endif
+	printf("%-35s%d\n", "x-axis assemblies:", I.x_assemblies);
+	printf("%-35s%d\n", "y-axis assemblies:", I.y_assemblies);
+	printf("%-35s%d\n", "coarse axial intervals:", I.cai);
+	printf("%-35s%d\n", "fine axial intervals:", I.fai);
+	printf("%-35s%d\n", "axial source expansion order:", I.axial_exp);
+	printf("%-35s%.2lf\n", "radial ray separation:", I.radial_ray_sep);
+	printf("%-35s%.2lf\n", "axial z-ray separation:", I.axial_z_sep);
+	printf("%-35s%d\n", "azimuthal angles:", I.n_azimuthal);
+	printf("%-35s%d\n", "polar angles:", I.n_polar_angles);
+	printf("%-35s%d\n", "energy groups:", I.n_egroups);
+	printf("%-35s%d\n", "assemblies per axial sub-domain:", 
+			I.decomp_assemblies_ax);
+	printf("%-35s%ld\n", "avg segments per track:", I.segments_per_track);
+	printf("%-35s%.2lf\n", "assembly width:", I.assembly_width);
+	printf("%-35s%.2lf\n", "reactor height:", I.height);
+	printf("%-35s%ld\n", "2D Src regions per assembly:", 
+			I.n_2D_source_regions_per_assembly);
+	printf("%-35s%ld\n", "2D Tracks:", I.ntracks_2D);
+	printf("%-35s", "3D Tracks:"); fancy_int(I.ntracks);
+#ifdef PRINT_MEM_SIZES
+	printf("%-35s%zu (MB)\n", "Estimated Memory Usage:",
+			est_mem_usage(I) / 1024 / 1024);
+#endif
+	#ifdef PAPI
+    if( I.papi_event_set == -1)
+        printf("%-35s%s\n", "PAPI event to count:", I.event_name);
+	#endif
+	border_print();
+}
+
+// reads command line inputs and applies options
+void read_CLI( int argc, char * argv[], Input * input )
+{
+	// defaults to max threads on the system	
+	#ifdef OPENMP
+	input->nthreads = omp_get_num_procs();
+	#else
+	input->nthreads = 1;
+	#endif
+	
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// nthreads (-t)
+		if( strcmp(arg, "-t") == 0 )
+		{
+			if( ++i < argc )
+				input->nthreads = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// input file (-i)
+		else if( strcmp(arg, "-i") == 0 )
+		{
+			if( ++i < argc )
+				read_input_file( input, argv[i]);
+			else
+				print_CLI_error();
+		}
+		// set input for small problem (-s)
+		else if( strcmp(arg, "-s") == 0)
+			set_small_input( input );
+
+        #ifdef PAPI
+        // Add single PAPI event
+        else if( strcmp(arg, "-p") == 0 )
+        {
+            if( ++i < argc ){
+                input->papi_event_set = -1;
+                strcpy(input->event_name, argv[i]);
+            }
+            else
+                print_CLI_error();
+        }
+        #endif
+		// Load OpenMOC track data file
+		else if( strcmp(arg,"-d") == 0)
+		{
+			if( ++i < argc )
+			{
+				input->track_file = argv[i];
+				input->load_tracks = true;
+			}
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+
+	// Validate nthreads
+	if( input->nthreads < 1 )
+		print_CLI_error();
+}
+
+// print error to screen, inform program options
+void print_CLI_error(void)
+{
+	printf("Usage: ./SimpleMOC <options>\n");
+	printf("Options include:\n");
+	printf("  -t <threads>     Number of OpenMP threads to run\n");
+	printf("  -i <filename>    Input file name\n");
+    printf("  -p <PAPI event>  PAPI event name to count (1 only) \n");
+    printf("  -s               Small problem flag \n");
+	printf("  -d <filename>    OpenMOC tracking file\n");
+	printf("See readme for full description of default run values\n");
+	exit(1);
+}
+
+// read input file describing problem parameters
+void read_input_file( Input * I, char * fname)
+{
+	#if TIMING_INFO | 0
+		printf("Attempting to open FIle: %s\n", fname);
+	#endif
+    	FILE * fp = fopen(fname, "r");
+	#if TIMING_INFO | 0
+		printf("Opened FIle: %s\n", fname);
+	#endif
+	if( fp == NULL )
+		printf("FIle Open FAILED\n");
+    char c[255];
+
+	char * stat;
+	int err;
+
+    err = fscanf(fp, "%d", &I->x_assemblies);
+    stat = fgets(c, 255, fp);
+
+    err = fscanf(fp, "%d", &I->y_assemblies);
+    stat = fgets(c, 255, fp);
+    
+	err = fscanf(fp, "%d", &I->cai);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->fai);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->axial_exp);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->radial_ray_sep);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->axial_z_sep);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_azimuthal);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_polar_angles);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->n_egroups);
+    stat = fgets(c, 255, fp);
+
+	int decompose;	
+	err = fscanf(fp, "%d", &decompose);
+    stat = fgets(c, 255, fp);
+	if(decompose == 0)
+		I->decompose = false;
+	else
+		I->decompose = true;
+	
+	err = fscanf(fp, "%d", &I->decomp_assemblies_ax);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%ld", &I->segments_per_track);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->assembly_width);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->height);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%f", &I->precision);
+    stat = fgets(c, 255, fp);
+
+	err = fscanf(fp, "%ld", &I->n_2D_source_regions_per_assembly);
+    stat = fgets(c, 255, fp);
+	
+	err = fscanf(fp, "%d", &I->papi_event_set);
+    stat = fgets(c, 255, fp);
+
+	fclose(fp);
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/main.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,147 @@
+#include"SimpleMOC_header.h"
+
+int main( int argc, char * argv[] )
+{
+	int version = 3;
+	int mype = 1;
+
+	#ifdef MPI
+	int nranks;
+	MPI_Status stat;
+	MPI_Init(&argc, &argv);
+	MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+	MPI_Comm_rank(MPI_COMM_WORLD, &mype);
+	#endif
+
+	#ifdef PAPI
+	papi_serial_init();
+	#endif
+
+	srand(time(NULL) * (mype+1));
+
+	Input input = set_default_input();
+	read_CLI( argc, argv, &input );
+	calculate_derived_inputs( &input );
+
+	if( mype == 0 )
+		logo(version);
+
+	#ifdef OPENMP
+	omp_set_num_threads(input.nthreads); 
+	#endif
+
+	Params params = build_tracks( &input );
+	CommGrid grid = init_mpi_grid( input );
+
+	if( mype == 0 )
+		print_input_summary(input);
+
+	float res;
+	float keff = 1.0;
+	int num_iters = 1;
+
+
+	double time_transport = 0;
+	double time_flux_exchange = 0;
+	double time_renormalize_flux = 0;
+	double time_update_sources = 0;
+	double time_compute_keff = 0;
+	double start, stop;
+
+	if(mype==0)
+	{
+		center_print("SIMULATION", 79);
+		border_print();
+	}
+
+	for( int i = 0; i < num_iters; i++)
+	{
+		// Transport Sweep
+		start = get_time();
+		transport_sweep(&params, &input);
+		stop = get_time();
+		time_transport += stop-start;
+
+		// Domain Boundary Flux Exchange (MPI)
+		#ifdef MPI
+		start = get_time();
+		fast_transfer_boundary_fluxes(params, input, grid);
+		stop = get_time();
+		time_flux_exchange += stop-start;
+		#endif
+
+		// Flux Renormalization
+		start = get_time();
+		renormalize_flux(params,input, grid);
+		stop = get_time();
+		time_renormalize_flux += stop-start;
+
+		// Update Source Regions
+		start = get_time();
+		res = update_sources(params, input, keff);
+		stop = get_time();
+		time_update_sources += stop-start;
+
+		// Calculate K-Effective
+		start = get_time();
+		keff = compute_keff(params, input, grid);
+		stop = get_time();
+		time_compute_keff += stop-start;
+		if( mype == 0 )
+			printf("keff = %f\n", keff);
+	}
+
+	double time_total = time_transport + time_flux_exchange 
+		+ time_renormalize_flux + time_update_sources + time_compute_keff;
+
+	if( mype == 0 )
+	{
+		border_print();
+		center_print("RESULTS SUMMARY", 79);
+		border_print();
+
+		printf("Transport Sweep Time:         %6.2lf sec   (%4.1lf%%)\n", 
+				time_transport, 100*time_transport/time_total);
+		printf("Domain Flux Exchange Time:    %6.2lf sec   (%4.1lf%%)\n", 
+				time_flux_exchange, 100*time_flux_exchange/time_total);
+		printf("Flux Renormalization Time:    %6.2lf sec   (%4.1lf%%)\n", 
+				time_renormalize_flux, 100*time_renormalize_flux/time_total);
+		printf("Update Source Time:           %6.2lf sec   (%4.1lf%%)\n", 
+				time_update_sources, 100*time_update_sources/time_total);
+		printf("K-Effective Calc Time:        %6.2lf sec   (%4.1lf%%)\n", 
+				time_compute_keff, 100*time_compute_keff/time_total);
+		printf("Total Time:                   %6.2lf sec\n", time_total);
+	}
+
+	long tracks_per_second = 2 * input.ntracks/time_transport;
+
+	#ifdef MPI
+	MPI_Barrier(grid.cart_comm_3d);
+	long global_tps = 0;
+	MPI_Reduce( &tracks_per_second, // Send Buffer
+			&global_tps,            // Receive Buffer
+			1,                    	// Element Count
+			MPI_LONG,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+	MPI_Barrier(grid.cart_comm_3d);
+	tracks_per_second = global_tps;
+	#endif
+
+	if( mype == 0 )
+	{
+		printf("Time per Intersection:          ");
+		printf("%.2lf ns\n", time_per_intersection( input, time_transport ));
+		border_print();
+	}
+
+	free_2D_tracks( params.tracks_2D );
+	free_tracks( params.tracks );
+
+	#ifdef MPI
+	MPI_Finalize();
+	#endif
+
+	return 0;
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/papi.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,493 @@
+#include"SimpleMOC_header.h"
+
+#ifdef PAPI
+
+// initialize papi with one thread first
+void papi_serial_init(void)
+{
+	if ( PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT){
+		fprintf(stderr, "PAPI library init error!\n");
+		exit(1);
+	}
+	if (( PAPI_thread_init((long unsigned int (*)(void))
+					pthread_self )) != PAPI_OK){
+		PAPI_perror("PAPI_thread_init");
+		exit(1);
+	}
+}
+
+void counter_init( int *eventset, int *num_papi_events, Input * I )
+{
+	char error_str[PAPI_MAX_STR_LEN];
+	int stat;
+
+	int * events;
+    
+    // Command line event
+    if( I->papi_event_set == -1){
+        *num_papi_events = 1;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+        PAPI_event_name_to_code( I->event_name, &events[0]);
+    }
+	// FLOPS
+	if( I->papi_event_set == 0 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_SP_OPS;
+		events[1] = PAPI_TOT_CYC;
+	}
+	// Bandwidth
+	if( I->papi_event_set == 1 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_L3_TCM;
+		events[1] = PAPI_TOT_CYC;
+	}
+	// CPU Stall Reason
+	if( I->papi_event_set == 2 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "RESOURCE_STALLS:ANY";
+		char * event2 = "RESOURCE_STALLS:SB";
+		char * event3 = "RESOURCE_STALLS:RS";
+		char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+	// CPU Stall Percentage
+	if( I->papi_event_set == 3 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "RESOURCE_STALLS:ANY";
+		char * event2 = "PAPI_TOT_CYC";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+	}
+	// Memory Loads
+	if( I->papi_event_set == 4 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "MEM_LOAD_UOPS_RETIRED";
+		char * event2 = "MEM_LOAD_UOPS_RETIRED:L1_HIT";
+		char * event3 = "MEM_LOAD_UOPS_RETIRED:L2_HIT";
+		char * event4 = "MEM_LOAD_UOPS_RETIRED:L3_HIT";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+	// LLC Miss Rate
+	if( I->papi_event_set == 5 )
+	{
+		*num_papi_events = 2;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_L3_TCM;
+		events[1] = PAPI_L3_TCA;
+	}
+	// Branch MisPrediction
+	if( I->papi_event_set == 6 )
+	{
+		*num_papi_events = 3;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		events[0] = PAPI_BR_MSP;
+		events[1] = PAPI_BR_CN;
+		events[2] = PAPI_BR_PRC;
+	}
+	// TLB Misses
+	if( I->papi_event_set == 7 )
+	{
+		*num_papi_events = 4;
+		events = (int *) malloc( *num_papi_events * sizeof(int));
+		int EventCode;
+		char * event1 = "perf::DTLB-LOADS";
+		char * event2 = "perf::DTLB-LOAD-MISSES";
+		char * event3 = "perf::DTLB-STORES";
+		char * event4 = "perf::DTLB-STORE-MISSES";
+		PAPI_event_name_to_code( event1, &EventCode );
+		events[0] = EventCode;	
+		PAPI_event_name_to_code( event2, &EventCode );
+		events[1] = EventCode;	
+		PAPI_event_name_to_code( event3, &EventCode );
+		events[2] = EventCode;	
+		PAPI_event_name_to_code( event4, &EventCode );
+		events[3] = EventCode;	
+	}
+
+	/////////////////////////////////////////////////////////////////////////
+	//                        PAPI EVENT SELECTION
+	/////////////////////////////////////////////////////////////////////////
+	// User can comment/uncomment blocks as they see fit within this seciton
+
+	// Some Standard Events
+	//int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
+
+	// Bandwidth Used
+	// ((PAPI_Lx_TCM * Lx_linesize) / PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = {PAPI_L3_TCM, PAPI_TOT_CYC};
+
+	// L3 Total Cache Miss Ratio
+	// PAPI_L3_TCM / PAPI_L3_TCA
+	// (On Xeon dual octo -  65%, not dependent on # of threads)
+	//int events[] = {PAPI_L3_TCM, PAPI_L3_TCA};
+
+	// % Cycles with no instruction use
+	// PAPI_STL_ICY / PAPI_TOT_CYC
+	//int events[] = { PAPI_STL_ICY, PAPI_TOT_CYC };
+
+	// % Branch instructions Mispredicted
+	// PAPI_BR_MSP / PAPI_BR_CN
+	//int events[] = { PAPI_BR_MSP, PAPI_BR_CN, PAPI_BR_PRC };
+
+	// TLB Misses
+	//int events[] = { PAPI_TLB_DM };
+
+	// MFlops
+	// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = { PAPI_FP_INS, PAPI_TOT_CYC };
+
+	// MFlops (Alternate?)
+	// (PAPI_FP_INS/PAPI_TOT_CYC) * Clock(MHz)
+	//int events[] = { PAPI_DP_OPS, PAPI_TOT_CYC };
+
+
+	// TLB misses (Using native counters)
+	/*
+	   int events[2];
+	   int EventCode;
+	   char * event1 = "perf::DTLB-LOADS";
+	   char * event2 = "perf::DTLB-LOAD-MISSES";
+	   PAPI_event_name_to_code( event1, &EventCode );
+	   events[0] = EventCode;	
+	   PAPI_event_name_to_code( event2, &EventCode );
+	   events[1] = EventCode;	
+	   */
+
+	/*	
+	// Stalled Cycles, front v back (Using native counters)
+	int events[3];
+	int EventCode;
+	char * event1 = "perf::STALLED-CYCLES-FRONTEND";
+	char * event2 = "perf::STALLED-CYCLES-BACKEND";
+	char * event3 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	PAPI_event_name_to_code( event3, &EventCode );
+	events[2] = EventCode;	
+	*/	
+	/*
+	// LLC Cache Misses (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "ix86arch::LLC_REFERENCES";
+	char * event2 = "ix86arch::LLC_MISSES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// Node Prefetch Misses (Using native counters)
+	int events[1];
+	int EventCode;
+	//char * event1 = "perf::NODE-PREFETCHES";
+	//char * event2 = "perf::NODE-PREFETCH-MISSES";
+	char * event1 = "perf::NODE-PREFETCHES";
+	char * event2 = "perf::NODE-LOAD-MISSES:COUNT";
+	//PAPI_event_name_to_code( event1, &EventCode );
+	//events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[0] = EventCode;	
+	*/
+
+	/*
+	// CPU Stalls Due to lack of Load Buffers (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:LB";
+	char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/	
+	/*
+	// CPU Stalls Due to ANY Resource (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "PAPI_TOT_CYC";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// CPU Stalls at Reservation Station (Using native counters)
+	int events[2];
+	int EventCode;
+	char * event1 = "RESOURCE_STALLS:RS";
+	char * event2 = "perf::PERF_COUNT_HW_CPU_CYCLES";
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	*/
+
+	/*
+	// CPU Stall Reason Breakdown (Using native counters)
+	int events[4];
+	int EventCode;
+
+	// Set 1
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:LB";
+	char * event3 = "RESOURCE_STALLS:RS";
+	char * event4 = "RESOURCE_STALLS:SB";
+	// Set 1
+
+	// Set 2
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:ROB";
+	char * event3 = "RESOURCE_STALLS:MEM_RS";
+	char * event4 = "RESOURCE_STALLS2:ALL_FL_EMPTY";
+	// Set 2
+	// Set 3
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS2:ALL_PRF_CONTROL";
+	char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
+	char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+	// Set 3
+	char * event1 = "RESOURCE_STALLS:ANY";
+	char * event2 = "RESOURCE_STALLS:SB";
+	char * event3 = "RESOURCE_STALLS:RS"; // duplicate
+	char * event4 = "RESOURCE_STALLS2:OOO_RSRC";
+
+
+	// Events that don't need to be counted
+	// Don't bother measuring these
+	//char * event1 = "RESOURCE_STALLS:FCSW"; // Always 0, don't measure
+	//char * event1 = "RESOURCE_STALLS:MXCSR"; // Always 0, don't measure
+	//char * event3 = "RESOURCE_STALLS2:BOB_FULL"; // Always trivial
+	//char * event3 = "RESOURCE_STALLS2:ANY_PRF_CONTROL"; // duplicate
+
+	PAPI_event_name_to_code( event1, &EventCode );
+	events[0] = EventCode;	
+	PAPI_event_name_to_code( event2, &EventCode );
+	events[1] = EventCode;	
+	PAPI_event_name_to_code( event3, &EventCode );
+	events[2] = EventCode;	
+	PAPI_event_name_to_code( event4, &EventCode );
+	events[3] = EventCode;	
+	*/
+
+	/////////////////////////////////////////////////////////////////////////
+	//                        PAPI EVENT LOADING
+	/////////////////////////////////////////////////////////////////////////
+	// Users should not need to alter anything within this section
+
+	int thread = omp_get_thread_num();
+
+	if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK)
+	{
+		PAPI_perror("PAPI_create_eventset");
+		exit(1);
+	}
+
+	for( int i = 0; i < *num_papi_events; i++ )
+	{
+		if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK)
+		{
+			PAPI_perror("PAPI_add_event");
+			exit(1);
+		}
+	}
+
+	if ((stat=PAPI_start(*eventset)) != PAPI_OK)
+	{
+		PAPI_perror("PAPI_start");
+		exit(1);
+	}
+}
+
+
+
+
+/*
+   void counter_init( int *eventset, int *num_papi_events )
+   {
+   char error_str[PAPI_MAX_STR_LEN];
+//  int events[] = {PAPI_TOT_INS,PAPI_BR_INS,PAPI_SR_INS};
+int events[] = {PAPI_TOT_INS,PAPI_LD_INS,PAPI_FP_INS};
+int events[] = {ix86arch::LLC_REFERENCES, 
+int stat;
+
+int thread = omp_get_thread_num();
+if( thread == 0 )
+printf("Initializing PAPI counters...\n");
+
+ *num_papi_events = sizeof(events) / sizeof(int);
+
+ if ((stat = PAPI_thread_init((long unsigned int (*)(void)) omp_get_thread_num)) != PAPI_OK){
+ PAPI_perror("PAPI_thread_init");
+ exit(1);
+ }
+
+ if ( (stat= PAPI_create_eventset(eventset)) != PAPI_OK){
+ PAPI_perror("PAPI_create_eventset");
+ exit(1);
+ }
+
+ for( int i = 0; i < *num_papi_events; i++ ){
+ if ((stat=PAPI_add_event(*eventset,events[i])) != PAPI_OK){
+ PAPI_perror("PAPI_add_event");
+ exit(1);
+ }
+ }
+
+ if ((stat=PAPI_start(*eventset)) != PAPI_OK){
+ PAPI_perror("PAPI_start");
+ exit(1);
+ }
+ }
+ */
+
+// Stops the papi counters and prints results
+void counter_stop( int * eventset, int num_papi_events, Input * I )
+{
+	int * events = malloc(num_papi_events * sizeof(int));
+	int n = num_papi_events;
+	PAPI_list_events( *eventset, events, &n );
+	PAPI_event_info_t info;
+
+	long_long * values = malloc( num_papi_events * sizeof(long_long));
+	PAPI_stop(*eventset, values);
+	int thread = omp_get_thread_num();
+	int nthreads = omp_get_num_threads();
+
+	static long LLC_cache_miss = 0;
+	static long total_cycles = 0;
+	static long FLOPS = 0;
+	static long stall_any = 0;
+	static long stall_SB = 0;
+	static long stall_RS = 0;
+	static long stall_OO = 0;
+	static long tlb_load = 0;
+	static long tlb_load_m = 0;
+	static long tlb_store = 0;
+	static long tlb_store_m = 0;
+
+    #pragma omp master
+    {
+        I->vals_accum = malloc( num_papi_events * sizeof(long long));
+        for(int i=0; i < num_papi_events ; i ++)
+            I->vals_accum[i] = 0;
+    }
+    #pragma omp barrier
+
+	#pragma omp critical (papi)
+	{
+		printf("Thread %d\n", thread);
+		for( int i = 0; i < num_papi_events; i++ )
+		{
+            I->vals_accum[i] += values[i];
+			PAPI_get_event_info(events[i], &info);
+			printf("%-15lld\t%s\t%s\n", values[i],info.symbol,info.long_descr);
+			if( strcmp(info.symbol, "PAPI_L3_TCM") == 0 )
+				LLC_cache_miss += values[i];
+			if( strcmp(info.symbol, "PAPI_TOT_CYC") == 0 )
+				total_cycles += values[i];
+			if( strcmp(info.symbol, "PAPI_SP_OPS") == 0 )
+				FLOPS += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:ANY") == 0 )
+				stall_any += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:SB") == 0 )
+				stall_SB += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS:RS") == 0 )
+				stall_RS += values[i];
+			if( strcmp(info.symbol, "RESOURCE_STALLS2:OOO_RSRC") == 0 )
+				stall_OO += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-LOADS") == 0 )
+				tlb_load += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-LOAD-MISSES") == 0 )
+				tlb_load_m += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-STORES") == 0 )
+				tlb_store += values[i];
+			if( strcmp(info.symbol, "perf::DTLB-STORE-MISSES") == 0 )
+				tlb_store_m += values[i];
+		}
+		free(values);	
+	}
+	{
+		#pragma omp barrier
+	}
+	#pragma omp master
+	{
+        if( omp_get_num_threads() > 1){
+            printf("Thread Totals:\n");
+            for( int i = 0; i < num_papi_events; i++ )
+            {
+                PAPI_get_event_info(events[i], &info);
+                printf("%-15lld\t%s\t%s\n", I->vals_accum[i],info.symbol,info.long_descr);
+            }
+        }
+        free( I->vals_accum );
+
+		border_print();
+		center_print("PERFORMANCE SUMMARY", 79);
+		border_print();
+		long cycles = (long) (total_cycles / (double) nthreads);
+		double bw = LLC_cache_miss*64./cycles*2.8e9/1024./1024./1024.;
+		if( I->papi_event_set == 0 )
+			printf("GFLOPs: %.3lf\n", FLOPS / (double) cycles * 2.8  );
+		if( I->papi_event_set == 1 )
+			printf("Bandwidth: %.3lf (GB/s)\n", bw);
+		if( I->papi_event_set == 2 )
+		{
+			printf("%-30s %.2lf%%\n", "Store Buffer Full:",
+					stall_SB / (double) stall_any * 100.);
+			printf("%-30s %.2lf%%\n", "Reservation Station Full:",
+					stall_RS / (double) stall_any * 100.);
+			printf("%-30s %.2lf%%\n", "OO Pipeline Full:",
+					stall_OO / (double) stall_any * 100.);
+		}
+		if( I->papi_event_set == 3 )
+			printf("CPU Stalled Cycles: %.2lf%%\n",
+					stall_any / (double) total_cycles * 100.);	
+		if( I->papi_event_set == 7 )
+		{
+			printf("%-30s %.2lf%%\n", "Data TLB Load Miss Rate: ",
+					tlb_load_m / (double) tlb_load * 100 );
+			printf("%-30s %.2lf%%\n", "Data TLB Store Miss Rate: ",
+					tlb_store_m / (double) tlb_store * 100 );
+		}
+
+		border_print();
+	}
+    free(events);
+}
+
+#endif

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/solver.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,1469 @@
+#include"SimpleMOC_header.h"
+
+/* Efficient version of attenuate fluxes which determines the change in angular
+ * flux along a particular track across a fine axial region and tallies the 
+ * contribution to the scalar flux in the fine axial region. This function
+ * assumes a quadratic source, which is calculated on the fly using neighboring
+ * source values. 
+ * 
+ * This version decomposes the work into many for loops for efficient SIMD 
+ * instructions and to reduce register pressure. For a more descriptive 
+ * (but less effiient) version of the code in terms of the underlying physics, 
+ * see alt_attenuate_fluxes which solves the problem in a more naive, 
+ * straightforward manner. */
+void attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I_in,
+		Params * params_in, float ds, float mu, float az_weight, 
+		AttenuateVars * A ) 
+{
+	Input I = *I_in;
+	Params params = *params_in;
+
+	// unload attenuate vars
+	float * restrict q0 = A->q0;
+	float *  restrict q1 = A->q1;
+	float *  restrict q2 = A->q2;
+	float *  restrict sigT = A->sigT;
+	float *  restrict tau = A->tau;
+	float *  restrict sigT2 = A->sigT2;
+	float *  restrict expVal = A->expVal;
+	float *  restrict reuse = A->reuse;
+	float *  restrict flux_integral = A->flux_integral;
+	float *  restrict tally = A->tally;
+	float *  restrict t1 = A->t1;
+	float *  restrict t2 = A->t2;
+	float *  restrict t3 = A->t3;
+	float *  restrict t4 = A->t4;
+
+	// compute fine axial interval spacing
+	float dz = I.height / (I.fai * I.decomp_assemblies_ax * I.cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * 
+		( (int)( track->z_height / dz ) + 0.5f );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I.fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight;
+	float mu2 = mu * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = QSR -> fine_flux[fine_id];
+
+	if( fine_id == 0 )
+	{
+		// adjust z height to account for edge
+		zin -= dz;
+
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id][g];
+			float y2 = QSR->fine_source[fine_id+1][g];
+			float y3 = QSR->fine_source[fine_id+2][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+			
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+	else if ( fine_id == I.fai - 1 )
+	{
+		// adjust z height to account for edge
+		zin += dz;
+		
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-2][g];
+			float y2 = QSR->fine_source[fine_id-1][g];
+			float y3 = QSR->fine_source[fine_id][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+	else
+	{
+		// cycle over energy groups
+		#ifdef INTEL
+		#pragma simd
+		#elif defined IBM
+		#pragma simd_level(10)
+		#endif
+		for( int g = 0; g < I.n_egroups; g++)
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2.f*dz);
+			float c2 = (y1 - 2.f*y2 + y3) / (2.f*dz*dz);
+
+			// calculate q0, q1, q2
+			q0[g] = c0 + c1*zin + c2*zin*zin;
+			q1[g] = c1 + 2.f*c2*zin;
+			q2[g] = c2;
+		}
+	}
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// load total cross section
+		sigT[g] = QSR->sigT[g];
+
+		// calculate common values for efficiency
+		tau[g] = sigT[g] * ds;
+		sigT2[g] = sigT[g] * sigT[g];
+	}
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+		expVal[g] = interpolateTable( params.expTable, tau[g] );  
+
+	// Flux Integral
+
+	// Re-used Term
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		reuse[g] = tau[g] * (tau[g] - 2.f) + 2.f * expVal[g] 
+			/ (sigT[g] * sigT2[g]); 
+	}
+
+
+	float * psi;
+	if(forward)
+		psi = track->f_psi;
+	else
+		psi = track->b_psi;
+
+	//#pragma vector nontemporal
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// add contribution to new source flux
+		flux_integral[g] = (q0[g] * tau[g] + (sigT[g] * psi[g] - q0[g])
+			* expVal[g]) / sigT2[g] + q1[g] * mu * reuse[g] + q2[g] * mu2 
+			* (tau[g] * (tau[g] * (tau[g] - 3.f) + 6.f) - 6.f * expVal[g]) 
+			/ (3.f * sigT2[g] * sigT2[g]);
+	}
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		// Prepare tally
+		tally[g] = weight * flux_integral[g];
+	}
+
+	#ifdef OPENMP
+	omp_set_lock(QSR->locks + fine_id);
+	#endif
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		FSR_flux[g] += tally[g];
+	}
+
+	#ifdef OPENMP
+	omp_unset_lock(QSR->locks + fine_id);
+	#endif
+
+	// Term 1
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t1[g] = q0[g] * expVal[g] / sigT[g];  
+	}
+	// Term 2
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t2[g] = q1[g] * mu * (tau[g] - expVal[g]) / sigT2[g]; 
+	}
+	// Term 3
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t3[g] =	q2[g] * mu2 * reuse[g];
+	}
+	// Term 4
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		t4[g] = psi[g] * (1.f - expVal[g]);
+	}
+	// Total psi
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I.n_egroups; g++)
+	{
+		psi[g] = t1[g] + t2[g] + t3[g] + t4[g];
+	}
+}	
+
+// single direction transport sweep
+void transport_sweep( Params * params, Input * I )
+{
+	if(I->mype==0) printf("Starting transport sweep ...\n");
+
+	// calculate the height of a node's domain and of each FSR
+	double node_delta_z = I->height / I->decomp_assemblies_ax;
+	double fine_delta_z = node_delta_z / (I->cai * I->fai);
+
+	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
+	 * angles, polar angles, and z stacked rays) */
+
+	//print_Input_struct( I );
+	long segments_processed = 0;
+
+	#pragma omp parallel default(none) \
+	shared( I, params, node_delta_z, fine_delta_z ) \
+	reduction(+ : segments_processed )
+	{
+		#ifdef OPENMP
+		int thread = omp_get_thread_num();
+		int nthreads = omp_get_num_threads();
+		unsigned int seed = time(NULL) * (thread+1);
+		#endif
+		//print_Input_struct( I );
+
+		#ifdef PAPI
+		int eventset = PAPI_NULL;
+		int num_papi_events;
+		#pragma omp critical
+		{
+			counter_init(&eventset, &num_papi_events, I);
+		}
+		#endif
+
+		AttenuateVars A;
+		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
+		A.q0 = ptr;
+		ptr += I->n_egroups;
+		A.q1 = ptr;
+		ptr += I->n_egroups;
+		A.q2 = ptr;
+		ptr += I->n_egroups;
+		A.sigT = ptr;
+		ptr += I->n_egroups;
+		A.tau = ptr;
+		ptr += I->n_egroups;
+		A.sigT2 = ptr;
+		ptr += I->n_egroups;
+		A.expVal = ptr;
+		ptr += I->n_egroups;
+		A.reuse = ptr;
+		ptr += I->n_egroups;
+		A.flux_integral = ptr;
+		ptr += I->n_egroups;
+		A.tally = ptr;
+		ptr += I->n_egroups;
+		A.t1 = ptr;
+		ptr += I->n_egroups;
+		A.t2 = ptr;
+		ptr += I->n_egroups;
+		A.t3 = ptr;
+		ptr += I->n_egroups;
+		A.t4 = ptr;
+
+		#pragma omp for schedule( dynamic ) 
+		for (long i = 0; i < I->ntracks_2D; i++)
+		{
+			#if TIMING_INFO | 0
+				// print progress
+				#ifdef OPENMP
+				if(I->mype==0 && thread == 0)
+				{
+					printf("\rAttenuating Tracks... (%.0lf%% completed)",
+							(i / ( (double)I->ntracks_2D / (double) nthreads ))
+							/ (double) nthreads * 100.0);
+				}
+				#else
+				if( i % 50 == 0)
+					if(I->mype==0)
+						printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
+								I->ntracks_2D );
+				#endif
+			#endif
+
+
+			// treat positive-z traveling rays first
+			bool pos_z_dir = true;
+			for( int j = 0; j < I->n_polar_angles; j++)
+			{
+				if( j == I->n_polar_angles / 2 )
+					pos_z_dir = false;
+				float p_angle = params->polar_angles[j];
+				float mu = cos(p_angle);
+
+				// start with all z stacked rays
+				int begin_stacked = 0;
+				int end_stacked = I->z_stacked;
+
+				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
+				{
+					// calculate distance traveled in cell if segment completed
+					float s_full = params->tracks_2D[i].segments[n].length 
+						/ sin(p_angle);
+
+					// allocate varaible for distance traveled in an FSR
+					float ds = 0;
+
+					// loop over remaining z-stacked rays
+					for( int k = begin_stacked; k < end_stacked; k++)
+					{
+						// initialize s to full length
+						float s = s_full;
+
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// set flag for completeion of segment
+						bool seg_complete = false;
+
+						// calculate interval
+						int curr_interval;
+						if( pos_z_dir)
+							curr_interval = get_pos_interval(track->z_height, 
+									fine_delta_z);
+						else
+							curr_interval = get_neg_interval(track->z_height, 
+									fine_delta_z);
+
+						while( !seg_complete )
+						{
+							// flag to reset z position
+							bool reset = false;
+
+
+							/* calculate new height based on s 
+							 * (distance traveled in FSR) */
+							float z = track->z_height + s * cos(p_angle);
+
+							// check if still in same FSR (fine axial interval)
+							int new_interval;
+							if( pos_z_dir )
+								new_interval = get_pos_interval(z, 
+										fine_delta_z);
+							else
+								new_interval = get_neg_interval(z,
+										fine_delta_z);
+
+							if( new_interval == curr_interval )
+							{
+								seg_complete = true;
+								ds = s;
+							}
+
+							// otherwise, we need to recalculate distances
+							else
+							{
+								// correct z
+								if( pos_z_dir )
+								{
+									curr_interval++;
+									z = fine_delta_z * (float) curr_interval;
+								}
+								else{
+									curr_interval--;
+									z = fine_delta_z * (float) curr_interval;
+								}
+
+								// calculate distance travelled in FSR (ds)
+								ds = (z - track->z_height) / cos(p_angle);
+
+								// update track length remaining
+								s -= ds;
+
+								/* check remaining track length to protect
+								 * against potential roundoff errors */
+								if( s <= 0 )
+									seg_complete = true;
+
+								// check if out of bounds or track complete
+								if( z <= 0 || z >= node_delta_z )
+								{
+									// mark segment as completed
+									seg_complete = true;
+
+									// remember to no longer treat this track
+									if ( pos_z_dir )
+										end_stacked--;
+									else
+										begin_stacked++;
+
+									// reset z height
+									reset = true;
+								}
+							}
+
+							// pick a random FSR (cache miss expected)
+							#ifdef OPENMP
+							long QSR_id = rand_r(&seed) % 
+								I->n_source_regions_per_node;
+							#else
+							long QSR_id = rand() % 
+								I->n_source_regions_per_node;
+							#endif
+
+							/* update sources and fluxes from attenuation 
+							 * over FSR */
+							if( I->axial_exp == 2 )
+							{
+								attenuate_fluxes( track, true, 
+										&params->sources[QSR_id], 
+										I, params, ds, mu, 
+										params->tracks_2D[i].az_weight, &A );
+
+								segments_processed++;
+							}
+
+							else if( I->axial_exp == 0 )
+							{
+								attenuate_FSR_fluxes( track, true,
+										&params->sources[QSR_id],
+										I, params, ds, mu,
+										params->tracks_2D[i].az_weight, &A );
+
+								segments_processed++;
+							}
+							else
+							{
+								printf("Error: invalid axial expansion order");
+								printf("\n Please input 0 or 2\n");
+								exit(1);
+							}
+
+							// update with new z height or reset if finished
+							if( n == params->tracks_2D[i].n_segments - 1  
+									|| reset)
+							{
+								if( pos_z_dir)
+									track->z_height = I->axial_z_sep * k;
+								else
+									track->z_height = I->axial_z_sep * (k+1);
+							}
+							else
+								track->z_height = z;
+
+						}
+					}
+				}
+			}
+		}
+		#ifdef OPENMP
+		if(thread == 0 && I->mype==0) printf("\n");
+		#endif
+
+		#ifdef PAPI
+		if( thread == 0 )
+		{
+			printf("\n");
+			border_print();
+			center_print("PAPI COUNTER RESULTS", 79);
+			border_print();
+			printf("Count          \tSmybol      \tDescription\n");
+		}
+		{
+			#pragma omp barrier
+		}
+		counter_stop(&eventset, num_papi_events, I);
+		#endif
+	}
+	I->segments_processed = segments_processed;
+
+	return;
+}
+
+
+// run one full transport sweep, return k
+void two_way_transport_sweep( Params * params, Input * I )
+{
+	if(I->mype==0) printf("Starting transport sweep ...\n");
+
+	// calculate the height of a node's domain and of each FSR
+	double node_delta_z = I->height / I->decomp_assemblies_ax;
+	int num_intervals = (I->cai * I->fai);
+	double fine_delta_z = node_delta_z / num_intervals;
+
+	/* loop over tracks (implicitly azimuthal angles, tracks in azimuthal 
+	 * angles, polar angles, and z stacked rays) */
+		long segments_processed = 0;
+
+	#pragma omp parallel default(none) \
+	shared( I, params, node_delta_z, fine_delta_z, num_intervals ) \
+	reduction(+ : segments_processed )
+	{
+		#ifdef OPENMP
+		int thread = omp_get_thread_num();
+		int nthreads = omp_get_num_threads();
+		unsigned int seed = time(NULL) * (thread+1);
+		#endif
+		//print_Input_struct( I );
+
+		#ifdef PAPI
+		int eventset = PAPI_NULL;
+		int num_papi_events;
+		#pragma omp critical
+		{
+			counter_init(&eventset, &num_papi_events, I);
+		}
+		#endif
+
+
+		AttenuateVars A;
+		float * ptr = (float * ) malloc( I->n_egroups * 14 * sizeof(float));
+		A.q0 = ptr;
+		ptr += I->n_egroups;
+		A.q1 = ptr;
+		ptr += I->n_egroups;
+		A.q2 = ptr;
+		ptr += I->n_egroups;
+		A.sigT = ptr;
+		ptr += I->n_egroups;
+		A.tau = ptr;
+		ptr += I->n_egroups;
+		A.sigT2 = ptr;
+		ptr += I->n_egroups;
+		A.expVal = ptr;
+		ptr += I->n_egroups;
+		A.reuse = ptr;
+		ptr += I->n_egroups;
+		A.flux_integral = ptr;
+		ptr += I->n_egroups;
+		A.tally = ptr;
+		ptr += I->n_egroups;
+		A.t1 = ptr;
+		ptr += I->n_egroups;
+		A.t2 = ptr;
+		ptr += I->n_egroups;
+		A.t3 = ptr;
+		ptr += I->n_egroups;
+		A.t4 = ptr;
+
+		#pragma omp for schedule( dynamic ) 
+		for (long i = 0; i < I->ntracks_2D; i++)
+		{
+			// print progress
+			#ifdef OPENMP
+			if(I->mype==0 && thread == 0)
+			{
+				printf("\rAttenuating Tracks... (%.0lf%% completed)",
+						(i / ( (double)I->ntracks_2D / (double) nthreads ))
+						/ (double) nthreads * 100.0);
+			}
+			#else
+			if( i % 50 == 0)
+				if(I->mype==0)
+					printf("%s%ld%s%ld\n","2D Tracks Completed = ", i," / ", 
+							I->ntracks_2D );
+			#endif
+
+			// allocate arrays for segment storage FIXME
+			double ** seg_dist = malloc( I->z_stacked * sizeof(double *) );
+			Source *** seg_src = malloc( I->z_stacked * sizeof(Source**) );
+			int * seg_idx = malloc( I->z_stacked * sizeof(int) );
+			int * seg_size = malloc( I->z_stacked * sizeof(int) );
+
+			// fill matrix with arrays FIXME
+			for( int k = 0; k < I->z_stacked; k++)
+			{
+				seg_size[k] = 2 * I->segments_per_track;
+				seg_dist[k] = malloc( seg_size[k] * sizeof(double) );
+				seg_src[k] = malloc( seg_size[k] * sizeof(Source *) );
+				seg_idx[k] = 0;
+			}
+
+			// treat positive-z traveling rays first
+			bool pos_z_dir = true;
+			for( int j = 0; j < I->n_polar_angles; j++)
+			{
+				if( j == I->n_polar_angles / 2 )
+					pos_z_dir = false;
+				float p_angle = params->polar_angles[j];
+				float mu = cos(p_angle);
+
+				// start with all z stacked rays
+				int begin_stacked = 0;
+				int end_stacked = I->z_stacked;
+
+				// reset semgnet indexes
+				for( int k = 0; k < I->z_stacked; k++)
+					seg_idx[k] = 0;
+
+				for( int n = 0; n < params->tracks_2D[i].n_segments; n++)
+				{
+					// calculate distance traveled in cell if segment completed
+					float s_full = params->tracks_2D[i].segments[n].length 
+						/ sin(p_angle);
+
+					// allocate varaible for distance traveled in an FSR
+					float ds = 0;
+
+					// loop over remaining z-stacked rays
+					int tracks_completed = 0;
+					for( int k = begin_stacked; k < end_stacked; k++)
+					{
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// determine current axial interval
+						int interval = (int) track->z_height / fine_delta_z;
+
+						// calculate distance to domain boundary
+						float bound_dist;
+						if( pos_z_dir)
+							bound_dist = (node_delta_z - track->z_height) / mu;
+						else
+							bound_dist = -track->z_height / mu;
+
+						// determine track length
+						float s;
+						if(	s_full < bound_dist )
+							s = s_full;
+						else
+						{
+							// note completion of track
+							s = bound_dist;
+							tracks_completed++;
+						}
+
+						// set flag for completeion of segment
+						bool seg_complete = false;
+
+						while( !seg_complete )
+						{
+							// initialize tracking variables
+							long QSR_id = interval + num_intervals * n;
+							float ds;
+							float z;
+
+							// calculate z height of next fine axial interval
+							float fai_z_height;
+							if( pos_z_dir )
+								fai_z_height = (interval + 1) * fine_delta_z ;
+							else
+								fai_z_height = interval * fine_delta_z;
+
+							// calculate z distance to next fine axial interval
+							float z_dist_to_fai = 
+								fai_z_height - track->z_height;
+
+							/* calculate total distance (s) to fine axial 
+							 * interval */
+							float s_dist_to_fai = z_dist_to_fai / mu;
+
+							// determine if a fine axial interval is crossed
+							if( s_dist_to_fai < s )
+							{
+								if( pos_z_dir )
+									interval++;
+								else
+									interval--;
+								ds = s_dist_to_fai;
+								z = track->z_height + z_dist_to_fai;
+							}
+							else
+							{
+								ds = s;
+								z = track->z_height + s * mu;
+							}	
+
+							/* shorten remaining segment length and check if
+							 * completed (accounting for potential roundoff) */
+							s -= ds;
+							if( s <= 0 || interval < 0 
+									|| interval >= num_intervals)
+								seg_complete = true;
+
+							// pick a random FSR (cache miss expected)
+							#ifdef OPENMP
+							QSR_id = rand_r(&seed) % 
+								I->n_source_regions_per_node;
+							#else
+							QSR_id = rand() % I->n_source_regions_per_node;
+							#endif
+
+							/* update sources and fluxes from attenuation 
+							 * over FSR */
+							if( I->axial_exp == 2 )
+							{
+								attenuate_fluxes( track, true,
+										&params->sources[QSR_id], 
+										I, params, ds, mu, 
+										params->tracks_2D[i].az_weight, &A );
+								segments_processed++;
+							}
+
+							else if( I->axial_exp == 0 )
+								attenuate_FSR_fluxes( track, true,
+										&params->sources[QSR_id],
+										I, params, ds, mu,
+										params->tracks_2D[i].az_weight, &A );
+							else
+							{
+								printf("Error: invalid axial expansion order");
+								printf("\n Please input 0 or 2\n");
+								exit(1);
+							}
+
+							// update track height
+							track->z_height = z;
+							
+							// save segment length and source FIXME
+							seg_dist[k][seg_idx[k]] = ds;
+							seg_src[k][seg_idx[k]] = &params->sources[QSR_id];
+							seg_idx[k]++;
+
+							// check if array needs to grow FIXME
+							if( seg_idx[k] >= seg_size[k] )
+							{
+								seg_size[k] *= 2;
+								seg_dist[k] = (double *) realloc( seg_dist[k],
+										seg_size[k] * sizeof(double) );
+								seg_src[k] = (Source **) realloc( seg_src[k],
+										seg_size[k] * sizeof(Source *) );
+							}
+						}
+					}
+					if(pos_z_dir)
+						end_stacked -= tracks_completed;
+					else
+						begin_stacked += tracks_completed;
+				}
+
+				// loop over all z stacked rays again
+				for( int k = 0; k < I->z_stacked; k++ )
+				{
+					for( int n = seg_idx[k]-1; n >= 0; n--)
+					{
+						// load distance
+						float ds = seg_dist[k][n];
+
+						// select current track
+						Track * track = &params->tracks[i][j][k];
+
+						// update sources and fluxes from attenuation over FSR
+						if( I->axial_exp == 2 )
+						{
+							attenuate_fluxes( track, false,
+									seg_src[k][n], 
+									I, params, ds, -mu, 
+									params->tracks_2D[i].az_weight, &A );
+								segments_processed++;
+						}
+
+						else if( I->axial_exp == 0 )
+							attenuate_FSR_fluxes( track, false,
+									seg_src[k][n],
+									I, params, ds, -mu,
+									params->tracks_2D[i].az_weight, &A );
+
+						// update z height
+						track->z_height -= ds * mu;
+					}
+				}
+
+
+				/* Update all tracks with correct starting z location again
+				 * NOTE: this is only here to acocunt for roundoff error */
+				for( int k = 0; k < I->z_stacked; k++)
+				{
+					Track * track = &params->tracks[i][j][k];
+					if( pos_z_dir)
+						track->z_height = I->axial_z_sep * k;
+					else
+						track->z_height = I->axial_z_sep * (k+1);
+				}
+			}
+
+			// free memory
+			for( int k = 0; k < I->z_stacked; k++)
+			{
+				free(seg_dist[k]);
+				free(seg_src[k]);
+			}
+			free(seg_dist);
+			free(seg_src);
+			free(seg_idx);
+			free(seg_size);
+
+		}
+		#ifdef OPENMP
+		if(thread == 0 && I->mype==0) printf("\n");
+		#endif
+
+		#ifdef PAPI
+		if( thread == 0 )
+		{
+			printf("\n");
+			border_print();
+			center_print("PAPI COUNTER RESULTS", 79);
+			border_print();
+			printf("Count          \tSmybol      \tDescription\n");
+		}
+		{
+			#pragma omp barrier
+		}
+		counter_stop(&eventset, num_papi_events, I);
+		#endif
+	}
+	//printf("Number of segments processed: %ld\n", segments_processed);
+	I->segments_processed = segments_processed;
+
+	return;
+}
+
+/* returns integer number for axial interval for tracks traveling in the
+ *  positive direction */
+int get_pos_interval( float z, float dz)
+{
+	int interval = (int) (z/dz);
+	return interval;
+}
+
+/* returns integer number for axial interval for tracks traveling in the 
+ * negative direction */
+int get_neg_interval( float z, float dz)
+{
+	// NOTE: a bit of trickery using floors to obtain ceils 
+	int interval = INT_MAX - (int) ( (double) INT_MAX 
+			- (double) ( z / dz ) );
+	return interval;
+}
+
+int calc_next_fai( float z, float dz, bool pos_dir)
+{
+	int interval = z/dz;
+	float lower_z = dz * (float) interval;
+	if(pos_dir)
+		return interval + 1;
+	else
+		return interval;
+}
+
+/* Determines the change in angular flux along a particular track across a fine
+ * axial region and tallies the contribution to the scalar flux in the fine 
+ * axial region. This function assumes a quadratic source, which is calculated 
+ * on the fly using neighboring source values.
+ *
+ * This legacy function is unused since it is less efficient than the current 
+ * attenuate_fluxes function. However, it provides a more straightforward 
+ * description of the underlying physical problem. */
+void alt_attenuate_fluxes( Track * track, bool forward, Source * QSR, Input * I,
+		Params * params, float ds, float mu, float az_weight ) 
+{
+	// compute fine axial interval spacing
+	float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * ( (int)( track->z_height / dz ) + 0.5 );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I->fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight;
+	float mu2 = mu * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = QSR -> fine_flux[fine_id];
+
+	// cycle over energy groups
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// load total cross section
+		float sigT = QSR->sigT[g];
+
+		// define source parameters
+		float q0, q1, q2;
+
+		// calculate source components
+		if( fine_id == 0 )
+		{
+			// load neighboring sources
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do linear "fitting"
+			float c0 = y2;
+			float c1 = (y3 - y2) / dz;
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin;
+			q1 = c1;
+			q2 = 0;
+		}
+		else if( fine_id == I->fai - 1 )
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+
+			// do linear "fitting"
+			float c0 = y2;
+			float c1 = (y2 - y1) / dz;
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin;
+			q1 = c1;
+			q2 = 0;
+		}		
+		else
+		{
+			// load neighboring sources
+			float y1 = QSR->fine_source[fine_id-1][g];
+			float y2 = QSR->fine_source[fine_id][g];
+			float y3 = QSR->fine_source[fine_id+1][g];
+
+			// do quadratic "fitting"
+			float c0 = y2;
+			float c1 = (y1 - y3) / (2*dz);
+			float c2 = (y1 - 2*y2 + y3) / (2*dz*dz);
+
+			// calculate q0, q1, q2
+			q0 = c0 + c1*zin + c2*zin*zin;
+			q1 = c1 + 2*c2*zin;
+			q2 = c2;
+		}
+
+		// calculate common values for efficiency
+		float tau = sigT * ds;
+		float sigT2 = sigT * sigT;
+
+		// compute exponential ( 1 - exp(-x) ) using table lookup
+		float expVal = interpolateTable( params->expTable, tau );  
+
+		// load correct angular flux vector
+		float * psi;
+		if(forward)
+			psi = track->f_psi;
+		else
+			psi = track->b_psi;
+
+		// add contribution to new source flux
+		float flux_integral = (q0 * tau + (sigT * psi[g] - q0) * expVal)
+			/ sigT2
+			+ q1 * mu * (tau * (tau - 2) + 2 * expVal)
+			/ (sigT * sigT2)
+			+ q2 * mu2 * (tau * (tau * (tau - 3) + 6) - 6 * expVal)
+			/ (3 * sigT2 * sigT2);
+
+		#pragma omp atomic
+		FSR_flux[g] += weight * flux_integral;
+
+		// update angular flux
+		psi[g] = psi[g] * (1.0 - expVal) + q0 * expVal / sigT
+			+ q1 * mu * (tau - expVal) / sigT2 + q2 * mu2 *
+			(tau * (tau - 2) + 2 * expVal) / (sigT2 * sigT);
+	}
+}
+
+/* Determines the change in angular flux along a particular track across a fine
+ * axial region and tallies the contribution to the scalar flux in the fine 
+ * axial region. This function assumes a constant  source. */ 
+void attenuate_FSR_fluxes( Track * track, bool forward, Source * FSR, Input * I,
+		Params * params_in, float ds, float mu, float az_weight, 
+		AttenuateVars *A)
+{
+	// upack attenuate vars struct
+	float *  restrict tally = A->tally;
+	float *  restrict expVal = A->expVal;
+	float *  restrict sigT = A->sigT;
+	float *  restrict tau = A->tau;
+
+	Params params = * params_in;
+
+	// compute fine axial interval spacing
+	float dz = I->height / (I->fai * I->decomp_assemblies_ax * I->cai);
+
+	// compute z height in cell
+	float zin = track->z_height - dz * 
+		( (int)( track->z_height / dz ) + 0.5f );
+
+	// compute fine axial region ID
+	int fine_id = (int) ( track->z_height / dz ) % I->fai;
+
+	// compute weight (azimuthal * polar)
+	// NOTE: real app would also have volume weight component
+	float weight = track->p_weight * az_weight * mu;
+
+	// load fine source region flux vector
+	float * FSR_flux = FSR -> fine_flux[fine_id];
+
+	// cycle over energy groups
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// load total cross section
+		sigT[g] = FSR->sigT[g];
+		tau[g] = sigT[g] * ds;
+	}
+
+	// compute exponential ( 1 - exp(-x) ) using table lookup
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for(int g = 0; g < I->n_egroups; g++)
+	{
+		expVal[g] = interpolateTable( params.expTable, tau[g] );
+	}
+
+	float * psi;
+	if(forward)
+		psi = track->f_psi;
+	else
+		psi = track->b_psi;
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		// compute angular flux attenuation
+		float q = FSR->fine_source[fine_id][g] / sigT[g];
+		float delta_psi = (psi[g] - q) * expVal[g];
+
+		// add contribution to new source flux
+		tally[g] = weight * delta_psi;
+
+		// update angular flux
+		psi[g] -= delta_psi;
+	}
+
+
+
+	#ifdef OPENMP
+	omp_set_lock(&FSR->locks[fine_id]);
+	#endif
+
+	#ifdef INTEL
+	#pragma simd
+	#elif defined IBM
+	#pragma simd_level(10)
+	#endif
+	for( int g = 0; g < I->n_egroups; g++)
+	{
+		FSR_flux[g] += tally[g];
+	}
+
+	#ifdef OPENMP
+	omp_unset_lock(&FSR->locks[fine_id]);
+	#endif
+
+
+}
+
+/* Renormalizes scalar and angular flux for next transport sweep iteration.
+ * Calculation requires multiple pair-wise sums and a reduction accross all 
+ * nodes. */
+void renormalize_flux( Params params, Input I, CommGrid grid )
+{
+	if( I.mype == 0 ) printf("Renormalizing Flux...\n");
+	float node_fission_rate = 0;
+	#ifdef OPENMP
+	#pragma omp parallel default(none) shared(params, I, grid) \
+	reduction(+ : node_fission_rate)
+	{
+	#endif
+		// tally total fission rate (pair-wise sum)
+		float * fission_rates = malloc( I.n_source_regions_per_node 
+				* sizeof(float) );
+
+		float * fine_fission_rates = malloc( I.fai * sizeof(float) );
+		float * g_fission_rates = malloc( I.n_egroups * sizeof(float) );
+
+		// accumulate total fission rate on node domain
+		#pragma omp for schedule(dynamic)
+		for( int i = 0; i < I.n_source_regions_per_node; i++)
+		{
+			Source src = params.sources[i];
+			for( int j = 0; j < I.fai; j++)
+			{
+				for( int g = 0; g < I.n_egroups; g++)
+					g_fission_rates[g] = src.fine_flux[j][g] * src.vol 
+						* src.XS[g][0];
+				fine_fission_rates[j] = pairwise_sum( g_fission_rates, 
+						I.n_egroups );
+			}
+			fission_rates[i] = pairwise_sum( fine_fission_rates, I.fai );
+		}
+		node_fission_rate = pairwise_sum(fission_rates, 
+				I.n_source_regions_per_node);
+		
+		// free allocated memory
+		free(fission_rates);
+		free(fine_fission_rates);
+		free(g_fission_rates);
+	
+	#ifdef OPENMP
+	}
+	#endif
+
+	#ifdef MPI	
+	// accumulate total fission rate by MPI Allreduce
+	float total_fission_rate = 0;
+	MPI_Barrier(grid.cart_comm_3d);
+	MPI_Allreduce( &node_fission_rate, // Send Buffer
+			&total_fission_rate,    // Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			grid.cart_comm_3d );  	// MPI Communicator
+	MPI_Barrier(grid.cart_comm_3d);
+	#else
+	float total_fission_rate = node_fission_rate;
+	#endif
+
+
+	// normalize fluxes by fission reaction rate
+	float norm_factor = 1.0 / total_fission_rate;
+
+	#pragma omp parallel for default(none) \
+	shared(I, params) private(norm_factor) schedule(dynamic)
+	for( int i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		Source * src = &params.sources[i];
+		float adjust = norm_factor * 4 * M_PI * I.fai / src->vol;
+		for( int k = 0; k < I.fai; k++)
+			for( int g = 0; g < I.n_egroups; g++)
+				src->fine_flux[k][g] *= adjust;
+	}
+
+	// normalize boundary fluxes by same factor
+	#pragma omp parallel for default(none) \
+	shared(I, params) private(norm_factor) schedule(dynamic)
+	for( long i = 0; i < I.ntracks_2D; i++)
+		for( int j = 0; j < I.n_polar_angles; j++)
+			for( int k = 0; k < I.z_stacked; k++)
+				for( int g = 0; g < I.n_egroups; g++)
+				{
+					params.tracks[i][j][k].f_psi[g] *= norm_factor;
+					params.tracks[i][j][k].b_psi[g] *= norm_factor;
+				}
+
+	if( I.mype == 0 ) printf("Renormalizing Flux Complete.\n");
+	return;
+}
+
+/* Updates sources for next iteration by computing scattering and fission
+ * components. Calculation includes multiple pair-wise sums and reductions
+ * accross all nodes */
+float update_sources( Params params, Input I, float keff )
+{
+	// source residual
+	float residual;
+
+	// calculate inverse multiplication facotr for efficiency
+	float inverse_k = 1.0 / keff;
+
+	// allocate residual arrays
+	float * group_res = (float *) malloc(I.n_egroups * sizeof(float));
+	float * fine_res = (float *) malloc(I.fai * sizeof(float));
+	float * residuals = (float *) malloc(I.n_source_regions_per_node 
+			* sizeof(float));
+
+	// allocate arrays for summation
+	float * fission_rates = malloc(I.n_egroups * sizeof(float));
+	float * scatter_rates = malloc(I.n_egroups * sizeof(float));
+
+	// cycle through all coarse axial intervals to update source
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		Source src = params.sources[i];
+
+		// cycle thorugh all fine axial regions to calculate new source
+		for( int j = 0; j < I.fai; j++)
+		{
+			// calculate total fission source and scattering source
+			float fission_source;
+			float scatter_source;
+
+			// compute total fission source
+			for( int g = 0; g < I.n_egroups; g++ )
+				fission_rates[g] = src.fine_flux[j][g] * src.XS[g][0];
+			fission_source = pairwise_sum( fission_rates, (long) I.n_egroups);
+
+			// normalize fission source by multiplication factor
+			fission_source *= inverse_k;
+
+			// compute scattering and new total source for each group
+			for( int g = 0; g < I.n_egroups; g++ )
+			{
+				for( int g2 = 0; g2 < I.n_egroups; g2++ )
+				{
+					// compute scatter source originating from g2 -> g
+					scatter_rates[g2] = src.scattering_matrix[g][g2] * 
+						src.fine_flux[j][g2];
+				}
+				scatter_source = pairwise_sum(scatter_rates, 
+						(long) I.n_egroups);
+
+				// compuate new total source
+				float chi = src.XS[g][2];
+
+				// calculate new fine source
+				float newSrc = (fission_source * chi + scatter_source) 
+					/ (4.0 * M_PI);
+
+				// calculate residual
+				float oldSrc = src.fine_source[j][g];
+				group_res[g] = (newSrc - oldSrc) * (newSrc - oldSrc)
+					/ (oldSrc * oldSrc);
+
+				/* calculate new source in fine axial interval assuming 
+				 * isotropic source components */
+				src.fine_source[j][g] = newSrc;
+			}
+			fine_res[j] = pairwise_sum(group_res, (long) I.n_egroups);
+		}
+		residuals[i] = pairwise_sum(fine_res, (long) I.fai);
+	}
+
+	// calculate source residual
+	residual = pairwise_sum(residuals, I.n_source_regions_per_node);
+
+	// free memory
+	free(fission_rates);
+	free(scatter_rates);
+	free(group_res);
+	free(fine_res);
+	free(residuals);
+
+
+	// NOTE: See code around line 600 of CPUSolver.cpp in ClosedMOC/ OpenMOC
+
+	return residual;
+}
+
+/* Computes globall k-effective using multiple pair-wise summations and finally
+ * a reduction accross all nodes */
+float compute_keff(Params params, Input I, CommGrid grid)
+{
+	// allocate temporary memory
+	float * sigma = malloc( I.n_egroups * sizeof(float) );
+	float * group_rates = malloc( I.n_egroups * sizeof(float) );
+	float * fine_rates = malloc( I.fai * sizeof(float) );
+	float * QSR_rates = malloc( I.n_source_regions_per_node * sizeof(float) );
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// compute total absorption rate, looping over source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		// load absorption XS data
+		Source src = params.sources[i];
+		for( int g = 0; g < I.n_egroups; g++)	
+			sigma[g] = src.XS[g][1];
+
+		for( int j = 0; j < I.fai; j++ )
+		{
+			// calculate absorption rates
+			float * fine_flux = src.fine_flux[j];
+			for( int g = 0; g < I.n_egroups; g++)
+				group_rates[g] = sigma[g] * fine_flux[g];
+
+			// sum absorption over all energy groups
+			fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups );
+		}
+		// sum absorption over all fine axial intervals
+		QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai );
+	}
+	// sum absorption over all source regions in a node
+	float node_abs = pairwise_sum( QSR_rates, I.n_source_regions_per_node);
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// compute total absorption rate, looping over source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+	{
+		// load nuSigmaF XS data
+		Source src = params.sources[i];
+		for( int g = 0; g < I.n_egroups; g++)	
+			sigma[g] = src.XS[g][0];
+
+		for( int j = 0; j < I.fai; j++ )
+		{
+			// calculate absorption rates
+			float * fine_flux = src.fine_flux[j];
+			for( int g = 0; g < I.n_egroups; g++)
+				group_rates[g] = sigma[g] * fine_flux[g];
+
+			// sum fission over all energy groups
+			fine_rates[j] = pairwise_sum( group_rates, (long) I.n_egroups );
+		}
+		// sum fission over all fine axial intervals
+		QSR_rates[i] = pairwise_sum( fine_rates, (long) I.fai );
+	}
+	// sum fission over all source regions in a node
+	float node_fission = pairwise_sum( QSR_rates, I.n_source_regions_per_node);
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// MPi Reduction
+	float tot_abs = 0;
+	float tot_fission = 0;
+	float leakage = 0;
+
+	#ifdef MPI 
+
+	// Total Absorption Reduction
+	MPI_Reduce( &node_abs,    		// Send Buffer
+			&tot_abs,      			// Receive Buffer
+			1,                  	// Element Count
+			MPI_FLOAT,          	// Element Type
+			MPI_SUM,            	// Reduciton Operation Type
+			0,                  	// Master Rank
+			grid.cart_comm_3d );	// MPI Communicator
+
+	// Total Fission Reduction
+	MPI_Reduce( &node_fission,     	// Send Buffer
+			&tot_fission,  			// Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+
+	// Total Leakage Reduction
+	MPI_Reduce( params.leakage,  	// Send Buffer
+			&leakage,      			// Receive Buffer
+			1,                    	// Element Count
+			MPI_FLOAT,           	// Element Type
+			MPI_SUM,              	// Reduciton Operation Type
+			0,                    	// Master Rank
+			grid.cart_comm_3d );  	// MPI Communicator
+
+	MPI_Barrier(grid.cart_comm_3d);
+
+	// calculate keff
+	float keff = tot_fission/ (tot_abs + leakage);
+	#else
+	float keff = node_fission / (node_abs + *params.leakage);
+	#endif
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// free memory
+	free(sigma);
+	free(group_rates);
+	free(fine_rates);
+	free(QSR_rates);
+
+	return keff;
+}
+
+/* Interpolates a formed exponential table to compute ( 1- exp(-x) )
+ *  at the desired x value */
+float interpolateTable( Table table, float x)
+{
+	// check to ensure value is in domain
+	if( x > table.maxVal )
+		return 1.0f;
+	else
+	{
+		int interval = (int) ( x / table.dx + 0.5f * table.dx );
+		/*
+		   if( interval >= table.N || interval < 0)
+		   {
+		   printf( "Interval = %d\n", interval);
+		   printf( "N = %d\n", table.N);
+		   printf( "x = %f\n", x);
+		   printf( "dx = %f\n", table.dx);
+		   exit(1);
+		   }
+		   */
+		float slope = table.values[ 2 * interval ];
+		float intercept = table.values[ 2 * interval + 1 ];
+		float val = slope * x + intercept;
+		return val;
+	}
+}
+

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/source.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,232 @@
+#include"SimpleMOC_header.h"
+
+// intitialize source structures and intitialize XS values to random numbers
+Source * initialize_sources( Input I, size_t * nbytes )
+{
+	// Allocate space
+	Source * sources = (Source *) malloc( I.n_source_regions_per_node 
+			* sizeof(Source) );
+	*nbytes += I.n_source_regions_per_node * sizeof(Source);
+
+	// determine number of cross section and coarse axial regions
+	long n_xs_regions = I.n_source_regions_per_node / 8;
+	
+	///////////////////////////////////////////////////////////////////////////
+	
+	// Allocate scattering matrix matrix ptrs
+	float *** s_matrices = (float ***) malloc( n_xs_regions 
+			* sizeof(float**) );
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for ALL scattering matrix ptrs
+	float ** s_matrix_ptrs = (float **) malloc( n_xs_regions * I.n_egroups 
+			* sizeof(float *));
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for ALL scattering data
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) 
+		printf("Scattering data requires %zu MB of data...\n", 
+				n_xs_regions * I.n_egroups * I.n_egroups 
+				* sizeof(float) / 1024 / 1024);
+#endif
+	float * s_matrix_data = (float *) malloc( n_xs_regions * I.n_egroups
+		   	* I.n_egroups * sizeof(float));
+	*nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float);
+
+	// Stitch allocation ptrs together
+	for( long i = 0; i < n_xs_regions; i++ )
+		s_matrices[i] = &s_matrix_ptrs[i * I.n_egroups];
+
+	for( long i = 0; i < n_xs_regions; i++ )
+		for( long j = 0; j < I.n_egroups; j++ )
+			s_matrices[i][j] = &s_matrix_data[i * I.n_egroups * I.n_egroups 
+				+ j * I.n_egroups];
+
+	// Iniitalize Scattering Matrix Values
+	for( long i = 0; i < n_xs_regions; i++ )
+		for( long j = 0; j < I.n_egroups; j++ )
+			for( long k = 0; k < I.n_egroups; k++ )
+				s_matrices[i][j][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+	
+	/*
+	 * Create data scrtucture for storing XS data (and chi) as follows:
+	 * An array is created which stores in contigious memory as
+	 * [ ... , nu*SigmaF, SigmaA, Chi, ...]
+	 */
+	
+	if(I.mype==0) printf("Beginning XS Allocation...\n");
+
+	// Allocate space for XS ptrs (by region)
+	float *** XS = (float ***) malloc( n_xs_regions * sizeof(float **) );
+	*nbytes += n_xs_regions * sizeof(float **);
+
+	// Allocate space for each XS type of interest (total, nu*SigmaF, and chi)
+	float ** XS_arrays = (float **) malloc (n_xs_regions * I.n_egroups
+		   	* sizeof(float *) );
+	*nbytes += n_xs_regions * I.n_egroups * sizeof(float *);
+
+	// Allocate space for total XS data
+	float * XS_data = (float *) malloc( n_xs_regions * I.n_egroups * 3 
+			* sizeof(float) );
+	*nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float);
+
+	// stitch allocation ptrs together for XS data
+	for( long i = 0; i < n_xs_regions; i++)
+		XS[i] = &XS_arrays[i * I.n_egroups];
+
+	for( long i = 0; i < n_xs_regions; i++)
+		for(long j = 0; j < I.n_egroups; j++)
+			XS[i][j] = &XS_data[i * I.n_egroups * 3 + j * 3];
+
+	// Initialize XS data
+	for( long i = 0; i < n_xs_regions; i++)
+		for( int j = 0; j < I.n_egroups; j++)
+			for( int k = 0; k < 3; k++)
+				XS[i][j][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+
+	/* Here I allocate useful source parameters and fluxes for use in the
+	 * attenuate_fluxes function in solver.c - the parameters are arranged
+	 * contigiously for efficiency */
+
+	if(I.mype==0) printf("Beginning Source and Flux Parameter Allocation...\n");
+
+	// Allocate space for source parameters (quadratic axially)
+	float *** fineSource = (float ***) malloc( I.n_source_regions_per_node
+		   	* sizeof(float **) );
+	*nbytes += I.n_source_regions_per_node * sizeof(float **);
+
+	// Allocate space for array pointers to parameters
+	float ** fineSourcePtrs = (float **) malloc ( I.n_source_regions_per_node
+		   	* I.fai * sizeof(float *) );
+	*nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+
+	// Allocate space for flux parameters (quadratic axially)
+	float *** fineFlux = (float ***) malloc( I.n_source_regions_per_node
+		   	* sizeof(float **) );
+	*nbytes += I.n_source_regions_per_node * sizeof(float **);
+
+	// Allocate space for array pointers to parameters
+	float ** fineFluxPtrs = (float **) malloc ( I.n_source_regions_per_node
+		   	* I.fai * sizeof(float *) );
+	*nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+
+	// Allocate space for cross section pointers
+	float ** sigT = (float **) malloc( I.n_source_regions_per_node
+			* sizeof(float *) );
+
+	// Allocate space for parameter data
+	float * data = (float *) malloc( (2 * I.fai + 1) * 
+			I.n_source_regions_per_node * I.n_egroups * sizeof(float) );
+
+	*nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups 
+		* sizeof(float);
+
+	///////////////////////////////////////////////////////////////////////////
+	
+	// stitch pointers together
+
+	// stitch allocation ptrs together for source data
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		fineSource[i] = &fineSourcePtrs[i * I.fai];
+
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for(long j = 0; j < I.fai; j++)
+			fineSource[i][j] = &data[i * I.fai * I.n_egroups
+			   	+ j * I.n_egroups];
+
+	// stitch allocation ptrs together for flux data
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		fineFlux[i] = &fineFluxPtrs[i * I.fai];
+
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for(int j = 0; j < I.fai; j++)
+			fineFlux[i][j] = &data[I.n_egroups * I.fai * 
+				(I.n_source_regions_per_node + i) + j * I.n_egroups];
+	
+	// stitch allocation ptrs together for total XS
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		sigT[i] = &data[ 2 * I.n_source_regions_per_node * I.fai * I.n_egroups 
+			+ i * I.n_egroups];
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// Initialize source to random numbers
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int j = 0; j < I.fai; j++)
+			for( int k = 0; k < I.n_egroups; k++)
+				fineSource[i][j][k] = urand();
+		
+	// Initialize fine flux to zeros
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int j = 0; j < I.fai; j++)
+			for( int k = 0; k < I.n_egroups; k++)
+				fineFlux[i][j][k] = 0;
+	
+
+	// Initialize total XS to random numbers
+	for( long i = 0; i < I.n_source_regions_per_node; i++)
+		for( int k = 0; k < I.n_egroups; k++)
+			sigT[i][k] = urand();
+
+	///////////////////////////////////////////////////////////////////////////
+	//
+	
+	#ifdef OPENMP
+	omp_lock_t * locks = init_locks(I);
+	long lock_idx = 0;
+	#endif
+
+	// Assign to source regions
+	for( long i = 0; i < I.n_source_regions_per_node; i++ )
+	{
+		long idx;
+		if( i == 0 )
+			idx = 0;
+		else
+			idx = rand() % n_xs_regions;
+
+		sources[i].scattering_matrix = s_matrices[idx];
+		sources[i].XS = XS[idx];
+		sources[i].fine_flux = fineFlux[i];
+		sources[i].fine_source = fineSource[i]; 
+		sources[i].sigT = sigT[i];
+
+		// initialize FSR volume
+		sources[i].vol = urand();
+
+		#ifdef OPENMP
+		sources[i].locks = &locks[lock_idx];
+		lock_idx += I.fai;
+		#endif
+	}
+
+	// free memory of temporary variables
+	free( s_matrices );
+	free( XS );
+	free( fineFlux );
+	free( fineSource);
+	free( sigT);
+
+	return sources;
+}
+
+void free_sources( Input I, Source * sources )
+{
+	// Free XS's
+	free( sources[0].XS[0] );
+	free( sources[0].XS );
+	// Free Flux's
+	free( sources[0].fine_flux[0] );
+	free( sources[0].fine_flux );
+	// Free scattering matrices
+	free( sources[0].scattering_matrix[0] );
+	free( sources[0].scattering_matrix );
+	// Free source values
+	free( sources[0].fine_source[0] );
+	free( sources[0].fine_source );
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/test.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,71 @@
+#include"SimpleMOC_header.h"
+
+// prints input parameters to screen
+void print_Input_struct( Input I )
+{
+	printf("INPUT STRUCT\n");
+	printf("%d\n", I.x_assemblies);    		/* Number of assemblies in the 
+												x-axis of the reactor */
+	printf("%d\n", I.y_assemblies);       	/* Number of assemblies in the 
+												y-axis of the reactor */
+	printf("%d\n", I.cai);                 	// Number of course axial intervals
+	printf("%d\n", I.fai);                 	/* Number of fine axial intervals 
+											   per coarse axial interval */
+	printf("%d\n", I.axial_exp);          	// Axial source expansion order
+	printf("%lf\n", I.radial_ray_sep);    	// Radial ray separation
+	printf("%lf\n", I.axial_z_sep);       	// Axial stacked z-ray separation
+	printf("%d\n", I.n_azimuthal);         	// Number of azimuthal angles
+	printf("%d\n", I.n_polar_angles);      	// Number of polar angles
+	printf("%d\n", I.n_egroups);           	// Number of energy groups
+	printf("%d\n", I.decomp_assemblies_ax);	/* Number of assemblies per 
+												sub-domain (axially) */
+	printf("%ld\n", I.segments_per_track); 	/* Average number of segments per 
+												track */
+	printf("%lf\n", I.assembly_width);    	// Width of an assembly
+	printf("%lf\n", I.height);            	// Height of the reactor
+	printf("%lf\n", I.domain_height);     	// Z Height of a domain
+	printf("%lf\n", I.precision);		  	// precision for source convergence
+	printf("%ld\n", I.mype);               	// MPI Rank
+	printf("%ld\n", I.ntracks_2D);        	// Number of 2D tracks (derived)
+	printf("%d\n", I.z_stacked);          	// Number of z rays (derived)
+	printf("%ld\n", I.ntracks);           	/* Total number of 3D tracks per 
+											   assembly (derived) */
+	printf("%d\n", I.nthreads);           	// Number of OpenMP Threads
+	printf("%d\n", I.papi_event_set);     	/* PAPI event set:
+												0 - FLOPS   
+												1 - Bandwidth   
+												2 - CPU Stall reason
+											*/
+	
+	// Source regions per assembly
+	printf("%ld\n", I.n_2D_source_regions_per_assembly);
+
+	// Source regions per node (derived)	
+	printf("%ld\n", I.n_source_regions_per_node); 
+	printf("END INPUT STRUCT\n");
+	
+}
+
+// generates normally distributed random numbers
+void gen_norm_pts(float mean, float sigma, int n_pts)
+{
+	// generate output file
+	FILE * out;
+	out = fopen("gen_points.txt","w");
+	fprintf(out, "Random numbers generated for a normal distribution\n");
+	fprintf(out, "Mean = %f\n", mean);
+	fprintf(out, "Standard deviation = %f\n", sigma);
+	fprintf(out, "Generated points:\n");
+
+	// generate gaussian random points
+	for(int i = 0; i < n_pts; i++)
+	{
+		float random = nrand(mean,sigma);	
+		fprintf(out, "%f\n", random);
+	}
+
+	// close stream
+	fclose(out);
+
+	return;
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/tracks.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,327 @@
+#include"SimpleMOC_header.h"
+
+// Allocates and initializes an array of 2D tracks
+Track2D * generate_2D_tracks( Input I, size_t * nbytes )
+{
+	// Allocate space for 2D tracks
+	Track2D * tracks = (Track2D *) malloc( I.ntracks_2D * sizeof(Track2D));
+	*nbytes += I.ntracks_2D * sizeof(Track2D);
+
+	// Fill weights with randomized data
+	for( int i = 0; i < I.ntracks_2D; i++ )
+		tracks[i].az_weight = urand();
+
+	// Allocate and randomize segments
+	generate_2D_segments( I, tracks, nbytes );
+
+	return tracks;
+}
+
+// Allocates and initializes all segments
+void generate_2D_segments( 
+		Input I, 
+		Track2D * tracks, 
+		size_t * nbytes )
+{
+	/* Randomize Number of segments per track, and accumulate total 2D 
+	 * segments in assembly */
+	long total_segments = 0;
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		tracks[i].n_segments = segments_per_2D_track_distribution( I );
+		total_segments += tracks[i].n_segments;
+	}
+	
+	// Allocate contiguous space for segments
+	Segment * contiguous_segments = (Segment *) malloc( total_segments 
+			* sizeof(Segment));
+	
+	*nbytes += total_segments * sizeof(Segment);
+
+	// Set segments arrays to correct locations within contiguous allocation
+	long idx = 0;
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		tracks[i].segments = &contiguous_segments[idx];
+		idx += tracks[i].n_segments;
+	}
+
+	// Initialize segments to randomized values
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( long j = 0; j < tracks[i].n_segments; j++ )
+		{
+			tracks[i].segments[j].length  = urand() * I.assembly_width
+				/ tracks[i].n_segments;
+			// source ID to be calculated on the fly
+		}
+	}
+}
+
+// generate segments per 2D track based on a normal distribution
+long segments_per_2D_track_distribution( Input I )
+{
+	return nrand(I.segments_per_track, sqrt(I.segments_per_track));
+}
+
+// free memory associated with 2D tracks
+void free_2D_tracks( Track2D * tracks )
+{
+	free(tracks[0].segments);
+	free(tracks);
+}
+
+// allocate memory for tracks (primarily angular fluxes)
+Track *** generate_tracks(Input I, Track2D * tracks_2D, size_t * nbytes)
+{
+	// Allocate space for tracks (3D)
+	Track *** tracks = (Track ***) malloc( I.ntracks_2D * sizeof(Track **));
+	*nbytes += I.ntracks_2D * sizeof(Track **);
+
+	// Allocate pointers for tracks associated with a 2D track
+	Track ** tracks_in_track2D = (Track **) malloc( I.ntracks_2D *
+		   	I.n_polar_angles * sizeof(Track *));
+	*nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *);
+
+	// Allocate complete array of track data
+	Track * track_data = (Track *) malloc( I.ntracks * sizeof(Track) );
+	*nbytes += I.ntracks * sizeof(Track);
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("3D track data requires %zu MB of data...\n", 
+			I.ntracks * sizeof(Track) / 1024 / 1024 );
+#endif
+
+	// stitch pointers together
+	for( long i = 0; i < I.ntracks_2D; i++ )
+		tracks[i] = &tracks_in_track2D[ i * I.n_polar_angles ];
+
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( int j = 0; j < I.n_polar_angles; j++ )
+		{
+			tracks[i][j] = &track_data[ 
+				(i * I.n_polar_angles + j) * I.z_stacked ];
+		}
+	}
+
+	// Allocate space for Flux Arrays
+	size_t flux_bytes_needed = 2 * I.ntracks * I.n_egroups * sizeof(float);
+	
+#ifdef PRINT_MEM_SIZES
+	if(I.mype==0) printf("Flux Arrays Require %zu MB of data...\n", 
+			flux_bytes_needed / 1024 / 1024);
+#endif
+
+	float * flux_space = (float *) malloc( flux_bytes_needed );
+	*nbytes += flux_bytes_needed;
+	size_t flux_idx = 0;
+
+	long offset = I.ntracks * I.n_egroups;
+
+	for( long i = 0; i < I.ntracks_2D; i++ )
+	{
+		for( int j = 0; j < I.n_polar_angles; j++ )
+		{
+			for( int k = 0; k < I.z_stacked; k++ )
+			{
+				/* bottom z heights should only have upward directed polar
+				 * angles and  similarly top should only have downward directed
+				 * polar angles */
+				if( j < I.n_polar_angles/2 )
+					tracks[i][j][k].z_height = I.axial_z_sep * k;
+				else
+					tracks[i][j][k].z_height = I.axial_z_sep * ( k + 1 );
+				
+				// set polar weight, NOTE: this is the same for same polar angle
+				tracks[i][j][k].p_weight = urand();
+
+				// assign angular flux array memory
+				tracks[i][j][k].f_psi = &flux_space[flux_idx];
+				flux_idx += I.n_egroups;
+				tracks[i][j][k].b_psi = &flux_space[flux_idx];
+				flux_idx += I.n_egroups;
+
+				// set incoming flux to 0 (guess 0 for inner assemblies)
+				for( int g = 0; g < I.n_egroups; g++)
+				{
+					tracks[i][j][k].f_psi[g] = 0;
+					tracks[i][j][k].b_psi[g] = 0;
+				}
+			}
+		}
+	}
+	return tracks;
+}
+
+// free memory associated with tracks
+void free_tracks( Track *** tracks )
+{
+	free(tracks);
+}
+
+// assign polar angles
+float * generate_polar_angles( Input I )
+{
+	float * polar_angles = (float *) malloc( I.n_polar_angles * 
+			sizeof(float) );
+
+	for( int i = 0; i < I.n_polar_angles; i++)
+		polar_angles[i] = M_PI * (i + 0.5) / I.n_polar_angles;
+	
+	return polar_angles;
+}
+
+Track2D * load_OpenMOC_tracks(char* fname, bool CMFD_obj, Input* I, size_t* nbytes)
+{
+	int ret;
+	FILE* in;
+	in = fopen(fname, "r");
+	printf("Reading track data from:\n%s\n",fname);
+	int string_length;
+
+	/* Import Geometry metadata from the Track file */
+	ret = fread(&string_length, sizeof(int), 1, in);
+	char geometry_to_string[string_length];
+	ret = fread(geometry_to_string, sizeof(char)*string_length, 1, in);
+	printf("Importing ray tracing data from file...\n");
+	
+	/* Import ray tracing metadata from the Track file */
+	double spacing;
+	ret = fread(&I->n_azimuthal, sizeof(int), 1, in);
+	ret = fread(&spacing, sizeof(double), 1, in);
+	I->radial_ray_sep = (float) spacing;
+
+	/* Initialize data structures for Tracks */
+	int num_tracks[I->n_azimuthal];
+	int num_x[I->n_azimuthal];
+	int num_y[I->n_azimuthal];
+	double azim_weights[I->n_azimuthal];
+	
+	ret = fread(num_tracks, sizeof(int), I->n_azimuthal, in);
+	ret = fread(num_x, sizeof(int), I->n_azimuthal, in);
+	ret = fread(num_y, sizeof(int), I->n_azimuthal, in);
+	ret = fread(azim_weights, sizeof(double), I->n_azimuthal, in);
+	
+	/* Import azimuthal angle quadrature weights from Track file */
+	double x0, y0, x1, y1;
+	double phi;
+	int azim_angle_index;
+	int num_segments;
+	double length;
+	int material_id;
+	int region_id;
+	int mesh_surface_fwd;
+	int mesh_surface_bwd;
+	long num_2D_tracks;
+
+	/* Calculate the total number of 2D Tracks */
+	I->ntracks_2D = 0;
+	for (int i=0; i < I->n_azimuthal; i++)
+		I->ntracks_2D += (long) num_tracks[i];	
+	
+	/* Allocate memory for 2D tracks */
+	Track2D * tracks = (Track2D *) malloc( I->ntracks_2D * sizeof(Track2D));
+	*nbytes += I->ntracks_2D * sizeof(Track2D);
+
+	// Allocate memory for the number of segments per Track array
+	long tot_num_segments = 0;
+
+	fpos_t checkpoint;
+	fgetpos(in, &checkpoint);
+
+	// Loop over tracks to determine total number of segments
+	for (int i=0; i < I->n_azimuthal; i++)
+	{
+		for (int j=0; j < num_tracks[i]; j++)
+		{
+			// Skip to segment number information
+			fseek( in, 5*sizeof(double) + sizeof(int), SEEK_CUR);
+			
+			// Load number of segments
+			ret = fread(&num_segments, sizeof(int), 1, in);
+			tot_num_segments += (long) num_segments;
+
+			// Loop over all segments in this Track
+			for (int s=0; s < num_segments; s++)
+			{
+				// Skip ahead again 
+				fseek( in, 2*sizeof(int) + sizeof(double), SEEK_CUR);
+				if (CMFD_obj)
+					fseek( in, 2*sizeof(int), SEEK_CUR);
+			}
+		}
+	}
+
+	// Allocate contiguous space for segments 
+	Segment * contiguous_segments = (Segment *) malloc( tot_num_segments 
+			* sizeof(Segment));
+	*nbytes += tot_num_segments * sizeof(Segment);
+
+	// Reset file handle 
+	fsetpos(in, &checkpoint);
+	
+	/* Initialize 2D track unique identifer (uid) end semgnet index (idx)*/
+	int uid = 0;
+	int idx = 0;
+	
+	/* Loop over Tracks */
+	for (int i=0; i < I->n_azimuthal; i++)
+	{
+		//TODO: recored actual azimuthal angles
+		for (int j=0; j < num_tracks[i]; j++)
+		{
+			/* Import data for this Track from Track file */
+			ret = fread(&x0, sizeof(double), 1, in);
+			ret = fread(&y0, sizeof(double), 1, in);
+			ret = fread(&x1, sizeof(double), 1, in);
+			ret = fread(&y1, sizeof(double), 1, in);
+			ret = fread(&phi, sizeof(double), 1, in);
+			ret = fread(&azim_angle_index, sizeof(int), 1, in);
+			
+			/* Load number of segments */
+			ret = fread(&num_segments, sizeof(int), 1, in);
+			tracks[uid].n_segments = (long) num_segments;
+
+			/* Allocate memory for segments */
+			tracks[uid].segments = &contiguous_segments[idx];
+			idx += tracks[uid].n_segments;
+
+			// TODO: set real azimuthal weight
+			tracks[uid].az_weight = urand();
+		
+			/* Loop over all segments in this Track */
+			for (int s=0; s < num_segments; s++)
+			{
+				/* Import data for this segment from Track file */
+				ret = fread(&length, sizeof(double), 1, in);
+				ret = fread(&material_id, sizeof(int), 1, in);
+				ret = fread(&region_id, sizeof(int), 1, in);
+			
+				/* record track length */
+				tracks[uid].segments[s].length = (float) length;
+
+				/* record region_id (not used at the moment) */
+				tracks[uid].segments[s].source_id = (long) region_id;
+
+				/* Import CMFD-related data if needed */
+				if (CMFD_obj)
+				{
+					ret = fread(&mesh_surface_fwd, sizeof(int), 1, in);
+					ret = fread(&mesh_surface_bwd, sizeof(int), 1, in);
+				}
+			}
+			uid++;
+		}
+	}
+	// recalculate average number of segments per track
+	I->segments_per_track = tot_num_segments / I->ntracks_2D;	
+
+	printf("Number of 2D tracks = %ld\n", I->ntracks_2D);
+	I->ntracks = I->ntracks_2D * I->n_polar_angles * I->z_stacked; 
+	printf("Number of 3D tracks = %ld\n", I->ntracks);
+	printf("Number of segments = %ld\n", tot_num_segments);
+
+	/* Close the Track file */
+	fclose(in);
+	return tracks;
+}

Added: test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c
URL: http://llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c?rev=312615&view=auto
==============================================================================
--- test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c (added)
+++ test-suite/trunk/MultiSource/Benchmarks/DOE-ProxyApps-C/SimpleMOC/utils.c Tue Sep  5 21:05:03 2017
@@ -0,0 +1,155 @@
+#include"SimpleMOC_header.h"
+
+// generates a random number between 0 and 1
+float urand(void)
+{
+	return (float)rand() / (float)RAND_MAX;
+}
+
+/* generates a random number from a normal distribution of a 
+ * given mean and standard deviation (Box-Muller) */
+float nrand(float mean, float sigma)
+{
+	// generate two random numbers
+	float rand1 = urand();
+	float rand2 = urand();
+
+	/* use first for "exponential part" and second for
+	 *  "azimuthal part" to create a standard normal random number */
+	float x = sqrt( -2 * log(rand1) ) * cos( 2 * M_PI * rand2);
+
+	// NOTE: We can generate a second random number if neeeded
+	// y = sqrt(-2*log(rand1)) * sin( 2 * M_PI * rand2);
+	
+	// shift random number to appropriate normal distribution and return
+	return x * sigma + mean;
+}
+
+// pairwise summation for long arrays
+float pairwise_sum( float * vector, long size ){
+	float sum = 0;
+
+	// Base case: perform summation if size <= 16
+	if( size <= 16)
+		for( int i = 0; i < size; i++ )
+			sum += vector[i];
+
+	else
+	{
+		// otherwise, split
+		sum = pairwise_sum( &vector[0], size/2 ) +
+			pairwise_sum( &vector[size/2], size - size/2 );
+	}
+	
+	return sum;
+}
+
+// Builds a table of exponential values for linear interpolation
+Table buildExponentialTable( float precision, float maxVal )
+{
+	// define table
+	Table table;
+
+	// compute number of arry values
+	int N = (int) ( maxVal * sqrt(1.0 / ( 8.0 * precision * 0.01 ) ) ); 
+
+	// compute spacing
+	float dx = maxVal / (float) N;
+
+	// allocate an array to store information
+	float * tableVals = malloc( (2 * N ) * sizeof(float) );
+
+	// store linear segment information (slope and y-intercept)
+	for( int n = 0; n < N; n++ )
+	{
+		// compute slope and y-intercept for ( 1 - exp(-x) )
+		float exponential = exp( - n * dx );
+		tableVals[ 2*n ] = - exponential;
+		tableVals[ 2*n + 1 ] = 1 + ( n * dx - 1 ) * exponential;
+	}
+
+	// assign data to table
+	table.dx = dx;
+	table.values = tableVals;
+	table.maxVal = maxVal - table.dx;
+	table.N = N;
+
+	return table;
+}
+
+// Timer function. Depends on if compiled with MPI, openmp, or vanilla
+double get_time(void)
+{
+	#ifdef MPI
+	return MPI_Wtime();
+	#endif
+
+	#ifdef OPENMP
+	return omp_get_wtime();
+	#endif
+
+	time_t time;
+	time = clock();
+
+	return (double) time / (double) CLOCKS_PER_SEC;
+}
+
+// Memory Usage Estimator
+size_t est_mem_usage( Input I )
+{
+	size_t nbytes = 0;
+	
+	int z_stacked = (int) ( I.height / (I.axial_z_sep * 
+				I.decomp_assemblies_ax) );
+	
+	long n_xs_regions = I.n_source_regions_per_node / 8;
+
+	nbytes += I.ntracks_2D * sizeof(Track2D);
+	nbytes += I.segments_per_track * I.ntracks_2D * sizeof(Segment);
+	nbytes += I.ntracks_2D * sizeof(Track **);
+	nbytes += I.ntracks_2D * I.n_polar_angles * sizeof(Track *);
+	nbytes += I.ntracks * sizeof(Track);
+	nbytes += I.ntracks_2D * I.n_polar_angles * z_stacked 
+		* I.n_egroups * sizeof(float) * 2;
+	nbytes += I.n_source_regions_per_node * sizeof(Source);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * I.n_egroups * I.n_egroups * sizeof(float);
+	nbytes += n_xs_regions * sizeof(float **);
+	nbytes += n_xs_regions * I.n_egroups * sizeof(float *);
+	nbytes += n_xs_regions * I.n_egroups * 3 * sizeof(float);
+	nbytes += I.n_source_regions_per_node * sizeof(float **);
+	nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+	nbytes += I.n_source_regions_per_node * sizeof(float **);
+	nbytes += I.n_source_regions_per_node * I.fai * sizeof(float *);
+	nbytes += I.n_source_regions_per_node * I.fai * I.n_egroups
+		* sizeof(float);
+	
+	// 2way tracking memory
+	nbytes += I.nthreads * I.z_stacked * sizeof(double *);
+	nbytes += I.nthreads * I.z_stacked * sizeof(Source**);
+	nbytes += I.nthreads * I.z_stacked * sizeof(int);
+	nbytes += I.nthreads * I.z_stacked * sizeof(int);			
+	nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track 
+		* sizeof(double);
+	nbytes += I.nthreads * I.z_stacked * 2 * I.segments_per_track 
+		* sizeof(Source *);		
+
+	// MPI Buffers
+	#ifdef MPI
+	nbytes += 6 * I.n_egroups * 10000 * sizeof(float);
+	#endif
+	
+	return nbytes;
+}
+
+// Calculates Time per Intersection
+double time_per_intersection( Input I, double time )
+{
+	/* This was the old estimate - new way uses actual tracking data
+	double tpi = time / (double) I.ntracks /
+		(double) I.segments_per_track / (double) I.n_egroups / 1e-9 / 2; 
+		*/
+	double tpi = time / (double) I.segments_processed * 1.0e9 / (double) I.n_egroups;
+	return tpi;
+}