Commit f60ed308 authored by Éric Thiébaut's avatar Éric Thiébaut
Browse files

Define TAO_ALIGNMENT and make some shared array fields constant

TAO_ALIGNMENT is set to 64 bytes, the cache line size.
parent bce96b22
......@@ -158,13 +158,6 @@ static void warn(
// Hack to write constants in shared structure.
static void overwrite_long(const long* ptr, long val);
// Alignment (in bytes) for storing list of commands.
#ifdef __BIGGEST_ALIGNMENT__
# define ALIGNMENT (__BIGGEST_ALIGNMENT__)
#else
# define ALIGNMENT 64 // 512-bit alignment
#endif
// Clamp x in the range [lo,hi].
#define CLAMP(x, lo, hi) ((x) <= (lo) ? (lo) : ((x) >= (hi) ? (hi) : (x)))
......@@ -295,7 +288,7 @@ alpao_mirror_t* alpao_open_mirror(
// Allocate zero-filled memory for the mirror structure plus enough aligned
// space for temporary commands.
long offset = TAO_ROUND_UP(sizeof(alpao_mirror_t), ALIGNMENT);
long offset = TAO_ROUND_UP(sizeof(alpao_mirror_t), TAO_ALIGNMENT);
long size = offset + number*sizeof(Scalar);
dm = tao_malloc(size);
if (dm == NULL) {
......
......@@ -62,7 +62,7 @@ tao_array_t* tao_create_array(
return NULL;
}
size_t header = sizeof(tao_array_t);
size_t size = header + ALIGNMENT - 1 + elsize*nelem;
size_t size = header + TAO_ALIGNMENT - 1 + elsize*nelem;
tao_array_t* arr = (tao_array_t*)tao_malloc(size);
if (arr == NULL) {
return NULL;
......@@ -79,7 +79,7 @@ tao_array_t* tao_create_array(
arr->dims[d] = 1;
}
size_t address = (char*)arr - (char*)0;
arr->data = (void*)TAO_ROUND_UP(address + header, ALIGNMENT);
arr->data = (void*)TAO_ROUND_UP(address + header, TAO_ALIGNMENT);
return arr;
}
......
......@@ -50,31 +50,4 @@ typedef enum {_False = 0, _True = 1} _Bool;
# endif
#endif
// Alignment of data for vectorization depends on the chosen compilation
// settings. The following table summarizes the value of macro
// `__BIGGEST_ALIGNMENT__` with different settings:
//
// ---------------------------------------
// Alignment (bytes) Compilation Options
// ---------------------------------------
// 16 -ffast-math -msse
// 16 -ffast-math -msse2
// 16 -ffast-math -msse3
// 16 -ffast-math -msse4
// 16 -ffast-math -mavx
// 32 -ffast-math -mavx2
// ---------------------------------------
//
// The address of attached shared memory is a multiple of memory page size
// (PAGE_SIZE which is 4096 on the Linux machine I tested) and so much
// larger than ALIGNMENT (defined below). So, in principle, it is sufficient
// to align the shared array data to a multiple of ALIGNMENT relative to the
// address of the attached shared memory to have correct alignment for all
// processes.
#define ALIGNMENT 32
#if defined(__BIGGEST_ALIGNMENT__) && __BIGGEST_ALIGNMENT__ > ALIGNMENT
# undef ALIGNMENT
# define ALIGNMENT __BIGGEST_ALIGNMENT__
#endif
#endif // _TAO_COMMON_H
......@@ -353,7 +353,20 @@ void tao_copy_checked_args(
}
}
#define DATA(arr) (void*)((uint8_t*)(arr) + (arr)->offset)
// Yield the address stored in the `data` member of a conventional array.
static inline void* array_data(const tao_array_t* arr)
{
    void* payload = arr->data;
    return payload;
}
// Yield the address of the data part of a shared array, as computed by the
// macro TAO_GET_SHARED_ARRAY_DATA.
static inline void* shared_array_data(const tao_shared_array_t* arr)
{
    void* payload = TAO_GET_SHARED_ARRAY_DATA(arr);
    return payload;
}
// Yield the data address of `arr`.  The helper is selected at compile time
// from the static type of `arr`.  Pointers to const-qualified arrays are
// accepted as well: both helpers take `const` pointers, but `_Generic` treats
// `T*` and `const T*` as distinct types, so each needs its own association.
#define DATA(arr)                                              \
    _Generic((arr),                                            \
             tao_array_t*:              array_data,            \
             const tao_array_t*:        array_data,            \
             tao_shared_array_t*:       shared_array_data,     \
             const tao_shared_array_t*: shared_array_data)(arr)
tao_status_t tao_copy_to_array(
tao_array_t* restrict dst,
......@@ -396,7 +409,8 @@ tao_status_t tao_copy_to_shared_array(
tao_push_error(__func__, TAO_BAD_RANK);
return TAO_ERROR;
}
return tao_copy(DATA(dst), dst->eltype, dst->dims, dstoffs,
return tao_copy(TAO_GET_SHARED_ARRAY_DATA(dst),
dst->eltype, dst->dims, dstoffs,
srcdata, srctype, srcdims, srcoffs,
lens, ndims);
}
......@@ -443,7 +457,8 @@ tao_status_t tao_copy_from_shared_array(
return TAO_ERROR;
}
return tao_copy(dstdata, dsttype, dstdims, dstoffs,
DATA(src), src->eltype, src->dims, srcoffs,
TAO_GET_SHARED_ARRAY_DATA(src),
src->eltype, src->dims, srcoffs,
lens, ndims);
}
......@@ -463,8 +478,8 @@ tao_status_t tao_copy_array_to_array(
tao_push_error(__func__, TAO_BAD_RANK);
return TAO_ERROR;
}
return tao_copy(dst->data, dst->eltype, dst->dims, dstoffs,
src->data, src->eltype, src->dims, srcoffs,
return tao_copy(DATA(dst), dst->eltype, dst->dims, dstoffs,
DATA(src), src->eltype, src->dims, srcoffs,
lens, ndims);
}
......@@ -485,7 +500,7 @@ tao_status_t tao_copy_array_to_shared_array(
return TAO_ERROR;
}
return tao_copy(DATA(dst), dst->eltype, dst->dims, dstoffs,
src->data, src->eltype, src->dims, srcoffs,
DATA(src), src->eltype, src->dims, srcoffs,
lens, ndims);
}
......@@ -505,7 +520,7 @@ tao_status_t tao_copy_shared_array_to_array(
tao_push_error(__func__, TAO_BAD_RANK);
return TAO_ERROR;
}
return tao_copy(dst->data, dst->eltype, dst->dims, dstoffs,
return tao_copy(DATA(dst), dst->eltype, dst->dims, dstoffs,
DATA(src), src->eltype, src->dims, srcoffs,
lens, ndims);
}
......
......@@ -12,13 +12,6 @@
#include <stdlib.h>
#include <tao-mirrors.h>
// Alignment (in bytes) for storing list of commands.
#ifdef __BIGGEST_ALIGNMENT__
# define ALIGNMENT (__BIGGEST_ALIGNMENT__)
#else
# define ALIGNMENT 64 // 512-bit alignment
#endif
// Yields current and scheduled commands.
#define CURRENT(dm) (const void*)((char*)(dm) + (dm)->current)
#define SCHEDULED(dm) ( void*)((char*)(dm) + (dm)->scheduled)
......@@ -52,8 +45,8 @@ tao_shared_mirror_t* tao_create_shared_mirror(
// Compute total size of shared object and offsets to current and scheduled
// list of commands.
long off1 = TAO_ROUND_UP(sizeof(tao_shared_mirror_t), ALIGNMENT);
long off2 = TAO_ROUND_UP(off1 + number*elsize, ALIGNMENT);
long off1 = TAO_ROUND_UP(sizeof(tao_shared_mirror_t), TAO_ALIGNMENT);
long off2 = TAO_ROUND_UP(off1 + number*elsize, TAO_ALIGNMENT);
long size = off2 + number*elsize;
tao_shared_mirror_t* dm = (tao_shared_mirror_t*)
tao_create_shared_object(owner, TAO_SHARED_MIRROR, size, perms);
......
......@@ -44,7 +44,7 @@ long tao_get_shared_array_dim(
void* tao_get_shared_array_data(
const tao_shared_array_t* arr)
{
return (arr != NULL ? (void*)((uint8_t*)arr + arr->offset) : (void*)0);
return (arr != NULL ? TAO_GET_SHARED_ARRAY_DATA(arr) : (void*)0);
}
int64_t tao_get_shared_array_counter(
......@@ -80,7 +80,8 @@ void tao_set_shared_array_timestamp(
int idx,
const tao_time_t* restrict ts)
{
if (arr != NULL && ts != NULL && 0 <= idx && idx < TAO_SHARED_ARRAY_TIMESTAMPS) {
if (arr != NULL && ts != NULL && 0 <= idx
&& idx < TAO_SHARED_ARRAY_TIMESTAMPS) {
arr->ts[idx] = *ts;
}
}
......@@ -124,6 +125,21 @@ tao_shared_array_t* tao_create_3d_shared_array(
return tao_create_shared_array(owner, eltype, 3, dims, perms);
}
// Store `val` in the `int` pointed to by `ptr`, casting away the `const`
// qualifier.  Used to initialize const-qualified members of a freshly created
// shared array.
static inline void force_set_int(const int* ptr, int val)
{
    int* writable = (int*)ptr;
    *writable = val;
}
// Store `val` in the `long` pointed to by `ptr`, casting away the `const`
// qualifier.  Used to initialize const-qualified members of a freshly created
// shared array.
static inline void force_set_long(const long* ptr, long val)
{
    long* writable = (long*)ptr;
    *writable = val;
}
// Store `val` in the element type pointed to by `ptr`, casting away the
// `const` qualifier.  Used to initialize the const-qualified `eltype` member
// of a freshly created shared array.
static inline void force_set_eltype(const tao_eltype_t* ptr, tao_eltype_t val)
{
    tao_eltype_t* writable = (tao_eltype_t*)ptr;
    *writable = val;
}
tao_shared_array_t* tao_create_shared_array(
const char* owner,
tao_eltype_t eltype,
......@@ -140,25 +156,28 @@ tao_shared_array_t* tao_create_shared_array(
if (nelem < 1) {
return NULL;
}
size_t offset = TAO_ROUND_UP(sizeof(tao_shared_array_t), ALIGNMENT);
size_t offset = TAO_SHARED_ARRAY_OFFSET;
#if 0
fprintf(stderr, "sizeof(tao_shared_array_t) = %ld bytes\n",
(long)sizeof(tao_shared_array_t));
fprintf(stderr, "array offset = %ld bytes\n", (long)offset);
#endif
size_t nbytes = offset + nelem*elsize;
tao_shared_object_t* obj = tao_create_shared_object(owner,
TAO_SHARED_ARRAY,
nbytes, perms);
tao_shared_object_t* obj =
tao_create_shared_object(owner, TAO_SHARED_ARRAY, nbytes, perms);
if (obj == NULL) {
return NULL;
}
tao_shared_array_t* arr = (tao_shared_array_t*)obj;
arr->offset = offset;
arr->nelem = nelem;
arr->ndims = ndims;
force_set_long(&arr->nelem, nelem);
force_set_int(&arr->ndims, ndims);
for (int d = 0; d < ndims; ++d) {
arr->dims[d] = dims[d];
force_set_long(&arr->dims[d], dims[d]);
}
for (int d = ndims; d < TAO_MAX_NDIMS; ++d) {
arr->dims[d] = 1;
force_set_long(&arr->dims[d], 1);
}
arr->eltype = eltype;
force_set_eltype(&arr->eltype, eltype);
arr->counter = 0;
for (int i = 0; i < TAO_SHARED_ARRAY_TIMESTAMPS; ++i) {
arr->ts[i] = TAO_UNKNOWN_TIME;
......
......@@ -55,26 +55,27 @@ struct _tao_array {
* public API to manipulate a shared array.
*/
struct _tao_shared_array {
tao_shared_object_t base; /**< Shared object backing storage of the shared
const tao_shared_object_t base; /**< Shared object backing storage of the shared
array */
size_t offset; /**< Offset of data part in bytes and relative
to the base address of the object */
long nelem; /**< Number of elements */
int ndims; /**< Number of dimensions */
long dims[TAO_MAX_NDIMS]; /**< Length of each dimension (dimensions beyond
const long nelem; /**< Number of elements */
const int ndims; /**< Number of dimensions */
const long dims[TAO_MAX_NDIMS]; /**< Length of each dimension (dimensions beyond
`ndims` are assumed to be `1`) */
tao_eltype_t eltype; /**< Type of the elements of the shared array */
const tao_eltype_t eltype; /**< Type of the elements of the shared array */
volatile uint64_t counter; /**< Counter (used for posted images) */
volatile tao_time_t ts[TAO_SHARED_ARRAY_TIMESTAMPS]; /**< Time stamps. */
};
// Offset (in bytes) of the data part of a shared array relative to the base
// address of the object: the size of the header structure passed through
// TAO_ROUND_UP with TAO_ALIGNMENT.
#define TAO_SHARED_ARRAY_OFFSET \
TAO_ROUND_UP(sizeof(tao_shared_array_t), TAO_ALIGNMENT)
#define TAO_GET_SHARED_ARRAY_SHMID(arr) ((arr)->base.shmid)
#define TAO_GET_SHARED_ARRAY_ELTYPE(arr) ((arr)->eltype)
#define TAO_GET_SHARED_ARRAY_NDIMS(arr) ((arr)->ndims)
#define TAO_GET_SHARED_ARRAY_NELEM(arr) ((arr)->nelem)
#define TAO_GET_SHARED_ARRAY_DIM(arr, d) ((arr)->dims[(d)-1])
#define TAO_GET_SHARED_ARRAY_DATA(arr) \
(void*)(((uint8_t*)arr) + (arr)->offset)
(void*)(((uint8_t*)arr) + TAO_SHARED_ARRAY_OFFSET)
/**
* Private structure representing a TAO server.
......
......@@ -1291,6 +1291,46 @@ extern const union _tao_byte_order_mark _tao_native_byte_order_mark;
*/
#define TAO_IS_LITTLE_ENDIAN (TAO_NATIVE_ENDIAN_BOM == TAO_LITTLE_ENDIAN_BOM)
/**
* @def TAO_ALIGNMENT
*
* This macro gives the preferred number of bytes for memory alignment. Good
* memory alignment can improve vectorization and reduce false sharing
* (https://en.wikipedia.org/wiki/False_sharing).
* Alignment of data for vectorization depends on the chosen compilation
* settings. The following table summarizes the value of macro
* `__BIGGEST_ALIGNMENT__` with different settings:
*
* | Alignment (bytes) | Compilation Options |
* |:-----------------:|:--------------------|
* | 16 | -ffast-math -msse |
* | 16 | -ffast-math -msse2 |
* | 16 | -ffast-math -msse3 |
* | 16 | -ffast-math -msse4 |
* | 16 | -ffast-math -mavx |
* | 32 | -ffast-math -mavx2 |
*
* The address of attached shared memory is a multiple of memory page size
* (`PAGE_SIZE` which is 4096 on the Linux machine I tested) and so much larger
* than `TAO_ALIGNMENT` (defined below). So, in principle, it is sufficient to
* align the shared array data to a multiple of `TAO_ALIGNMENT` relative to the
* address of the attached shared memory to have correct alignment for all
* processes.
*
* To avoid false sharing, it is necessary to align on a multiple of the size of
* the cache lines. These sizes can be queried on Linux by the following
* command:
*
* LANG=C getconf -a | grep CACHE_LINESIZE
*
* On recent Intel processors, the cache lines have 64 bytes.
*/
#define TAO_ALIGNMENT 64
// The value is fixed; if the compiler reports a larger biggest alignment, the
// value is left unchanged and only a compile-time warning is emitted.
#if defined(__BIGGEST_ALIGNMENT__) && __BIGGEST_ALIGNMENT__ > TAO_ALIGNMENT
# warning TAO_ALIGNMENT is smaller than __BIGGEST_ALIGNMENT__
#endif
/**
* Identifier of the type of the elements in an array.
*/
......
......@@ -27,12 +27,6 @@
// acquisition buffer is set to `NULL` when the buffer is released.
#define NULLIFY_RELEASED_BUFFERS 0
#define ALIGNMENT 32
#if defined(__BIGGEST_ALIGNMENT__) && __BIGGEST_ALIGNMENT__ > ALIGNMENT
# undef ALIGNMENT
# define ALIGNMENT __BIGGEST_ALIGNMENT__
#endif
#define if_likely(expr) if TAO_LIKELY(expr)
#define if_unlikely(expr) if TAO_UNLIKELY(expr)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment