mirror of https://github.com/ecmwf/eccodes.git
GRIB-153: gribapi patch to speed up encoding on IBM P6/7
This commit is contained in:
parent
cf3a27b03b
commit
7175b9c664
8
NOTICE
8
NOTICE
|
@ -1,7 +1,11 @@
|
|||
ECMWF GRIB API
|
||||
Copyright 2005-2013 ECMWF.
|
||||
|
||||
This product includes software developed at ECMWF (http://www.ecmwf.int).
|
||||
This product includes software developed at
|
||||
ECMWF (http://www.ecmwf.int).
|
||||
|
||||
Parts of the definitions provided by WMO (http://www.wmo.int/pages/index_en.html)
|
||||
Parts of the definitions provided by
|
||||
WMO (http://www.wmo.int/pages/index_en.html)
|
||||
|
||||
IBM POWER optimisations were developed at
|
||||
The Max-Planck-Institute for Meteorology.
|
||||
|
|
13
configure.ac
13
configure.ac
|
@ -89,6 +89,19 @@ fi
|
|||
AC_DEFINE_UNQUOTED(GRIB_PTHREADS,$GRIB_PTHREADS,1->pthreads enabled 0->pthreads disabled)
|
||||
AC_DEFINE_UNQUOTED(GRIB_LINUX_PTHREADS,$GRIB_LINUX_PTHREADS,1->pthreads enabled 0->pthreads disabled)
|
||||
|
||||
dnl check IBM POWER 6/7 optimisations option
|
||||
AC_ARG_ENABLE([ibmpower67_opt],
|
||||
[AS_HELP_STRING([--enable-ibmpower67_opt],[enable IBM POWER 6/7 optimisations [by default disabled]])],
|
||||
[ibmpower67_opts=${enableval}] , [ibmpower67_opts=no]
|
||||
)
|
||||
if test "x${ibmpower67_opts}" = xyes; then
|
||||
GRIB_IBMPOWER67_OPT=1
|
||||
else
|
||||
GRIB_IBMPOWER67_OPT=0
|
||||
fi
|
||||
AC_DEFINE_UNQUOTED(GRIB_IBMPOWER67_OPT,$GRIB_IBMPOWER67_OPT,1->IBM Power6/7 Optimisations enabled 0->IBM Power6/7 Optimisations disabled)
|
||||
|
||||
|
||||
dnl check on uppercase fortran modules not working to be fixed
|
||||
dnl some fortran compilers change the name of the .mod file in upper case!
|
||||
ac_cv_prog_f90_uppercase_mod=no
|
||||
|
|
|
@ -302,33 +302,34 @@ list( APPEND grib_api_srcs
|
|||
grib_windef.h
|
||||
)
|
||||
|
||||
# list( APPEND grib_api_extra_srcs
|
||||
# grib_bits_any_endian.c
|
||||
# grib_bits_any_endian_simple.c
|
||||
# )
|
||||
list( APPEND grib_api_extra_srcs
|
||||
grib_bits_fast_big_endian.c grib_bits_any_endian.c
|
||||
grib_bits_fast_big_endian_vector.c grib_bits_any_endian_vector.c
|
||||
grib_bits_fast_big_endian_simple.c grib_bits_any_endian_simple.c
|
||||
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c
|
||||
encode_double_array.c
|
||||
grib_bits_ibmpow.c
|
||||
grib_bits_ibmpow_opt.c
|
||||
minmax_val.c
|
||||
)
|
||||
|
||||
configure_file( grib_api_version.c.in grib_api_version.c @ONLY )
|
||||
|
||||
ecbuild_generate_yy( YYPREFIX grib_yy
|
||||
ecbuild_generate_yy(YYPREFIX grib_yy
|
||||
YACC griby
|
||||
LEX gribl
|
||||
DEPENDANT action.c )
|
||||
|
||||
ecbuild_add_library( TARGET grib_api
|
||||
SOURCES
|
||||
grib_api_version.c
|
||||
ecbuild_add_library(TARGET grib_api
|
||||
SOURCES grib_api_version.c
|
||||
griby.c gribl.c
|
||||
${grib_api_extra_srcs}
|
||||
${grib_api_srcs}
|
||||
GENERATED
|
||||
grib_api_version.c
|
||||
INCLUDES
|
||||
${GRIB_API_EXTRA_INCLUDE_DIRS}
|
||||
LIBS
|
||||
${GRIB_API_EXTRA_LIBRARIES}
|
||||
GENERATED grib_api_version.c
|
||||
INCLUDES ${GRIB_API_EXTRA_INCLUDE_DIRS}
|
||||
LIBS ${GRIB_API_EXTRA_LIBRARIES}
|
||||
TEMPLATES ${grib_api_extra_srcs}
|
||||
${CMATH_LIBRARIES} )
|
||||
|
||||
|
||||
# TODO: add support for jpg and png
|
||||
|
||||
install( FILES grib_api.h DESTINATION include )
|
||||
|
|
|
@ -332,7 +332,8 @@ EXTRA_DIST= dummy.am griby.y gribl.l grib_accessor_classes_hash.c \
|
|||
grib_bits_fast_big_endian.c grib_bits_any_endian.c \
|
||||
grib_bits_fast_big_endian_vector.c grib_bits_any_endian_vector.c \
|
||||
grib_bits_fast_big_endian_simple.c grib_bits_any_endian_simple.c \
|
||||
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c
|
||||
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c \
|
||||
encode_double_array.c grib_bits_ibmpow.c grib_bits_ibmpow_opt.c minmax_val.c
|
||||
|
||||
include $(DEVEL_RULES)
|
||||
|
||||
|
|
|
@ -0,0 +1,427 @@
|
|||
/*
|
||||
* Copyright 2005-2013 ECMWF.
|
||||
*
|
||||
* This software is licensed under the terms of the Apache Licence Version 2.0
|
||||
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
|
||||
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
|
||||
*/
|
||||
|
||||
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
#include <libhpc.h>
|
||||
#endif
|
||||
|
||||
|
||||
static
|
||||
void encode_double_array_common(int numBits, long packStart, long datasize, GRIBPACK *lGrib,
|
||||
const double *data, double zref, double factor, long *gz)
|
||||
{
|
||||
long i, z = *gz;
|
||||
unsigned int ival;
|
||||
int cbits, jbits;
|
||||
unsigned int c;
|
||||
static unsigned int mask[] = {0,1,3,7,15,31,63,127,255};
|
||||
|
||||
/* code from gribw routine flist2bitstream */
|
||||
|
||||
cbits = 8;
|
||||
c = 0;
|
||||
for ( i = packStart; i < datasize; i++ )
|
||||
{
|
||||
/* note float -> unsigned int .. truncate */
|
||||
ival = (unsigned int) ((data[i] - zref) * factor + 0.5);
|
||||
/*
|
||||
if ( ival > max_nbpv_pow2 ) ival = max_nbpv_pow2;
|
||||
if ( ival < 0 ) ival = 0;
|
||||
*/
|
||||
jbits = numBits;
|
||||
while ( cbits <= jbits )
|
||||
{
|
||||
if ( cbits == 8 )
|
||||
{
|
||||
jbits -= 8;
|
||||
lGrib[z++] = (ival >> jbits) & 0xFF;
|
||||
}
|
||||
else
|
||||
{
|
||||
jbits -= cbits;
|
||||
lGrib[z++] = (c << cbits) + ((ival >> jbits) & mask[cbits]);
|
||||
cbits = 8;
|
||||
c = 0;
|
||||
}
|
||||
}
|
||||
/* now jbits < cbits */
|
||||
if ( jbits )
|
||||
{
|
||||
c = (c << jbits) + (ival & mask[jbits]);
|
||||
cbits -= jbits;
|
||||
}
|
||||
}
|
||||
if ( cbits != 8 ) lGrib[z++] = c << cbits;
|
||||
|
||||
*gz = z;
|
||||
}
|
||||
|
||||
static
|
||||
void encode_double_array_byte(int numBits, long packStart, long datasize,
|
||||
GRIBPACK *restrict lGrib,
|
||||
const double *restrict data,
|
||||
double zref, double factor, long *restrict gz)
|
||||
{
|
||||
long i, z = *gz;
|
||||
unsigned long ival;
|
||||
double tmp;
|
||||
|
||||
data += packStart;
|
||||
datasize -= packStart;
|
||||
|
||||
if ( numBits == 8 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(2, "pack 8 bit base");
|
||||
#endif
|
||||
#if defined (CRAY)
|
||||
#pragma _CRI ivdep
|
||||
#elif defined (SX)
|
||||
#pragma vdir nodep
|
||||
#elif defined (__uxp__)
|
||||
#pragma loop novrec
|
||||
#endif
|
||||
for ( i = 0; i < datasize; i++ )
|
||||
{
|
||||
tmp = ((data[i] - zref) * factor + 0.5);
|
||||
ival = (unsigned long) tmp;
|
||||
lGrib[z ] = ival;
|
||||
z++;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(2);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 16 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(3, "pack 16 bit base");
|
||||
#endif
|
||||
#if defined (CRAY)
|
||||
#pragma _CRI ivdep
|
||||
#elif defined (SX)
|
||||
#pragma vdir nodep
|
||||
#elif defined (__uxp__)
|
||||
#pragma loop novrec
|
||||
#endif
|
||||
for ( i = 0; i < datasize; i++ )
|
||||
{
|
||||
tmp = ((data[i] - zref) * factor + 0.5);
|
||||
ival = (unsigned long) tmp;
|
||||
lGrib[z ] = ival >> 8;
|
||||
lGrib[z+1] = ival;
|
||||
z += 2;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(3);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 24 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(4, "pack 24 bit base");
|
||||
#endif
|
||||
#if defined (CRAY)
|
||||
#pragma _CRI ivdep
|
||||
#elif defined (SX)
|
||||
#pragma vdir nodep
|
||||
#elif defined (__uxp__)
|
||||
#pragma loop novrec
|
||||
#endif
|
||||
for ( i = 0; i < datasize; i++ )
|
||||
{
|
||||
tmp = ((data[i] - zref) * factor + 0.5);
|
||||
ival = (unsigned long) tmp;
|
||||
lGrib[z ] = ival >> 16;
|
||||
lGrib[z+1] = ival >> 8;
|
||||
lGrib[z+2] = ival;
|
||||
z += 3;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(4);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 32 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(5, "pack 32 bit base");
|
||||
#endif
|
||||
#if defined (CRAY)
|
||||
#pragma _CRI ivdep
|
||||
#elif defined (SX)
|
||||
#pragma vdir nodep
|
||||
#elif defined (__uxp__)
|
||||
#pragma loop novrec
|
||||
#endif
|
||||
for ( i = 0; i < datasize; i++ )
|
||||
{
|
||||
tmp = ((data[i] - zref) * factor + 0.5);
|
||||
ival = (unsigned long) tmp;
|
||||
lGrib[z ] = ival >> 24;
|
||||
lGrib[z+1] = ival >> 16;
|
||||
lGrib[z+2] = ival >> 8;
|
||||
lGrib[z+3] = ival;
|
||||
z += 4;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(5);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits > 0 && numBits <= 32 )
|
||||
{
|
||||
encode_double_array_common(numBits, 0, datasize, lGrib,
|
||||
data, zref, factor, &z);
|
||||
}
|
||||
else if ( numBits == 0 )
|
||||
{
|
||||
}
|
||||
else
|
||||
{
|
||||
Error("Unimplemented packing factor %d!", numBits);
|
||||
}
|
||||
|
||||
*gz = z;
|
||||
}
|
||||
|
||||
static
|
||||
void encode_double_array_unrolled(int numBits, long packStart, long datasize,
|
||||
GRIBPACK *restrict lGrib,
|
||||
const double *restrict data,
|
||||
double zref, double factor, long *restrict gz)
|
||||
{
|
||||
U_BYTEORDER;
|
||||
long i, j, z = *gz;
|
||||
double tmp;
|
||||
#ifdef _ARCH_PWR6
|
||||
#define __UNROLL_DEPTH_2 8
|
||||
#else
|
||||
#define __UNROLL_DEPTH_2 8
|
||||
#endif
|
||||
|
||||
data += packStart;
|
||||
datasize -= packStart;
|
||||
|
||||
{
|
||||
long residual = datasize % __UNROLL_DEPTH_2;
|
||||
long ofs = datasize - residual;
|
||||
double dval[__UNROLL_DEPTH_2];
|
||||
unsigned long ival;
|
||||
|
||||
/* reducing FP operations to single FMA is slowing down on pwr6 ... */
|
||||
|
||||
if ( numBits == 8 )
|
||||
{
|
||||
unsigned char *cgrib = (unsigned char *) (lGrib + z);
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(2, "pack 8 bit unrolled");
|
||||
#endif
|
||||
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
dval[j] = ((data[i+j] - zref) * factor + 0.5);
|
||||
}
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
*cgrib = (unsigned long) dval[j];
|
||||
cgrib++;
|
||||
z++;
|
||||
}
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
*cgrib = (unsigned long) dval[j];
|
||||
cgrib++;
|
||||
z++;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(2);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 16 )
|
||||
{
|
||||
unsigned short *sgrib = (unsigned short *) (lGrib + z);
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(3, "pack 16 bit unrolled");
|
||||
#endif
|
||||
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
dval[j] = ((data[i+j] - zref) * factor + 0.5);
|
||||
}
|
||||
if ( IS_BIGENDIAN() )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
*sgrib = (unsigned long) dval[j];
|
||||
sgrib++;
|
||||
z += 2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 8;
|
||||
lGrib[z+1] = ival;
|
||||
z += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
|
||||
}
|
||||
if ( IS_BIGENDIAN() )
|
||||
{
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
*sgrib = (unsigned long) dval[j];
|
||||
sgrib++;
|
||||
z += 2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 8;
|
||||
lGrib[z+1] = ival;
|
||||
z += 2;
|
||||
}
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(3);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 24 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(4, "pack 24 bit unrolled");
|
||||
#endif
|
||||
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
dval[j] = ((data[i+j] - zref) * factor + 0.5);
|
||||
}
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 16;
|
||||
lGrib[z+1] = ival >> 8;
|
||||
lGrib[z+2] = ival;
|
||||
z += 3;
|
||||
}
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 16;
|
||||
lGrib[z+1] = ival >> 8;
|
||||
lGrib[z+2] = ival;
|
||||
z += 3;
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(4);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits == 32 )
|
||||
{
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(5, "pack 32 bit unrolled");
|
||||
#endif
|
||||
unsigned int *igrib = (unsigned int *) (lGrib + z);
|
||||
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
dval[j] = ((data[i+j] - zref) * factor + 0.5);
|
||||
}
|
||||
if ( IS_BIGENDIAN() )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
*igrib = (unsigned long) dval[j];
|
||||
igrib++;
|
||||
z += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_2; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 24;
|
||||
lGrib[z+1] = ival >> 16;
|
||||
lGrib[z+2] = ival >> 8;
|
||||
lGrib[z+3] = ival;
|
||||
z += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
|
||||
}
|
||||
if ( IS_BIGENDIAN() )
|
||||
{
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
*igrib = (unsigned long) dval[j];
|
||||
igrib++;
|
||||
z += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
ival = (unsigned long) dval[j];
|
||||
lGrib[z ] = ival >> 24;
|
||||
lGrib[z+1] = ival >> 16;
|
||||
lGrib[z+2] = ival >> 8;
|
||||
lGrib[z+3] = ival;
|
||||
z += 4;
|
||||
}
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(5);
|
||||
#endif
|
||||
}
|
||||
else if ( numBits > 0 && numBits <= 32 )
|
||||
{
|
||||
encode_double_array_common(numBits, 0, datasize, lGrib,
|
||||
data, zref, factor, &z);
|
||||
}
|
||||
else if ( numBits == 0 )
|
||||
{
|
||||
}
|
||||
else
|
||||
{
|
||||
Error("Unimplemented packing factor %d!", numBits);
|
||||
}
|
||||
}
|
||||
|
||||
*gz = z;
|
||||
#undef __UNROLL_DEPTH_2
|
||||
}
|
|
@ -422,6 +422,11 @@ static int unpack_double(grib_accessor* a, double* val, size_t *len) {
|
|||
return _unpack_double(a,val,len,buf,pos,nvals);
|
||||
}
|
||||
|
||||
#if GRIB_IBMPOWER67_OPT
|
||||
#define restrict
|
||||
#include "minmax_val.c"
|
||||
#undef restrict
|
||||
#endif
|
||||
|
||||
static int pack_double(grib_accessor* a, const double* val, size_t *len)
|
||||
{
|
||||
|
@ -474,10 +479,14 @@ static int pack_double(grib_accessor* a, const double* val, size_t *len)
|
|||
|
||||
max = val[0];
|
||||
min = max;
|
||||
#if GRIB_IBMPOWER67_OPT
|
||||
minmax_val(val+1, n_vals-1, &min, &max);
|
||||
#else
|
||||
for(i=1;i< n_vals;i++) {
|
||||
if (val[i] > max ) max = val[i];
|
||||
if (val[i] < min ) min = val[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
/* constant field only reference_value is set and bits_per_value=0 */
|
||||
if(max==min) {
|
||||
|
@ -641,6 +650,4 @@ static int pack_double(grib_accessor* a, const double* val, size_t *len)
|
|||
|
||||
|
||||
return GRIB_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -139,7 +139,6 @@ long grib_decode_signed_longb(const unsigned char* p, long *bitp, long nbits)
|
|||
|
||||
if (sign)val = -val;
|
||||
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
@ -159,6 +158,11 @@ int grib_encode_signed_longb(unsigned char* p, long val ,long *bitp, long nb)
|
|||
return grib_encode_unsigned_longb(p, val ,bitp, nb-1);
|
||||
}
|
||||
|
||||
#if GRIB_IBMPOWER67_OPT
|
||||
|
||||
#include "grib_bits_ibmpow.c"
|
||||
|
||||
#else
|
||||
#if FAST_BIG_ENDIAN
|
||||
|
||||
#include "grib_bits_fast_big_endian.c"
|
||||
|
@ -169,3 +173,4 @@ int grib_encode_signed_longb(unsigned char* p, long val ,long *bitp, long nb)
|
|||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Copyright 2005-2013 ECMWF.
|
||||
*
|
||||
* This software is licensed under the terms of the Apache Licence Version 2.0
|
||||
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
|
||||
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
|
||||
*/
|
||||
|
||||
|
||||
unsigned long grib_decode_unsigned_long(const unsigned char* p, long *bitp, long nbits)
|
||||
{
|
||||
int i;
|
||||
long ret = 0;
|
||||
long o = *bitp/8;
|
||||
int l = nbits/8;
|
||||
|
||||
if (nbits==0) return 0;
|
||||
|
||||
if(nbits > max_nbits)
|
||||
{
|
||||
int bits = nbits;
|
||||
int mod = bits % max_nbits;
|
||||
|
||||
if(mod != 0)
|
||||
{
|
||||
int e=grib_decode_unsigned_long(p,bitp,mod);
|
||||
Assert( e == 0);
|
||||
bits -= mod;
|
||||
}
|
||||
|
||||
while(bits > max_nbits)
|
||||
{
|
||||
int e=grib_decode_unsigned_long(p,bitp,max_nbits);
|
||||
Assert( e == 0);
|
||||
bits -= max_nbits;
|
||||
}
|
||||
|
||||
return grib_decode_unsigned_long(p,bitp,bits);
|
||||
}
|
||||
|
||||
if((nbits%8 > 0)||(*bitp%8 > 0)) {
|
||||
for(i=0; i< nbits;i++){
|
||||
ret <<= 1;
|
||||
if(grib_get_bit( p, *bitp)) ret += 1;
|
||||
*bitp += 1;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret <<= 8;
|
||||
ret |= p[o++] ;
|
||||
|
||||
for ( i=1; i<l; i++ )
|
||||
{
|
||||
ret <<= 8;
|
||||
ret |= p[o++] ;
|
||||
}
|
||||
*bitp += nbits;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int grib_encode_unsigned_long(unsigned char* p, unsigned long val ,long *bitp, long nbits)
|
||||
{
|
||||
long len = nbits;
|
||||
int s = *bitp%8;
|
||||
int n = 8-s;
|
||||
unsigned char tmp = 0; /*for temporary results*/
|
||||
|
||||
if(nbits > max_nbits)
|
||||
{
|
||||
/* TODO: Do some real code here, to support long longs */
|
||||
int bits = nbits;
|
||||
int mod = bits % max_nbits;
|
||||
long zero = 0;
|
||||
|
||||
/* printf("Warning: encoding %ld bits=%ld %ld\n",val,nbits,*bitp); */
|
||||
|
||||
if(mod != 0)
|
||||
{
|
||||
int e=grib_encode_unsigned_long(p,zero,bitp,mod);
|
||||
/* printf(" -> : encoding %ld bits=%ld %ld\n",zero,(long)mod,*bitp); */
|
||||
Assert( e == 0);
|
||||
bits -= mod;
|
||||
}
|
||||
|
||||
while(bits > max_nbits)
|
||||
{
|
||||
int e=grib_encode_unsigned_long(p,zero,bitp,max_nbits);
|
||||
/* printf(" -> : encoding %ld bits=%ld %ld\n",zero,(long)max_nbits,*bitp); */
|
||||
Assert(e == 0);
|
||||
bits -= max_nbits;
|
||||
}
|
||||
|
||||
/* printf(" -> : encoding %ld bits=%ld %ld\n",val,(long)bits,*bitp); */
|
||||
return grib_encode_unsigned_long(p,val,bitp,bits);
|
||||
}
|
||||
|
||||
if(s)
|
||||
p += (*bitp >> 3); /* skip the bytes */
|
||||
else
|
||||
p += (*bitp >> 3); /* skip the bytes */
|
||||
|
||||
/* head */
|
||||
if(s) {
|
||||
len -= n;
|
||||
if (len < 0) {
|
||||
tmp = ((val << -len) | ((*p) & dmasks[n]));
|
||||
} else {
|
||||
tmp = ((val >> len) | ((*p) & dmasks[n]));
|
||||
}
|
||||
*p = tmp;
|
||||
(*p)++;
|
||||
}
|
||||
|
||||
/* write the middle words */
|
||||
while(len >= 8)
|
||||
{
|
||||
len -= 8;
|
||||
*p++ = (val >> len);
|
||||
}
|
||||
|
||||
/* write the end bits */
|
||||
if(len) *p = (val << (8-len));
|
||||
|
||||
*bitp += nbits;
|
||||
return GRIB_SUCCESS;
|
||||
}
|
||||
|
||||
int grib_encode_unsigned_longb(unsigned char* p, unsigned long val ,long *bitp, long nb)
|
||||
{
|
||||
long i = 0;
|
||||
|
||||
Assert(nb <= max_nbits);
|
||||
|
||||
for(i=nb-1; i >= 0; i--){
|
||||
if(test(val,i))
|
||||
grib_set_bit_on (p, bitp);
|
||||
else
|
||||
grib_set_bit_off(p, bitp);
|
||||
}
|
||||
return GRIB_SUCCESS;
|
||||
}
|
||||
|
||||
#include "grib_bits_ibmpow_opt.c"
|
||||
|
|
@ -0,0 +1,298 @@
|
|||
/*
|
||||
* Copyright 2005-2013 ECMWF.
|
||||
*
|
||||
* This software is licensed under the terms of the Apache Licence Version 2.0
|
||||
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
|
||||
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
|
||||
*/
|
||||
|
||||
int grib_decode_long_array(const unsigned char* p, long *bitp, long bitsPerValue,
|
||||
size_t n_vals,long* val) {
|
||||
long i=0;
|
||||
unsigned long lvalue = 0;
|
||||
|
||||
if(bitsPerValue%8)
|
||||
{
|
||||
int j=0;
|
||||
for(i=0;i < n_vals;i++) {
|
||||
lvalue=0;
|
||||
for(j=0; j< bitsPerValue;j++){
|
||||
lvalue <<= 1;
|
||||
if(grib_get_bit( p, *bitp)) lvalue += 1;
|
||||
*bitp += 1;
|
||||
}
|
||||
val[i] = lvalue;
|
||||
}
|
||||
} else {
|
||||
int bc;
|
||||
int l = bitsPerValue/8;
|
||||
size_t o = *bitp/8;
|
||||
|
||||
for(i=0;i < n_vals;i++)
|
||||
{
|
||||
lvalue = 0;
|
||||
lvalue <<= 8;
|
||||
lvalue |= p[o++] ;
|
||||
|
||||
for ( bc=1; bc<l; bc++ )
|
||||
{
|
||||
lvalue <<= 8;
|
||||
lvalue |= p[o++] ;
|
||||
}
|
||||
val[i] = lvalue;
|
||||
}
|
||||
*bitp+=bitsPerValue*n_vals;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int grib_decode_double_array(const unsigned char* p, long *bitp, long bitsPerValue,
|
||||
double reference_value,double s,double d,
|
||||
size_t n_vals,double* val) {
|
||||
long i=0;
|
||||
unsigned long lvalue = 0;
|
||||
double x;
|
||||
|
||||
if(bitsPerValue%8)
|
||||
{
|
||||
int j=0;
|
||||
for(i=0;i < n_vals;i++) {
|
||||
lvalue=0;
|
||||
for(j=0; j< bitsPerValue;j++){
|
||||
lvalue <<= 1;
|
||||
if(grib_get_bit( p, *bitp)) lvalue += 1;
|
||||
*bitp += 1;
|
||||
}
|
||||
x=((lvalue*s)+reference_value)*d;
|
||||
val[i] = (double)x;
|
||||
}
|
||||
} else {
|
||||
int bc;
|
||||
int l = bitsPerValue/8;
|
||||
size_t o = 0;
|
||||
|
||||
for(i=0;i < n_vals;i++)
|
||||
{
|
||||
lvalue = 0;
|
||||
lvalue <<= 8;
|
||||
lvalue |= p[o++] ;
|
||||
|
||||
for ( bc=1; bc<l; bc++ )
|
||||
{
|
||||
lvalue <<= 8;
|
||||
lvalue |= p[o++] ;
|
||||
}
|
||||
x=((lvalue*s)+reference_value)*d;
|
||||
val[i] = (double)x;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* code from grib_bits_fast_big_endian_simple.c */
|
||||
int grib_decode_double_array_complex(const unsigned char* p, long *bitp, long nbits,double reference_value,double s,double* d,size_t size,double* val) {
|
||||
long i=0;
|
||||
long countOfLeftmostBits=0,leftmostBits=0;
|
||||
long startBit;
|
||||
long remainingBits = nbits;
|
||||
long *pp=(long*)p;
|
||||
int inited=0;
|
||||
unsigned long uval=0;
|
||||
|
||||
startBit=*bitp;
|
||||
remainingBits = nbits;
|
||||
if (startBit >= max_nbits) {
|
||||
pp+=startBit/max_nbits;
|
||||
startBit %= max_nbits;
|
||||
}
|
||||
|
||||
if ( (max_nbits%nbits == 0) && (*bitp%nbits == 0) ) {
|
||||
for (i=0;i<size;i++) {
|
||||
if (startBit == max_nbits) {
|
||||
startBit = 0;
|
||||
pp++;
|
||||
}
|
||||
|
||||
val[i]=VALUE(*pp,startBit,remainingBits);
|
||||
val[i]= ((( (val[i]) * s)+reference_value)*d[i/2]);
|
||||
startBit+=remainingBits;
|
||||
remainingBits=nbits;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (i=0;i<size;i++) {
|
||||
countOfLeftmostBits = startBit + remainingBits;
|
||||
if (countOfLeftmostBits > max_nbits) {
|
||||
countOfLeftmostBits = max_nbits - startBit;
|
||||
remainingBits -= countOfLeftmostBits;
|
||||
leftmostBits=(VALUE(*pp,startBit,countOfLeftmostBits)) << remainingBits;
|
||||
startBit = 0;
|
||||
pp++;
|
||||
} else
|
||||
leftmostBits = 0;
|
||||
|
||||
val[i]=leftmostBits+(VALUE(*pp,startBit,remainingBits));
|
||||
val[i]= ((( (val[i]) * s)+reference_value)*d[i/2]);
|
||||
startBit+=remainingBits;
|
||||
remainingBits=nbits;
|
||||
}
|
||||
}
|
||||
|
||||
*bitp+=size*nbits;
|
||||
|
||||
return GRIB_SUCCESS;
|
||||
}
|
||||
|
||||
int grib_encode_long_array(size_t n_vals,const long* val,long bits_per_value,unsigned char* p,long *off)
|
||||
{
|
||||
size_t i=0;
|
||||
unsigned long unsigned_val=0;
|
||||
unsigned char *encoded=p;
|
||||
if(bits_per_value%8){
|
||||
for(i=0;i< n_vals;i++){
|
||||
unsigned_val=val[i];
|
||||
grib_encode_unsigned_longb(encoded, unsigned_val, off , bits_per_value);
|
||||
}
|
||||
} else{
|
||||
for(i=0;i< n_vals;i++){
|
||||
int blen=0;
|
||||
blen = bits_per_value;
|
||||
unsigned_val = val[i];
|
||||
while(blen >= 8)
|
||||
{
|
||||
blen -= 8;
|
||||
*encoded = (unsigned_val >> blen);
|
||||
encoded++;
|
||||
*off+=8;
|
||||
}
|
||||
}
|
||||
}
|
||||
return GRIB_SUCCESS;
|
||||
}
|
||||
|
||||
#define U_BYTEORDER static union {unsigned long l; unsigned char c[sizeof(long)];} u_byteorder = {1}
|
||||
#define IS_BIGENDIAN() (u_byteorder.c[sizeof(long) - 1])
|
||||
#define GRIBPACK unsigned char
|
||||
#define Error(x, y) fprintf(stderr, x, y)
|
||||
#define restrict
|
||||
|
||||
#include "encode_double_array.c"
|
||||
|
||||
#undef U_BYTEORDER
|
||||
#undef IS_BIGENDIAN
|
||||
#undef GRIBPACK
|
||||
#undef Error
|
||||
#undef restrict
|
||||
|
||||
int grib_encode_double_array(size_t n_vals,const double* val,long bits_per_value,double reference_value,double d,double divisor,unsigned char* p,long *off)
|
||||
{
|
||||
if ( fabs(d - 1) > 0 ) /* d != 1 */
|
||||
{
|
||||
size_t i=0;
|
||||
unsigned long unsigned_val=0;
|
||||
unsigned char *encoded=p;
|
||||
double x;
|
||||
|
||||
if(bits_per_value%8){
|
||||
for(i=0;i< n_vals;i++){
|
||||
x=(((val[i]*d)-reference_value)*divisor)+0.5;
|
||||
unsigned_val = (unsigned long)x;
|
||||
grib_encode_unsigned_longb(encoded, unsigned_val, off , bits_per_value);
|
||||
}
|
||||
} else{
|
||||
for(i=0;i< n_vals;i++){
|
||||
int blen=0;
|
||||
blen = bits_per_value;
|
||||
x = ((((val[i]*d)-reference_value)*divisor)+0.5);
|
||||
unsigned_val = (unsigned long)x;
|
||||
while(blen >= 8)
|
||||
{
|
||||
blen -= 8;
|
||||
*encoded = (unsigned_val >> blen);
|
||||
encoded++;
|
||||
*off+=8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined (_ARCH_PWR6)
|
||||
encode_double_array_unrolled(bits_per_value, 0, n_vals, p, val, reference_value, divisor, off);
|
||||
#else
|
||||
encode_double_array_byte (bits_per_value, 0, n_vals, p, val, reference_value, divisor, off);
|
||||
#endif
|
||||
}
|
||||
|
||||
return GRIB_SUCCESS;
|
||||
}
|
||||
|
||||
/* code from grib_bits_fast_big_endian_simple.c */
|
||||
int grib_encode_double_array_complex(size_t n_vals,double* val,long nbits,double reference_value,
|
||||
double* scal,double d,double divisor,unsigned char* p,long *bitp) {
|
||||
long* destination = (long*)p;
|
||||
double* v=val;
|
||||
long countOfLeftmostBits=0,startBit=0,remainingBits=0,rightmostBits=0;
|
||||
unsigned long uval=0;
|
||||
size_t i=0;
|
||||
|
||||
startBit=*bitp;
|
||||
remainingBits = nbits;
|
||||
|
||||
if (startBit >= max_nbits) {
|
||||
destination += startBit / max_nbits;
|
||||
startBit %= max_nbits;
|
||||
}
|
||||
|
||||
if ( (max_nbits%nbits == 0) && (*bitp%nbits == 0) ) {
|
||||
|
||||
for(i=0;i< n_vals;i++) {
|
||||
uval = (unsigned long)(((((*v)*d*scal[i/2])-reference_value)*divisor)+0.5);
|
||||
if (startBit == max_nbits) {
|
||||
startBit = 0;
|
||||
destination++;
|
||||
}
|
||||
|
||||
rightmostBits = VALUE(uval,max_nbits-remainingBits,remainingBits);
|
||||
*destination = ((*destination) & ~MASKVALUE(startBit,remainingBits))
|
||||
+ (rightmostBits << max_nbits-(remainingBits+startBit));
|
||||
startBit+=remainingBits;
|
||||
remainingBits=nbits;
|
||||
|
||||
v++;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for(i=0;i< n_vals;i++) {
|
||||
countOfLeftmostBits = startBit + remainingBits;
|
||||
uval = (unsigned long)(((((*v)*d*scal[i/2])-reference_value)*divisor)+0.5);
|
||||
if (countOfLeftmostBits > max_nbits) {
|
||||
countOfLeftmostBits = max_nbits - startBit;
|
||||
startBit = max_nbits - remainingBits;
|
||||
remainingBits -= countOfLeftmostBits;
|
||||
*destination = (((*destination) >> countOfLeftmostBits) << countOfLeftmostBits)
|
||||
+ (VALUE(uval,startBit,countOfLeftmostBits));
|
||||
startBit = 0;
|
||||
destination++;
|
||||
}
|
||||
|
||||
rightmostBits = VALUE(uval,max_nbits-remainingBits,remainingBits);
|
||||
*destination = ((*destination) & ~MASKVALUE(startBit,remainingBits))
|
||||
+ (rightmostBits << max_nbits-(remainingBits+startBit));
|
||||
startBit+=remainingBits;
|
||||
remainingBits=nbits;
|
||||
|
||||
v++;
|
||||
}
|
||||
}
|
||||
|
||||
*bitp+=n_vals*nbits;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,332 @@
|
|||
/*
|
||||
* Copyright 2005-2013 ECMWF.
|
||||
*
|
||||
* This software is licensed under the terms of the Apache Licence Version 2.0
|
||||
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
*
|
||||
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
|
||||
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
|
||||
*/
|
||||
|
||||
|
||||
#ifdef __PGI
|
||||
#undef __AVX__
|
||||
#undef __SSE2__
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
#include <libhpc.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <immintrin.h>
|
||||
#ifdef _GET_X86_COUNTER
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#else
|
||||
#ifdef __SSE2__
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <emmintrin.h>
|
||||
#ifdef _GET_X86_COUNTER
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __AVX__
|
||||
static
|
||||
void avx_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
|
||||
{
|
||||
__m256d current_max, current_min, work;
|
||||
|
||||
/* load max and min values into all four slots of the XMM registers */
|
||||
current_min = _mm256_set1_pd(*min);
|
||||
current_max = _mm256_set1_pd(*max);
|
||||
|
||||
/* Work input until "buf" reaches 32 byte alignment */
|
||||
while ( ((unsigned long)buf) % 32 != 0 && nframes > 0) {
|
||||
|
||||
/* Load the next double into the work buffer */
|
||||
work = _mm256_set1_pd(*buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf++;
|
||||
nframes--;
|
||||
}
|
||||
|
||||
while (nframes >= 16) {
|
||||
/* use 64 byte prefetch? */
|
||||
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
|
||||
|
||||
work = _mm256_load_pd(buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf += 4;
|
||||
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
|
||||
work = _mm256_load_pd(buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf += 4;
|
||||
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
|
||||
work = _mm256_load_pd(buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf += 4;
|
||||
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
|
||||
work = _mm256_load_pd(buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf += 4;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
/* work through aligned buffers */
|
||||
while (nframes >= 4) {
|
||||
work = _mm256_load_pd(buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf += 4;
|
||||
nframes -= 4;
|
||||
}
|
||||
|
||||
/* work through the remainung values */
|
||||
while ( nframes > 0) {
|
||||
work = _mm256_set1_pd(*buf);
|
||||
current_min = _mm256_min_pd(current_min, work);
|
||||
current_max = _mm256_max_pd(current_max, work);
|
||||
buf++;
|
||||
nframes--;
|
||||
}
|
||||
|
||||
/* find min & max value through shuffle tricks */
|
||||
|
||||
work = current_min;
|
||||
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
work = _mm256_min_pd (work, current_min);
|
||||
current_min = work;
|
||||
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
work = _mm256_min_pd (work, current_min);
|
||||
_mm256_store_pd(min, work);
|
||||
work = current_max;
|
||||
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
work = _mm256_max_pd (work, current_max);
|
||||
current_max = work;
|
||||
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
work = _mm256_max_pd (work, current_max);
|
||||
_mm256_store_pd(max, work);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __SSE2__
|
||||
static
|
||||
void sse2_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
|
||||
{
|
||||
__m128d current_max, current_min, work;
|
||||
|
||||
/* load starting max and min values into all slots of the XMM registers */
|
||||
current_min = _mm_set1_pd(*min);
|
||||
current_max = _mm_set1_pd(*max);
|
||||
|
||||
/* work on input until buf reaches 16 byte alignment */
|
||||
while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
|
||||
|
||||
/* load one double and replicate */
|
||||
work = _mm_set1_pd(*buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf++;
|
||||
nframes--;
|
||||
}
|
||||
|
||||
while (nframes >= 8) {
|
||||
/* use 64 byte prefetch for double octetts */
|
||||
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2 + */
|
||||
|
||||
work = _mm_load_pd(buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf += 2;
|
||||
work = _mm_load_pd(buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf += 2;
|
||||
work = _mm_load_pd(buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf += 2;
|
||||
work = _mm_load_pd(buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf += 2;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
/* work through smaller chunks of aligned buffers without prefetching */
|
||||
while (nframes >= 2) {
|
||||
work = _mm_load_pd(buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf += 2;
|
||||
nframes -= 2;
|
||||
}
|
||||
|
||||
/* work through the remaining value */
|
||||
while ( nframes > 0) {
|
||||
/* load the last double and replicate */
|
||||
work = _mm_set1_pd(*buf);
|
||||
current_min = _mm_min_pd(current_min, work);
|
||||
current_max = _mm_max_pd(current_max, work);
|
||||
buf++;
|
||||
nframes--;
|
||||
}
|
||||
|
||||
/* find final min and max value through shuffle tricks */
|
||||
work = current_min;
|
||||
work = _mm_shuffle_pd(work, work, _MM_SHUFFLE2(0, 1));
|
||||
work = _mm_min_pd (work, current_min);
|
||||
_mm_store_sd(min, work);
|
||||
work = current_max;
|
||||
work = _mm_shuffle_pd(work, work, _MM_SHUFFLE2(0, 1));
|
||||
work = _mm_max_pd (work, current_max);
|
||||
_mm_store_sd(max, work);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
static
|
||||
void minmax_val(const double *restrict data, long datasize, double *fmin, double *fmax)
|
||||
{
|
||||
#ifdef _GET_X86_COUNTER
|
||||
uint64_t start_minmax, end_minmax;
|
||||
#endif
|
||||
|
||||
if ( datasize < 1 ) return;
|
||||
|
||||
#ifdef _GET_X86_COUNTER
|
||||
start_minmax = _rdtsc();
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
|
||||
avx_minmax_val(data, datasize, fmin, fmax);
|
||||
|
||||
#else
|
||||
|
||||
|
||||
#ifdef __SSE2__
|
||||
|
||||
sse2_minmax_val(data, datasize, fmin, fmax);
|
||||
|
||||
#else
|
||||
|
||||
#ifdef _ARCH_PWR6
|
||||
#define __UNROLL_DEPTH_1 6
|
||||
|
||||
/* to allow pipelining we have to unroll */
|
||||
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(1, "minmax fsel");
|
||||
#endif
|
||||
{
|
||||
long i, j;
|
||||
long residual = datasize % __UNROLL_DEPTH_1;
|
||||
long ofs = datasize - residual;
|
||||
double register dmin[__UNROLL_DEPTH_1];
|
||||
double register dmax[__UNROLL_DEPTH_1];
|
||||
|
||||
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
|
||||
{
|
||||
dmin[j] = data[0];
|
||||
dmax[j] = data[0];
|
||||
}
|
||||
|
||||
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_1 )
|
||||
{
|
||||
for (j = 0; j < __UNROLL_DEPTH_1; j++)
|
||||
{
|
||||
dmin[j] = __fsel(dmin[j] - data[i+j], data[i+j], dmin[j]);
|
||||
dmax[j] = __fsel(data[i+j] - dmax[j], data[i+j], dmax[j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (j = 0; j < residual; j++)
|
||||
{
|
||||
dmin[j] = __fsel(dmin[j] - data[ofs+j], data[ofs+j], dmin[j]);
|
||||
dmax[j] = __fsel(data[ofs+j] - dmax[j], data[ofs+j], dmax[j]);
|
||||
}
|
||||
|
||||
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
|
||||
{
|
||||
*fmin = __fsel(*fmin - dmin[j], dmin[j], *fmin);
|
||||
*fmax = __fsel(dmax[j] - *fmax, dmax[j], *fmax);
|
||||
}
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(1);
|
||||
#endif
|
||||
|
||||
#undef __UNROLL_DEPTH_1
|
||||
#else
|
||||
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStart(1, "minmax base");
|
||||
#endif
|
||||
{
|
||||
long i;
|
||||
|
||||
#if defined (CRAY)
|
||||
#pragma _CRI ivdep
|
||||
#elif defined (SX)
|
||||
#pragma vdir nodep
|
||||
#elif defined (__uxp__)
|
||||
#pragma loop novrec
|
||||
#endif
|
||||
for ( i = 0; i < datasize; ++i )
|
||||
{
|
||||
if ( *fmin > data[i] ) *fmin = data[i];
|
||||
if ( *fmax < data[i] ) *fmax = data[i];
|
||||
/*
|
||||
*fmin = *fmin < data[i] ? *fmin : data[i];
|
||||
*fmax = *fmax > data[i] ? *fmax : data[i];
|
||||
*/
|
||||
}
|
||||
}
|
||||
#ifdef _GET_IBM_COUNTER
|
||||
hpmStop(1);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _GET_X86_COUNTER
|
||||
end_minmax = _rdtsc();
|
||||
#ifdef __AVX__
|
||||
printf("AVX cycles:: %" PRIu64 "\n",
|
||||
end_minmax-start_minmax);
|
||||
#else
|
||||
#ifdef __SSE2__
|
||||
printf("SSE2 cycles:: %" PRIu64 "\n",
|
||||
end_minmax-start_minmax);
|
||||
#else
|
||||
printf("loop cycles:: %" PRIu64 "\n",
|
||||
end_minmax-start_minmax);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
Loading…
Reference in New Issue