GRIB-153: gribapi patch to speed up encoding on IBM P6/7

This commit is contained in:
Shahram Najm 2013-07-31 15:15:45 +01:00
parent cf3a27b03b
commit 7175b9c664
10 changed files with 1732 additions and 496 deletions

8
NOTICE
View File

@ -1,7 +1,11 @@
ECMWF GRIB API
Copyright 2005-2013 ECMWF.
This product includes software developed at ECMWF (http://www.ecmwf.int).
This product includes software developed at
ECMWF (http://www.ecmwf.int).
Parts of the definitions provided by WMO (http://www.wmo.int/pages/index_en.html)
Parts of the definitions provided by
WMO (http://www.wmo.int/pages/index_en.html)
IBM POWER optimisations were developed at
The Max-Planck-Institute for Meteorology.

View File

@ -89,6 +89,19 @@ fi
AC_DEFINE_UNQUOTED(GRIB_PTHREADS,$GRIB_PTHREADS,1->pthreads enabled 0->pthreads disabled)
AC_DEFINE_UNQUOTED(GRIB_LINUX_PTHREADS,$GRIB_LINUX_PTHREADS,1->pthreads enabled 0->pthreads disabled)
dnl check IBM POWER 6/7 optimisations option
AC_ARG_ENABLE([ibmpower67_opt],
[AS_HELP_STRING([--enable-ibmpower67_opt],[enable IBM POWER 6/7 optimisations [by default disabled]])],
[ibmpower67_opts=${enableval}] , [ibmpower67_opts=no]
)
if test "x${ibmpower67_opts}" = xyes; then
GRIB_IBMPOWER67_OPT=1
else
GRIB_IBMPOWER67_OPT=0
fi
AC_DEFINE_UNQUOTED(GRIB_IBMPOWER67_OPT,$GRIB_IBMPOWER67_OPT,1->IBM Power6/7 Optimisations enabled 0->IBM Power6/7 Optimisations disabled)
dnl check on uppercase fortran modules not working to be fixed
dnl some fortran compilers change the name of the .mod file in upper case!
ac_cv_prog_f90_uppercase_mod=no

View File

@ -302,10 +302,16 @@ list( APPEND grib_api_srcs
grib_windef.h
)
# list( APPEND grib_api_extra_srcs
# grib_bits_any_endian.c
# grib_bits_any_endian_simple.c
# )
list( APPEND grib_api_extra_srcs
grib_bits_fast_big_endian.c grib_bits_any_endian.c
grib_bits_fast_big_endian_vector.c grib_bits_any_endian_vector.c
grib_bits_fast_big_endian_simple.c grib_bits_any_endian_simple.c
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c
encode_double_array.c
grib_bits_ibmpow.c
grib_bits_ibmpow_opt.c
minmax_val.c
)
configure_file( grib_api_version.c.in grib_api_version.c @ONLY )
@ -315,20 +321,15 @@ ecbuild_generate_yy( YYPREFIX grib_yy
DEPENDANT action.c )
ecbuild_add_library(TARGET grib_api
SOURCES
grib_api_version.c
SOURCES grib_api_version.c
griby.c gribl.c
${grib_api_extra_srcs}
${grib_api_srcs}
GENERATED
grib_api_version.c
INCLUDES
${GRIB_API_EXTRA_INCLUDE_DIRS}
LIBS
${GRIB_API_EXTRA_LIBRARIES}
GENERATED grib_api_version.c
INCLUDES ${GRIB_API_EXTRA_INCLUDE_DIRS}
LIBS ${GRIB_API_EXTRA_LIBRARIES}
TEMPLATES ${grib_api_extra_srcs}
${CMATH_LIBRARIES} )
# TODO: add support for jpg and png
install( FILES grib_api.h DESTINATION include )

View File

@ -332,7 +332,8 @@ EXTRA_DIST= dummy.am griby.y gribl.l grib_accessor_classes_hash.c \
grib_bits_fast_big_endian.c grib_bits_any_endian.c \
grib_bits_fast_big_endian_vector.c grib_bits_any_endian_vector.c \
grib_bits_fast_big_endian_simple.c grib_bits_any_endian_simple.c \
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c
grib_bits_fast_big_endian_omp.c grib_bits_any_endian_omp.c \
encode_double_array.c grib_bits_ibmpow.c grib_bits_ibmpow_opt.c minmax_val.c
include $(DEVEL_RULES)

427
src/encode_double_array.c Normal file
View File

@ -0,0 +1,427 @@
/*
* Copyright 2005-2013 ECMWF.
*
* This software is licensed under the terms of the Apache Licence Version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
*
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
*/
#ifdef _GET_IBM_COUNTER
#include <libhpc.h>
#endif
static
void encode_double_array_common(int numBits, long packStart, long datasize, GRIBPACK *lGrib,
const double *data, double zref, double factor, long *gz)
{
long i, z = *gz;
unsigned int ival;
int cbits, jbits;
unsigned int c;
static unsigned int mask[] = {0,1,3,7,15,31,63,127,255};
/* code from gribw routine flist2bitstream */
cbits = 8;
c = 0;
for ( i = packStart; i < datasize; i++ )
{
/* note float -> unsigned int .. truncate */
ival = (unsigned int) ((data[i] - zref) * factor + 0.5);
/*
if ( ival > max_nbpv_pow2 ) ival = max_nbpv_pow2;
if ( ival < 0 ) ival = 0;
*/
jbits = numBits;
while ( cbits <= jbits )
{
if ( cbits == 8 )
{
jbits -= 8;
lGrib[z++] = (ival >> jbits) & 0xFF;
}
else
{
jbits -= cbits;
lGrib[z++] = (c << cbits) + ((ival >> jbits) & mask[cbits]);
cbits = 8;
c = 0;
}
}
/* now jbits < cbits */
if ( jbits )
{
c = (c << jbits) + (ival & mask[jbits]);
cbits -= jbits;
}
}
if ( cbits != 8 ) lGrib[z++] = c << cbits;
*gz = z;
}
static
void encode_double_array_byte(int numBits, long packStart, long datasize,
GRIBPACK *restrict lGrib,
const double *restrict data,
double zref, double factor, long *restrict gz)
{
long i, z = *gz;
unsigned long ival;
double tmp;
data += packStart;
datasize -= packStart;
if ( numBits == 8 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(2, "pack 8 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; i++ )
{
tmp = ((data[i] - zref) * factor + 0.5);
ival = (unsigned long) tmp;
lGrib[z ] = ival;
z++;
}
#ifdef _GET_IBM_COUNTER
hpmStop(2);
#endif
}
else if ( numBits == 16 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(3, "pack 16 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; i++ )
{
tmp = ((data[i] - zref) * factor + 0.5);
ival = (unsigned long) tmp;
lGrib[z ] = ival >> 8;
lGrib[z+1] = ival;
z += 2;
}
#ifdef _GET_IBM_COUNTER
hpmStop(3);
#endif
}
else if ( numBits == 24 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(4, "pack 24 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; i++ )
{
tmp = ((data[i] - zref) * factor + 0.5);
ival = (unsigned long) tmp;
lGrib[z ] = ival >> 16;
lGrib[z+1] = ival >> 8;
lGrib[z+2] = ival;
z += 3;
}
#ifdef _GET_IBM_COUNTER
hpmStop(4);
#endif
}
else if ( numBits == 32 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(5, "pack 32 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; i++ )
{
tmp = ((data[i] - zref) * factor + 0.5);
ival = (unsigned long) tmp;
lGrib[z ] = ival >> 24;
lGrib[z+1] = ival >> 16;
lGrib[z+2] = ival >> 8;
lGrib[z+3] = ival;
z += 4;
}
#ifdef _GET_IBM_COUNTER
hpmStop(5);
#endif
}
else if ( numBits > 0 && numBits <= 32 )
{
encode_double_array_common(numBits, 0, datasize, lGrib,
data, zref, factor, &z);
}
else if ( numBits == 0 )
{
}
else
{
Error("Unimplemented packing factor %d!", numBits);
}
*gz = z;
}
static
void encode_double_array_unrolled(int numBits, long packStart, long datasize,
GRIBPACK *restrict lGrib,
const double *restrict data,
double zref, double factor, long *restrict gz)
{
U_BYTEORDER;
long i, j, z = *gz;
double tmp;
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_2 8
#else
#define __UNROLL_DEPTH_2 8
#endif
data += packStart;
datasize -= packStart;
{
long residual = datasize % __UNROLL_DEPTH_2;
long ofs = datasize - residual;
double dval[__UNROLL_DEPTH_2];
unsigned long ival;
/* reducing FP operations to single FMA is slowing down on pwr6 ... */
if ( numBits == 8 )
{
unsigned char *cgrib = (unsigned char *) (lGrib + z);
#ifdef _GET_IBM_COUNTER
hpmStart(2, "pack 8 bit unrolled");
#endif
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
dval[j] = ((data[i+j] - zref) * factor + 0.5);
}
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
*cgrib = (unsigned long) dval[j];
cgrib++;
z++;
}
}
for (j = 0; j < residual; j++)
{
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
}
for (j = 0; j < residual; j++)
{
*cgrib = (unsigned long) dval[j];
cgrib++;
z++;
}
#ifdef _GET_IBM_COUNTER
hpmStop(2);
#endif
}
else if ( numBits == 16 )
{
unsigned short *sgrib = (unsigned short *) (lGrib + z);
#ifdef _GET_IBM_COUNTER
hpmStart(3, "pack 16 bit unrolled");
#endif
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
dval[j] = ((data[i+j] - zref) * factor + 0.5);
}
if ( IS_BIGENDIAN() )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
*sgrib = (unsigned long) dval[j];
sgrib++;
z += 2;
}
}
else
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 8;
lGrib[z+1] = ival;
z += 2;
}
}
}
for (j = 0; j < residual; j++)
{
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
}
if ( IS_BIGENDIAN() )
{
for (j = 0; j < residual; j++)
{
*sgrib = (unsigned long) dval[j];
sgrib++;
z += 2;
}
}
else
{
for (j = 0; j < residual; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 8;
lGrib[z+1] = ival;
z += 2;
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(3);
#endif
}
else if ( numBits == 24 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(4, "pack 24 bit unrolled");
#endif
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
dval[j] = ((data[i+j] - zref) * factor + 0.5);
}
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 16;
lGrib[z+1] = ival >> 8;
lGrib[z+2] = ival;
z += 3;
}
}
for (j = 0; j < residual; j++)
{
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
}
for (j = 0; j < residual; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 16;
lGrib[z+1] = ival >> 8;
lGrib[z+2] = ival;
z += 3;
}
#ifdef _GET_IBM_COUNTER
hpmStop(4);
#endif
}
else if ( numBits == 32 )
{
#ifdef _GET_IBM_COUNTER
hpmStart(5, "pack 32 bit unrolled");
#endif
unsigned int *igrib = (unsigned int *) (lGrib + z);
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
dval[j] = ((data[i+j] - zref) * factor + 0.5);
}
if ( IS_BIGENDIAN() )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
*igrib = (unsigned long) dval[j];
igrib++;
z += 4;
}
}
else
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 24;
lGrib[z+1] = ival >> 16;
lGrib[z+2] = ival >> 8;
lGrib[z+3] = ival;
z += 4;
}
}
}
for (j = 0; j < residual; j++)
{
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
}
if ( IS_BIGENDIAN() )
{
for (j = 0; j < residual; j++)
{
*igrib = (unsigned long) dval[j];
igrib++;
z += 4;
}
}
else
{
for (j = 0; j < residual; j++)
{
ival = (unsigned long) dval[j];
lGrib[z ] = ival >> 24;
lGrib[z+1] = ival >> 16;
lGrib[z+2] = ival >> 8;
lGrib[z+3] = ival;
z += 4;
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(5);
#endif
}
else if ( numBits > 0 && numBits <= 32 )
{
encode_double_array_common(numBits, 0, datasize, lGrib,
data, zref, factor, &z);
}
else if ( numBits == 0 )
{
}
else
{
Error("Unimplemented packing factor %d!", numBits);
}
}
*gz = z;
#undef __UNROLL_DEPTH_2
}

View File

@ -422,6 +422,11 @@ static int unpack_double(grib_accessor* a, double* val, size_t *len) {
return _unpack_double(a,val,len,buf,pos,nvals);
}
#if GRIB_IBMPOWER67_OPT
#define restrict
#include "minmax_val.c"
#undef restrict
#endif
static int pack_double(grib_accessor* a, const double* val, size_t *len)
{
@ -474,10 +479,14 @@ static int pack_double(grib_accessor* a, const double* val, size_t *len)
max = val[0];
min = max;
#if GRIB_IBMPOWER67_OPT
minmax_val(val+1, n_vals-1, &min, &max);
#else
for(i=1;i< n_vals;i++) {
if (val[i] > max ) max = val[i];
if (val[i] < min ) min = val[i];
}
#endif
/* constant field only reference_value is set and bits_per_value=0 */
if(max==min) {
@ -641,6 +650,4 @@ static int pack_double(grib_accessor* a, const double* val, size_t *len)
return GRIB_SUCCESS;
}

View File

@ -139,7 +139,6 @@ long grib_decode_signed_longb(const unsigned char* p, long *bitp, long nbits)
if (sign)val = -val;
return val;
}
@ -159,6 +158,11 @@ int grib_encode_signed_longb(unsigned char* p, long val ,long *bitp, long nb)
return grib_encode_unsigned_longb(p, val ,bitp, nb-1);
}
#if GRIB_IBMPOWER67_OPT
#include "grib_bits_ibmpow.c"
#else
#if FAST_BIG_ENDIAN
#include "grib_bits_fast_big_endian.c"
@ -169,3 +173,4 @@ int grib_encode_signed_longb(unsigned char* p, long val ,long *bitp, long nb)
#endif
#endif

148
src/grib_bits_ibmpow.c Normal file
View File

@ -0,0 +1,148 @@
/*
* Copyright 2005-2013 ECMWF.
*
* This software is licensed under the terms of the Apache Licence Version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
*
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
*/
unsigned long grib_decode_unsigned_long(const unsigned char* p, long *bitp, long nbits)
{
int i;
long ret = 0;
long o = *bitp/8;
int l = nbits/8;
if (nbits==0) return 0;
if(nbits > max_nbits)
{
int bits = nbits;
int mod = bits % max_nbits;
if(mod != 0)
{
int e=grib_decode_unsigned_long(p,bitp,mod);
Assert( e == 0);
bits -= mod;
}
while(bits > max_nbits)
{
int e=grib_decode_unsigned_long(p,bitp,max_nbits);
Assert( e == 0);
bits -= max_nbits;
}
return grib_decode_unsigned_long(p,bitp,bits);
}
if((nbits%8 > 0)||(*bitp%8 > 0)) {
for(i=0; i< nbits;i++){
ret <<= 1;
if(grib_get_bit( p, *bitp)) ret += 1;
*bitp += 1;
}
return ret;
}
ret <<= 8;
ret |= p[o++] ;
for ( i=1; i<l; i++ )
{
ret <<= 8;
ret |= p[o++] ;
}
*bitp += nbits;
return ret;
}
int grib_encode_unsigned_long(unsigned char* p, unsigned long val ,long *bitp, long nbits)
{
long len = nbits;
int s = *bitp%8;
int n = 8-s;
unsigned char tmp = 0; /*for temporary results*/
if(nbits > max_nbits)
{
/* TODO: Do some real code here, to support long longs */
int bits = nbits;
int mod = bits % max_nbits;
long zero = 0;
/* printf("Warning: encoding %ld bits=%ld %ld\n",val,nbits,*bitp); */
if(mod != 0)
{
int e=grib_encode_unsigned_long(p,zero,bitp,mod);
/* printf(" -> : encoding %ld bits=%ld %ld\n",zero,(long)mod,*bitp); */
Assert( e == 0);
bits -= mod;
}
while(bits > max_nbits)
{
int e=grib_encode_unsigned_long(p,zero,bitp,max_nbits);
/* printf(" -> : encoding %ld bits=%ld %ld\n",zero,(long)max_nbits,*bitp); */
Assert(e == 0);
bits -= max_nbits;
}
/* printf(" -> : encoding %ld bits=%ld %ld\n",val,(long)bits,*bitp); */
return grib_encode_unsigned_long(p,val,bitp,bits);
}
if(s)
p += (*bitp >> 3); /* skip the bytes */
else
p += (*bitp >> 3); /* skip the bytes */
/* head */
if(s) {
len -= n;
if (len < 0) {
tmp = ((val << -len) | ((*p) & dmasks[n]));
} else {
tmp = ((val >> len) | ((*p) & dmasks[n]));
}
*p = tmp;
(*p)++;
}
/* write the middle words */
while(len >= 8)
{
len -= 8;
*p++ = (val >> len);
}
/* write the end bits */
if(len) *p = (val << (8-len));
*bitp += nbits;
return GRIB_SUCCESS;
}
int grib_encode_unsigned_longb(unsigned char* p, unsigned long val ,long *bitp, long nb)
{
long i = 0;
Assert(nb <= max_nbits);
for(i=nb-1; i >= 0; i--){
if(test(val,i))
grib_set_bit_on (p, bitp);
else
grib_set_bit_off(p, bitp);
}
return GRIB_SUCCESS;
}
#include "grib_bits_ibmpow_opt.c"

298
src/grib_bits_ibmpow_opt.c Normal file
View File

@ -0,0 +1,298 @@
/*
* Copyright 2005-2013 ECMWF.
*
* This software is licensed under the terms of the Apache Licence Version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
*
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
*/
int grib_decode_long_array(const unsigned char* p, long *bitp, long bitsPerValue,
size_t n_vals,long* val) {
long i=0;
unsigned long lvalue = 0;
if(bitsPerValue%8)
{
int j=0;
for(i=0;i < n_vals;i++) {
lvalue=0;
for(j=0; j< bitsPerValue;j++){
lvalue <<= 1;
if(grib_get_bit( p, *bitp)) lvalue += 1;
*bitp += 1;
}
val[i] = lvalue;
}
} else {
int bc;
int l = bitsPerValue/8;
size_t o = *bitp/8;
for(i=0;i < n_vals;i++)
{
lvalue = 0;
lvalue <<= 8;
lvalue |= p[o++] ;
for ( bc=1; bc<l; bc++ )
{
lvalue <<= 8;
lvalue |= p[o++] ;
}
val[i] = lvalue;
}
*bitp+=bitsPerValue*n_vals;
}
return 0;
}
int grib_decode_double_array(const unsigned char* p, long *bitp, long bitsPerValue,
double reference_value,double s,double d,
size_t n_vals,double* val) {
long i=0;
unsigned long lvalue = 0;
double x;
if(bitsPerValue%8)
{
int j=0;
for(i=0;i < n_vals;i++) {
lvalue=0;
for(j=0; j< bitsPerValue;j++){
lvalue <<= 1;
if(grib_get_bit( p, *bitp)) lvalue += 1;
*bitp += 1;
}
x=((lvalue*s)+reference_value)*d;
val[i] = (double)x;
}
} else {
int bc;
int l = bitsPerValue/8;
size_t o = 0;
for(i=0;i < n_vals;i++)
{
lvalue = 0;
lvalue <<= 8;
lvalue |= p[o++] ;
for ( bc=1; bc<l; bc++ )
{
lvalue <<= 8;
lvalue |= p[o++] ;
}
x=((lvalue*s)+reference_value)*d;
val[i] = (double)x;
}
}
return 0;
}
/* code from grib_bits_fast_big_endian_simple.c */
int grib_decode_double_array_complex(const unsigned char* p, long *bitp, long nbits,double reference_value,double s,double* d,size_t size,double* val) {
long i=0;
long countOfLeftmostBits=0,leftmostBits=0;
long startBit;
long remainingBits = nbits;
long *pp=(long*)p;
int inited=0;
unsigned long uval=0;
startBit=*bitp;
remainingBits = nbits;
if (startBit >= max_nbits) {
pp+=startBit/max_nbits;
startBit %= max_nbits;
}
if ( (max_nbits%nbits == 0) && (*bitp%nbits == 0) ) {
for (i=0;i<size;i++) {
if (startBit == max_nbits) {
startBit = 0;
pp++;
}
val[i]=VALUE(*pp,startBit,remainingBits);
val[i]= ((( (val[i]) * s)+reference_value)*d[i/2]);
startBit+=remainingBits;
remainingBits=nbits;
}
} else {
for (i=0;i<size;i++) {
countOfLeftmostBits = startBit + remainingBits;
if (countOfLeftmostBits > max_nbits) {
countOfLeftmostBits = max_nbits - startBit;
remainingBits -= countOfLeftmostBits;
leftmostBits=(VALUE(*pp,startBit,countOfLeftmostBits)) << remainingBits;
startBit = 0;
pp++;
} else
leftmostBits = 0;
val[i]=leftmostBits+(VALUE(*pp,startBit,remainingBits));
val[i]= ((( (val[i]) * s)+reference_value)*d[i/2]);
startBit+=remainingBits;
remainingBits=nbits;
}
}
*bitp+=size*nbits;
return GRIB_SUCCESS;
}
int grib_encode_long_array(size_t n_vals,const long* val,long bits_per_value,unsigned char* p,long *off)
{
size_t i=0;
unsigned long unsigned_val=0;
unsigned char *encoded=p;
if(bits_per_value%8){
for(i=0;i< n_vals;i++){
unsigned_val=val[i];
grib_encode_unsigned_longb(encoded, unsigned_val, off , bits_per_value);
}
} else{
for(i=0;i< n_vals;i++){
int blen=0;
blen = bits_per_value;
unsigned_val = val[i];
while(blen >= 8)
{
blen -= 8;
*encoded = (unsigned_val >> blen);
encoded++;
*off+=8;
}
}
}
return GRIB_SUCCESS;
}
#define U_BYTEORDER static union {unsigned long l; unsigned char c[sizeof(long)];} u_byteorder = {1}
#define IS_BIGENDIAN() (u_byteorder.c[sizeof(long) - 1])
#define GRIBPACK unsigned char
#define Error(x, y) fprintf(stderr, x, y)
#define restrict
#include "encode_double_array.c"
#undef U_BYTEORDER
#undef IS_BIGENDIAN
#undef GRIBPACK
#undef Error
#undef restrict
int grib_encode_double_array(size_t n_vals,const double* val,long bits_per_value,double reference_value,double d,double divisor,unsigned char* p,long *off)
{
if ( fabs(d - 1) > 0 ) /* d != 1 */
{
size_t i=0;
unsigned long unsigned_val=0;
unsigned char *encoded=p;
double x;
if(bits_per_value%8){
for(i=0;i< n_vals;i++){
x=(((val[i]*d)-reference_value)*divisor)+0.5;
unsigned_val = (unsigned long)x;
grib_encode_unsigned_longb(encoded, unsigned_val, off , bits_per_value);
}
} else{
for(i=0;i< n_vals;i++){
int blen=0;
blen = bits_per_value;
x = ((((val[i]*d)-reference_value)*divisor)+0.5);
unsigned_val = (unsigned long)x;
while(blen >= 8)
{
blen -= 8;
*encoded = (unsigned_val >> blen);
encoded++;
*off+=8;
}
}
}
}
else
{
#if defined (_ARCH_PWR6)
encode_double_array_unrolled(bits_per_value, 0, n_vals, p, val, reference_value, divisor, off);
#else
encode_double_array_byte (bits_per_value, 0, n_vals, p, val, reference_value, divisor, off);
#endif
}
return GRIB_SUCCESS;
}
/* code from grib_bits_fast_big_endian_simple.c */
int grib_encode_double_array_complex(size_t n_vals,double* val,long nbits,double reference_value,
double* scal,double d,double divisor,unsigned char* p,long *bitp) {
long* destination = (long*)p;
double* v=val;
long countOfLeftmostBits=0,startBit=0,remainingBits=0,rightmostBits=0;
unsigned long uval=0;
size_t i=0;
startBit=*bitp;
remainingBits = nbits;
if (startBit >= max_nbits) {
destination += startBit / max_nbits;
startBit %= max_nbits;
}
if ( (max_nbits%nbits == 0) && (*bitp%nbits == 0) ) {
for(i=0;i< n_vals;i++) {
uval = (unsigned long)(((((*v)*d*scal[i/2])-reference_value)*divisor)+0.5);
if (startBit == max_nbits) {
startBit = 0;
destination++;
}
rightmostBits = VALUE(uval,max_nbits-remainingBits,remainingBits);
*destination = ((*destination) & ~MASKVALUE(startBit,remainingBits))
+ (rightmostBits << max_nbits-(remainingBits+startBit));
startBit+=remainingBits;
remainingBits=nbits;
v++;
}
} else {
for(i=0;i< n_vals;i++) {
countOfLeftmostBits = startBit + remainingBits;
uval = (unsigned long)(((((*v)*d*scal[i/2])-reference_value)*divisor)+0.5);
if (countOfLeftmostBits > max_nbits) {
countOfLeftmostBits = max_nbits - startBit;
startBit = max_nbits - remainingBits;
remainingBits -= countOfLeftmostBits;
*destination = (((*destination) >> countOfLeftmostBits) << countOfLeftmostBits)
+ (VALUE(uval,startBit,countOfLeftmostBits));
startBit = 0;
destination++;
}
rightmostBits = VALUE(uval,max_nbits-remainingBits,remainingBits);
*destination = ((*destination) & ~MASKVALUE(startBit,remainingBits))
+ (rightmostBits << max_nbits-(remainingBits+startBit));
startBit+=remainingBits;
remainingBits=nbits;
v++;
}
}
*bitp+=n_vals*nbits;
return 0;
}

332
src/minmax_val.c Normal file
View File

@ -0,0 +1,332 @@
/*
* Copyright 2005-2013 ECMWF.
*
* This software is licensed under the terms of the Apache Licence Version 2.0
* which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
*
* In applying this licence, ECMWF does not waive the privileges and immunities granted to it by
* virtue of its status as an intergovernmental organisation nor does it submit to any jurisdiction.
*/
#ifdef __PGI
#undef __AVX__
#undef __SSE2__
#endif
#ifdef _GET_IBM_COUNTER
#include <libhpc.h>
#endif
#ifdef __AVX__
#include <float.h>
#include <stdint.h>
#include <inttypes.h>
#include <immintrin.h>
#ifdef _GET_X86_COUNTER
#include <x86intrin.h>
#endif
#else
#ifdef __SSE2__
#include <float.h>
#include <stdint.h>
#include <inttypes.h>
#include <emmintrin.h>
#ifdef _GET_X86_COUNTER
#include <x86intrin.h>
#endif
#endif
#endif
#ifdef __AVX__
static
void avx_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
{
__m256d current_max, current_min, work;
/* load max and min values into all four slots of the XMM registers */
current_min = _mm256_set1_pd(*min);
current_max = _mm256_set1_pd(*max);
/* Work input until "buf" reaches 32 byte alignment */
while ( ((unsigned long)buf) % 32 != 0 && nframes > 0) {
/* Load the next double into the work buffer */
work = _mm256_set1_pd(*buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf++;
nframes--;
}
while (nframes >= 16) {
/* use 64 byte prefetch? */
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2+ */
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
nframes -= 16;
}
/* work through aligned buffers */
while (nframes >= 4) {
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
nframes -= 4;
}
/* work through the remainung values */
while ( nframes > 0) {
work = _mm256_set1_pd(*buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf++;
nframes--;
}
/* find min & max value through shuffle tricks */
work = current_min;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = _mm256_min_pd (work, current_min);
current_min = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_min_pd (work, current_min);
_mm256_store_pd(min, work);
work = current_max;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = _mm256_max_pd (work, current_max);
current_max = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_max_pd (work, current_max);
_mm256_store_pd(max, work);
return;
}
#else
#ifdef __SSE2__
static
void sse2_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
{
__m128d current_max, current_min, work;
/* load starting max and min values into all slots of the XMM registers */
current_min = _mm_set1_pd(*min);
current_max = _mm_set1_pd(*max);
/* work on input until buf reaches 16 byte alignment */
while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {
/* load one double and replicate */
work = _mm_set1_pd(*buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf++;
nframes--;
}
while (nframes >= 8) {
/* use 64 byte prefetch for double octetts */
__builtin_prefetch(buf+64,0,0); /* for GCC 4.3.2 + */
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf += 2;
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf += 2;
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf += 2;
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf += 2;
nframes -= 8;
}
/* work through smaller chunks of aligned buffers without prefetching */
while (nframes >= 2) {
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf += 2;
nframes -= 2;
}
/* work through the remaining value */
while ( nframes > 0) {
/* load the last double and replicate */
work = _mm_set1_pd(*buf);
current_min = _mm_min_pd(current_min, work);
current_max = _mm_max_pd(current_max, work);
buf++;
nframes--;
}
/* find final min and max value through shuffle tricks */
work = current_min;
work = _mm_shuffle_pd(work, work, _MM_SHUFFLE2(0, 1));
work = _mm_min_pd (work, current_min);
_mm_store_sd(min, work);
work = current_max;
work = _mm_shuffle_pd(work, work, _MM_SHUFFLE2(0, 1));
work = _mm_max_pd (work, current_max);
_mm_store_sd(max, work);
return;
}
#endif
#endif
static
void minmax_val(const double *restrict data, long datasize, double *fmin, double *fmax)
{
#ifdef _GET_X86_COUNTER
uint64_t start_minmax, end_minmax;
#endif
if ( datasize < 1 ) return;
#ifdef _GET_X86_COUNTER
start_minmax = _rdtsc();
#endif
#ifdef __AVX__
avx_minmax_val(data, datasize, fmin, fmax);
#else
#ifdef __SSE2__
sse2_minmax_val(data, datasize, fmin, fmax);
#else
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_1 6
/* to allow pipelining we have to unroll */
#ifdef _GET_IBM_COUNTER
hpmStart(1, "minmax fsel");
#endif
{
long i, j;
long residual = datasize % __UNROLL_DEPTH_1;
long ofs = datasize - residual;
double register dmin[__UNROLL_DEPTH_1];
double register dmax[__UNROLL_DEPTH_1];
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
{
dmin[j] = data[0];
dmax[j] = data[0];
}
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_1 )
{
for (j = 0; j < __UNROLL_DEPTH_1; j++)
{
dmin[j] = __fsel(dmin[j] - data[i+j], data[i+j], dmin[j]);
dmax[j] = __fsel(data[i+j] - dmax[j], data[i+j], dmax[j]);
}
}
for (j = 0; j < residual; j++)
{
dmin[j] = __fsel(dmin[j] - data[ofs+j], data[ofs+j], dmin[j]);
dmax[j] = __fsel(data[ofs+j] - dmax[j], data[ofs+j], dmax[j]);
}
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
{
*fmin = __fsel(*fmin - dmin[j], dmin[j], *fmin);
*fmax = __fsel(dmax[j] - *fmax, dmax[j], *fmax);
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(1);
#endif
#undef __UNROLL_DEPTH_1
#else
#ifdef _GET_IBM_COUNTER
hpmStart(1, "minmax base");
#endif
{
long i;
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; ++i )
{
if ( *fmin > data[i] ) *fmin = data[i];
if ( *fmax < data[i] ) *fmax = data[i];
/*
*fmin = *fmin < data[i] ? *fmin : data[i];
*fmax = *fmax > data[i] ? *fmax : data[i];
*/
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(1);
#endif
#endif
#endif
#endif
#ifdef _GET_X86_COUNTER
end_minmax = _rdtsc();
#ifdef __AVX__
printf("AVX cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#else
#ifdef __SSE2__
printf("SSE2 cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#else
printf("loop cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#endif
#endif
#endif
return;
}