FloatUtil.java

package zserio.runtime;

/**
 * The class provides helper methods for manipulating floating-point numbers.
 *
 * The following floating-point formats defined by the IEEE 754 standard are supported:
 *
 * - half precision floating-point format (https://en.wikipedia.org/wiki/Half-precision_floating-point_format)
 * - single precision floating-point format (https://en.wikipedia.org/wiki/Single-precision_floating-point_format)
 * - double precision floating-point format (https://en.wikipedia.org/wiki/Double-precision_floating-point_format)
 */
public final class FloatUtil
{
    /**
     * Converts 16-bit float stored in short value to 32-bit float.
     *
     * The conversion is exact: half precision subnormal numbers are normalized, signed zeros and infinities
     * keep their sign and a NaN bit pattern is mapped to a NaN.
     *
     * @param float16Value Half precision float value stored in short to convert.
     *
     * @return Converted single precision float.
     */
    public static float convertShortToFloat(short float16Value)
    {
        // decompose half precision float (float16)
        final short sign16Shifted = (short)(float16Value & FLOAT16_SIGN_MASK);
        final short exponent16 =
                (short)((float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
        final short significand16 = (short)(float16Value & FLOAT16_SIGNIFICAND_MASK);

        // calculate significand for single precision float (float32)
        int significand32 = ((int)significand16)
                << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);

        // calculate exponent for single precision float (float32)
        int exponent32;
        if (exponent16 == 0)
        {
            if (significand32 != 0)
            {
                // subnormal (denormal) number will be normalized;
                // start from the biased float32 form of the float16 subnormal exponent (-14)
                exponent32 = 1 - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
                // shift significand until leading bit overflows into exponent bit
                while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
                {
                    exponent32--;
                    significand32 <<= 1;
                }
                // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
                significand32 &= FLOAT32_SIGNIFICAND_MASK;
            }
            else
            {
                // zero
                exponent32 = 0;
            }
        }
        else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
        {
            // infinity or NaN (the significand decides which one and is kept as is)
            exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
        }
        else
        {
            // normal number, only the exponent bias changes
            exponent32 = exponent16 - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
        }

        // compose single precision float (float32);
        // the sign extension of the negative short is shifted out, only the sign bit remains
        final int sign32Shifted = (int)(sign16Shifted)
                << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
        final int exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
        final int float32Value = sign32Shifted | exponent32Shifted | significand32;

        // convert it to float
        return convertIntToFloat(float32Value);
    }

    /**
     * Converts 32-bit float to 16-bit float stored in short value.
     *
     * The conversion behaves as follows:
     *
     * - infinity and NaN are preserved (including the sign)
     * - finite numbers whose magnitude is too big for half precision are converted to signed infinity
     * - numbers which are too small for half precision (including all single precision subnormal numbers)
     *   are converted to signed zero
     * - small numbers are converted to half precision subnormal numbers when possible
     * - the significand is rounded up in magnitude when the highest discarded bit is set
     *
     * @param float32 Single precision float to convert.
     *
     * @return Converted half precision float value stored in short.
     */
    public static short convertFloatToShort(float float32)
    {
        final int float32Value = convertFloatToInt(float32);

        // decompose single precision float (float32)
        final int sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
        final int exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
        final int significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);

        // calculate significand for half precision float (float16), the low bits are truncated here
        short significand16 =
                (short)((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));

        // calculate exponent for half precision float (float16)
        boolean needsRounding = false;
        short exponent16;
        if (exponent32 == 0)
        {
            if (significand32 != 0)
            {
                // subnormal (denormal) number will be zero
                significand16 = 0;
            }
            exponent16 = 0;
        }
        else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
        {
            // infinity or NaN; Float.floatToIntBits returns the canonical NaN (0x7fc00000) whose
            // highest significand bit survives the truncation, so NaN stays NaN
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
        }
        else
        {
            // normal number
            final short signedExponent16 = (short)(exponent32 - FLOAT32_EXPONENT_BIAS + FLOAT16_EXPONENT_BIAS);
            if (signedExponent16 >= FLOAT16_EXPONENT_INFINITY_NAN)
            {
                // exponent overflow: the magnitude is not representable in half precision, set infinity;
                // the significand must be cleared, otherwise the result would be a NaN bit pattern
                exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
                significand16 = 0;
            }
            else if (signedExponent16 <= 0)
            {
                // exponent underflow
                if (signedExponent16 <= (short)(-FLOAT16_SIGNIFICAND_NUM_BITS))
                {
                    // too big underflow, set to zero
                    exponent16 = 0;
                    significand16 = 0;
                }
                else
                {
                    // we can still use subnormal numbers: make the implicit leading bit explicit
                    // and shift the significand right to compensate the missing exponent range
                    exponent16 = 0;
                    final int fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
                    final int significandShift = 1 - signedExponent16;
                    significand16 = (short)(fullSignificand32 >>
                            (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));

                    // round according to the highest bit which has been shifted out
                    needsRounding =
                            ((fullSignificand32 >>
                                     (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
                                             significandShift - 1)) &
                                    0x01) != 0;
                }
            }
            else
            {
                // exponent ok, round according to the highest truncated significand bit
                exponent16 = signedExponent16;
                needsRounding =
                        ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
                                0x01) != 0;
            }
        }

        // compose half precision float (float16)
        final short sign16Shifted =
                (short)(sign32Shifted >>> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
        final short exponent16Shifted = (short)(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
        short float16Value = (short)(sign16Shifted | exponent16Shifted | significand16);

        // check rounding; the carry may propagate from the significand into the exponent, which turns
        // the biggest subnormal into the smallest normal number and the biggest normal number into infinity
        if (needsRounding)
        {
            float16Value += 1;
        }

        return float16Value;
    }

    /**
     * Converts 32-bit float stored in int value to 32-bit float.
     *
     * @param float32Value Single precision float value stored in int to convert.
     *
     * @return Converted single precision float.
     */
    public static float convertIntToFloat(int float32Value)
    {
        return Float.intBitsToFloat(float32Value);
    }

    /**
     * Converts 32-bit float to 32-bit float stored in int value.
     *
     * The conversion uses Float.floatToIntBits, so all NaN values are collapsed to the single canonical
     * NaN bit pattern.
     *
     * @param float32 Single precision float to convert.
     *
     * @return Converted single precision float value stored in int.
     */
    public static int convertFloatToInt(float float32)
    {
        return Float.floatToIntBits(float32);
    }

    /**
     * Converts 64-bit float (double) stored in long value to 64-bit float (double).
     *
     * @param float64Value Double precision float value stored in long to convert.
     *
     * @return Converted double precision float.
     */
    public static double convertLongToDouble(long float64Value)
    {
        return Double.longBitsToDouble(float64Value);
    }

    /**
     * Converts 64-bit float (double) to 64-bit float (double) stored in long value.
     *
     * The conversion uses Double.doubleToLongBits, so all NaN values are collapsed to the single canonical
     * NaN bit pattern.
     *
     * @param float64 Double precision float to convert.
     *
     * @return Converted double precision float value stored in long.
     */
    public static long convertDoubleToLong(double float64)
    {
        return Double.doubleToLongBits(float64);
    }

    // half precision float (float16) layout: 1 sign bit, 5 exponent bits, 10 significand bits
    private static final short FLOAT16_SIGN_MASK = (short)0x8000;
    private static final short FLOAT16_EXPONENT_MASK = (short)0x7C00;
    private static final short FLOAT16_SIGNIFICAND_MASK = (short)0x03FF;

    private static final short FLOAT16_SIGN_BIT_POSITION = 15;
    private static final short FLOAT16_EXPONENT_BIT_POSITION = 10;

    private static final short FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

    // exponent value with all bits set, reserved for infinity and NaN
    private static final short FLOAT16_EXPONENT_INFINITY_NAN = (short)0x001F;
    private static final short FLOAT16_EXPONENT_BIAS = 15;

    // single precision float (float32) layout: 1 sign bit, 8 exponent bits, 23 significand bits
    private static final int FLOAT32_SIGN_MASK = 0x80000000;
    private static final int FLOAT32_EXPONENT_MASK = 0x7F800000;
    private static final int FLOAT32_SIGNIFICAND_MASK = 0x007FFFFF;

    private static final int FLOAT32_SIGN_BIT_POSITION = 31;
    private static final int FLOAT32_EXPONENT_BIT_POSITION = 23;

    private static final int FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

    // exponent value with all bits set, reserved for infinity and NaN
    private static final int FLOAT32_EXPONENT_INFINITY_NAN = 0x00FF;
    private static final int FLOAT32_EXPONENT_BIAS = 127;
}