Zserio C++ runtime library  1.0.2
Built for Zserio 2.14.1
FloatUtil.cpp
Go to the documentation of this file.
#include "zserio/FloatUtil.h"

#include <cstring>
2 
3 namespace zserio
4 {
5 
// ---- half precision float (float16) bit layout: 1 sign bit, 5 exponent bits, 10 significand bits ----

// masks selecting the individual bit fields of a float16 value
static const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
static const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
static const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

// positions of the lowest bit of the sign and exponent fields
static const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
static const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// the significand occupies all bits below the exponent field
static const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// exponent field value reserved for infinity and NaN (all exponent bits set) and the exponent bias
static const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
static const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// ---- single precision float (float32) bit layout: 1 sign bit, 8 exponent bits, 23 significand bits ----

// masks selecting the individual bit fields of a float32 value
static const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
static const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
static const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

// positions of the lowest bit of the sign and exponent fields
static const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
static const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// the significand occupies all bits below the exponent field
static const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// exponent field value reserved for infinity and NaN (all exponent bits set) and the exponent bias
static const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
static const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);
29 
30 float convertUInt16ToFloat(uint16_t float16Value)
31 {
32  // decompose half precision float (float16)
33  const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK);
34  const uint16_t exponent16 = static_cast<uint16_t>(
35  static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
36  const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);
37 
38  // calculate significand for single precision float (float32)
39  uint32_t significand32 = static_cast<uint32_t>(significand16)
40  << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);
41 
42  // calculate exponent for single precision float (float32)
43  uint32_t exponent32 = 0;
44  if (exponent16 == 0)
45  {
46  if (significand32 != 0)
47  {
48  // subnormal (denormal) number will be normalized
49  exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // exp is initialized by -14
50  // shift significand until leading bit overflows into exponent bit
51  while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
52  {
53  exponent32--;
54  significand32 <<= 1U;
55  }
56  // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
57  significand32 &= FLOAT32_SIGNIFICAND_MASK;
58  }
59  }
60  else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
61  {
62  // infinity or NaN
63  exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
64  }
65  else
66  {
67  // normal number
68  exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
69  }
70 
71  // compose single precision float (float32)
72  const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
73  << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
74  const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
75  const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32;
76 
77  // convert it to float
78  return convertUInt32ToFloat(float32Value);
79 }
80 
81 uint16_t convertFloatToUInt16(float float32)
82 {
83  const uint32_t float32Value = convertFloatToUInt32(float32);
84 
85  // decompose single precision float (float32)
86  const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
87  const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
88  const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);
89 
90  // calculate significand for half precision float (float16)
91  uint16_t significand16 = static_cast<uint16_t>(
92  (significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));
93 
94  // calculate exponent for half precision float (float16)
95  bool needsRounding = false;
96  uint16_t exponent16 = 0;
97  if (exponent32 == 0)
98  {
99  if (significand32 != 0)
100  {
101  // subnormal (denormal) number will be zero
102  significand16 = 0;
103  }
104  }
105  else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
106  {
107  // infinity or NaN
108  exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
109  }
110  else
111  {
112  // normal number
113  const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
114  static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
115  if (signedExponent16 > FLOAT16_EXPONENT_INFINITY_NAN)
116  {
117  // exponent overflow, set infinity or NaN
118  exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
119  }
120  else if (signedExponent16 <= 0)
121  {
122  // exponent underflow
123  if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
124  {
125  // too big underflow, set to zero
126  significand16 = 0;
127  }
128  else
129  {
130  // we can still use subnormal numbers
131  const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
132  const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
133  significand16 = static_cast<uint16_t>(fullSignificand32 >>
134  (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));
135 
136  needsRounding =
137  ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
138  significandShift - 1)) &
139  UINT32_C(1)) != 0;
140  }
141  }
142  else
143  {
144  // exponent ok
145  exponent16 = static_cast<uint16_t>(signedExponent16);
146  needsRounding =
147  ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
148  UINT32_C(1)) != 0;
149  }
150  }
151 
152  // compose half precision float (float16)
153  const uint16_t sign16Shifted =
154  static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
155  const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
156  uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted) | significand16;
157 
158  // check rounding
159  if (needsRounding)
160  {
161  ++float16Value; // might overflow to infinity
162  }
163 
164  return float16Value;
165 }
166 
/**
 * Reinterprets the given 32-bit pattern as a single precision float.
 *
 * The bytes are copied instead of being read through a casted pointer, because accessing an uint32_t
 * object through a pointer to float breaks the strict aliasing rule and is undefined behavior.
 *
 * \param float32Value Raw bits of the single precision float.
 *
 * \return Float with the same object representation as the given value.
 */
float convertUInt32ToFloat(uint32_t float32Value)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

    float convertedFloat = 0.0F;
    std::memcpy(&convertedFloat, &float32Value, sizeof(convertedFloat));

    return convertedFloat;
}
173 
/**
 * Gets the raw 32-bit pattern of the given single precision float.
 *
 * The bytes are copied instead of being read through a casted pointer, because accessing a float
 * object through a pointer to uint32_t breaks the strict aliasing rule and is undefined behavior.
 *
 * \param float32 Single precision float to convert.
 *
 * \return Unsigned integer with the same object representation as the given float.
 */
uint32_t convertFloatToUInt32(float float32)
{
    static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

    uint32_t float32Value = 0;
    std::memcpy(&float32Value, &float32, sizeof(float32Value));

    return float32Value;
}
180 
/**
 * Reinterprets the given 64-bit pattern as a double precision float.
 *
 * The bytes are copied instead of being read through a casted pointer, because accessing an uint64_t
 * object through a pointer to double breaks the strict aliasing rule and is undefined behavior.
 *
 * \param float64Value Raw bits of the double precision float.
 *
 * \return Double with the same object representation as the given value.
 */
double convertUInt64ToDouble(uint64_t float64Value)
{
    static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be 64 bits wide");

    double convertedDouble = 0.0;
    std::memcpy(&convertedDouble, &float64Value, sizeof(convertedDouble));

    return convertedDouble;
}
187 
/**
 * Gets the raw 64-bit pattern of the given double precision float.
 *
 * The bytes are copied instead of being read through a casted pointer, because accessing a double
 * object through a pointer to uint64_t breaks the strict aliasing rule and is undefined behavior.
 *
 * \param float64 Double precision float to convert.
 *
 * \return Unsigned integer with the same object representation as the given double.
 */
uint64_t convertDoubleToUInt64(double float64)
{
    static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be 64 bits wide");

    uint64_t float64Value = 0;
    std::memcpy(&float64Value, &float64, sizeof(float64Value));

    return float64Value;
}
194 
195 } // namespace zserio
uint64_t convertDoubleToUInt64(double float64)
Definition: FloatUtil.cpp:188
uint32_t convertFloatToUInt32(float float32)
Definition: FloatUtil.cpp:174
float convertUInt32ToFloat(uint32_t float32Value)
Definition: FloatUtil.cpp:167
double convertUInt64ToDouble(uint64_t float64Value)
Definition: FloatUtil.cpp:181
uint16_t convertFloatToUInt16(float float32)
Definition: FloatUtil.cpp:81
float convertUInt16ToFloat(uint16_t float16Value)
Definition: FloatUtil.cpp:30