Zserio C++ runtime library  1.1.0
Built for Zserio 2.15.0
FloatUtil.cpp
Go to the documentation of this file.
1 #include <cstring>
2 
3 #include "zserio/FloatUtil.h"
4 
namespace zserio
{

// Bit layout of IEEE 754 half precision (float16): 1 sign bit, 5 exponent bits, 10 significand bits.
static const uint16_t FLOAT16_SIGN_MASK = UINT16_C(0x8000);
static const uint16_t FLOAT16_EXPONENT_MASK = UINT16_C(0x7C00);
static const uint16_t FLOAT16_SIGNIFICAND_MASK = UINT16_C(0x03FF);

static const uint16_t FLOAT16_SIGN_BIT_POSITION = UINT16_C(15);
static const uint16_t FLOAT16_EXPONENT_BIT_POSITION = UINT16_C(10);

// The significand occupies all bits below the exponent field.
static const uint16_t FLOAT16_SIGNIFICAND_NUM_BITS = FLOAT16_EXPONENT_BIT_POSITION;

// Exponent field value (all ones) reserved for infinity and NaN.
static const uint16_t FLOAT16_EXPONENT_INFINITY_NAN = UINT16_C(0x001F);
static const uint16_t FLOAT16_EXPONENT_BIAS = UINT16_C(15);

// Bit layout of IEEE 754 single precision (float32): 1 sign bit, 8 exponent bits, 23 significand bits.
static const uint32_t FLOAT32_SIGN_MASK = UINT32_C(0x80000000);
static const uint32_t FLOAT32_EXPONENT_MASK = UINT32_C(0x7F800000);
static const uint32_t FLOAT32_SIGNIFICAND_MASK = UINT32_C(0x007FFFFF);

static const uint32_t FLOAT32_SIGN_BIT_POSITION = UINT32_C(31);
static const uint32_t FLOAT32_EXPONENT_BIT_POSITION = UINT32_C(23);

// The significand occupies all bits below the exponent field.
static const uint32_t FLOAT32_SIGNIFICAND_NUM_BITS = FLOAT32_EXPONENT_BIT_POSITION;

// Exponent field value (all ones) reserved for infinity and NaN.
static const uint32_t FLOAT32_EXPONENT_INFINITY_NAN = UINT32_C(0x00FF);
static const uint32_t FLOAT32_EXPONENT_BIAS = UINT32_C(127);

// Reinterprets the given 32 bits as a float by copying the object representation.
//
// float32Value: Raw bits of a single precision float.
// Returns the float having exactly these bits.
float convertUInt32ToFloat(uint32_t float32Value)
{
    float convertedFloat = 0.0F;
    (void)std::memcpy(&convertedFloat, &float32Value, sizeof(uint32_t));

    return convertedFloat;
}

// Returns the raw 32 bits of the given float by copying the object representation.
//
// float32: Single precision float to inspect.
// Returns the bits of float32 stored in uint32_t.
uint32_t convertFloatToUInt32(float float32)
{
    uint32_t float32Value = 0;
    (void)std::memcpy(&float32Value, &float32, sizeof(float));

    return float32Value;
}

// Reinterprets the given 64 bits as a double by copying the object representation.
//
// float64Value: Raw bits of a double precision float.
// Returns the double having exactly these bits.
double convertUInt64ToDouble(uint64_t float64Value)
{
    double convertedDouble = 0.0;
    (void)std::memcpy(&convertedDouble, &float64Value, sizeof(uint64_t));

    return convertedDouble;
}

// Returns the raw 64 bits of the given double by copying the object representation.
//
// float64: Double precision float to inspect.
// Returns the bits of float64 stored in uint64_t.
uint64_t convertDoubleToUInt64(double float64)
{
    uint64_t float64Value = 0;
    (void)std::memcpy(&float64Value, &float64, sizeof(double));

    return float64Value;
}

// Converts a half precision float given as raw 16 bits to a single precision float.
//
// Every float16 value is representable in float32, so the conversion is exact: float16 subnormals
// are normalized, infinity stays infinity and a NaN keeps its (left-aligned) significand bits.
//
// float16Value: Raw bits of a half precision float.
// Returns the corresponding single precision float.
float convertUInt16ToFloat(uint16_t float16Value)
{
    // decompose half precision float (float16)
    const uint16_t sign16Shifted = (float16Value & FLOAT16_SIGN_MASK);
    const uint16_t exponent16 = static_cast<uint16_t>(
            static_cast<uint16_t>(float16Value & FLOAT16_EXPONENT_MASK) >> FLOAT16_EXPONENT_BIT_POSITION);
    const uint16_t significand16 = (float16Value & FLOAT16_SIGNIFICAND_MASK);

    // calculate significand for single precision float (float32)
    uint32_t significand32 = static_cast<uint32_t>(significand16)
            << (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS);

    // calculate exponent for single precision float (float32)
    uint32_t exponent32 = 0;
    if (exponent16 == 0)
    {
        if (significand32 != 0)
        {
            // subnormal (denormal) number will be normalized
            exponent32 = 1 + FLOAT32_EXPONENT_BIAS - FLOAT16_EXPONENT_BIAS; // biased form of exponent -14
            // shift significand until leading bit overflows into exponent bit
            while ((significand32 & (FLOAT32_SIGNIFICAND_MASK + 1)) == 0)
            {
                exponent32--;
                significand32 <<= 1U;
            }
            // mask out overflowed leading bit from significand (normalized has implicit leading bit 1)
            significand32 &= FLOAT32_SIGNIFICAND_MASK;
        }
        // else: signed zero, exponent and significand both stay zero
    }
    else if (exponent16 == FLOAT16_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent32 = FLOAT32_EXPONENT_INFINITY_NAN;
    }
    else
    {
        // normal number, only the exponent bias changes
        exponent32 = static_cast<uint32_t>(exponent16) - FLOAT16_EXPONENT_BIAS + FLOAT32_EXPONENT_BIAS;
    }

    // compose single precision float (float32)
    const uint32_t sign32Shifted = static_cast<uint32_t>(sign16Shifted)
            << (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION);
    const uint32_t exponent32Shifted = exponent32 << FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t float32Value = sign32Shifted | exponent32Shifted | significand32;

    // convert it to float
    return convertUInt32ToFloat(float32Value);
}

// Converts a single precision float to a half precision float returned as raw 16 bits.
//
// Behavior by input class:
//  - zero and float32 subnormals become signed zero,
//  - infinity stays infinity and NaN stays NaN (a NaN whose payload would be truncated away
//    gets the lowest significand bit set so that it does not turn into infinity),
//  - finite values too large for float16 become signed infinity,
//  - values with magnitude below 2^-24 (smallest float16 subnormal) become signed zero,
//  - remaining small values become float16 subnormals,
//  - otherwise the significand is shortened to 10 bits; when the highest dropped bit is set,
//    the result is incremented by one unit in the last place.
//
// float32: Single precision float to convert.
// Returns raw bits of the resulting half precision float.
uint16_t convertFloatToUInt16(float float32)
{
    const uint32_t float32Value = convertFloatToUInt32(float32);

    // decompose single precision float (float32)
    const uint32_t sign32Shifted = (float32Value & FLOAT32_SIGN_MASK);
    const uint32_t exponent32 = (float32Value & FLOAT32_EXPONENT_MASK) >> FLOAT32_EXPONENT_BIT_POSITION;
    const uint32_t significand32 = (float32Value & FLOAT32_SIGNIFICAND_MASK);

    // calculate significand for half precision float (float16) by dropping the low bits
    uint16_t significand16 = static_cast<uint16_t>(
            (significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS)));

    // calculate exponent for half precision float (float16)
    bool needsRounding = false;
    uint16_t exponent16 = 0;
    if (exponent32 == 0)
    {
        if (significand32 != 0)
        {
            // subnormal (denormal) number will be zero
            significand16 = 0;
        }
    }
    else if (exponent32 == FLOAT32_EXPONENT_INFINITY_NAN)
    {
        // infinity or NaN
        exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
        if (significand32 != 0 && significand16 == 0)
        {
            // NaN payload was only in the dropped low bits, a zero significand would mean infinity
            significand16 = UINT16_C(1);
        }
    }
    else
    {
        // normal number
        const int16_t signedExponent16 = static_cast<int16_t>(static_cast<int32_t>(exponent32) -
                static_cast<int32_t>(FLOAT32_EXPONENT_BIAS) + static_cast<int32_t>(FLOAT16_EXPONENT_BIAS));
        if (signedExponent16 >= static_cast<int16_t>(FLOAT16_EXPONENT_INFINITY_NAN))
        {
            // exponent overflow (the all-ones exponent is reserved), finite value becomes infinity;
            // the significand must be cleared, otherwise the result would be a NaN
            exponent16 = FLOAT16_EXPONENT_INFINITY_NAN;
            significand16 = 0;
        }
        else if (signedExponent16 <= 0)
        {
            // exponent underflow
            if (signedExponent16 <= static_cast<int16_t>(-FLOAT16_SIGNIFICAND_NUM_BITS))
            {
                // too big underflow, set to zero
                significand16 = 0;
            }
            else
            {
                // we can still use subnormal numbers, make the implicit leading bit explicit
                const uint32_t fullSignificand32 = significand32 | (FLOAT32_SIGNIFICAND_MASK + 1);
                const uint32_t significandShift = static_cast<uint32_t>(1 - signedExponent16);
                significand16 = static_cast<uint16_t>(fullSignificand32 >>
                        (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS + significandShift));

                // round up when the highest dropped bit is set
                needsRounding =
                        ((fullSignificand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS +
                                                       significandShift - 1)) &
                                UINT32_C(1)) != 0;
            }
        }
        else
        {
            // exponent ok
            exponent16 = static_cast<uint16_t>(signedExponent16);
            // round up when the highest dropped bit is set
            needsRounding =
                    ((significand32 >> (FLOAT32_SIGNIFICAND_NUM_BITS - FLOAT16_SIGNIFICAND_NUM_BITS - 1)) &
                            UINT32_C(1)) != 0;
        }
    }

    // compose half precision float (float16)
    const uint16_t sign16Shifted =
            static_cast<uint16_t>(sign32Shifted >> (FLOAT32_SIGN_BIT_POSITION - FLOAT16_SIGN_BIT_POSITION));
    const uint16_t exponent16Shifted = static_cast<uint16_t>(exponent16 << FLOAT16_EXPONENT_BIT_POSITION);
    uint16_t float16Value = static_cast<uint16_t>(sign16Shifted | exponent16Shifted | significand16);

    // check rounding
    if (needsRounding)
    {
        // a carry out of the significand correctly bumps the exponent, might overflow to infinity
        ++float16Value;
    }

    return float16Value;
}

} // namespace zserio
uint64_t convertDoubleToUInt64(double float64)
Definition: FloatUtil.cpp:193
uint32_t convertFloatToUInt32(float float32)
Definition: FloatUtil.cpp:177
float convertUInt32ToFloat(uint32_t float32Value)
Definition: FloatUtil.cpp:169
double convertUInt64ToDouble(uint64_t float64Value)
Definition: FloatUtil.cpp:185
uint16_t convertFloatToUInt16(float float32)
Definition: FloatUtil.cpp:83
float convertUInt16ToFloat(uint16_t float16Value)
Definition: FloatUtil.cpp:32