Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- #include <assert.h>
- #include <cmath>
- #include <limits>
- #include <stdint.h>
- #include <stdio.h>
- #include <string.h>
- static_assert(std::numeric_limits<float>::has_quiet_NaN, "need quiet nan");
// Returns a value of type T whose `count` least-significant bits are set and
// whose remaining bits are clear, e.g. ones<uint16_t>(10) == 0x03FF.
//
// T is expected to be an unsigned integer type. `count == 0` yields 0 and any
// `count` at or above the bit width of T yields all ones; both cases are
// handled explicitly because shifting by the full width (or by a wrapped
// negative amount) is undefined behaviour.
template <typename T> inline constexpr T ones(unsigned count) {
  constexpr unsigned num_bits = sizeof(T) << 3;
  if (count == 0) {
    return T(0);
  }
  if (count >= num_bits) {
    return static_cast<T>(~T(0));
  }
  // The inner cast undoes integer promotion of `~T(0)` for narrow T so that
  // the right shift brings in zeros rather than sign bits.
  return static_cast<T>(static_cast<T>(~T(0)) >> (num_bits - count));
}
// A 16-bit floating point value stored as its raw bit pattern `_n`:
//   bit 15      sign
//   bits 14..10 exponent (biased by 15; 0 = zero/denormal, 31 = Inf/NaN)
//   bits 9..0   fraction
//
// Comparisons follow the usual floating point rules: +0 and -0 compare equal
// and any comparison involving a NaN is false (except `!=`, which is true).
struct Float16 {
  uint16_t _n;

  // Masks selecting each field of `_n`.
  static constexpr uint16_t FRAC_MASK = 0x03FF;
  static constexpr uint16_t EXP_MASK = 0x7C00;
  static constexpr uint16_t SIGN_MASK = 0x8000;

  // Trivial default constructor: `_n` is left uninitialized. Keeping it
  // defaulted also keeps `Float16{bits}` brace-initialization working.
  Float16() = default;

  // Builds a value directly from a raw 16-bit pattern.
  static constexpr Float16 from_bits(uint16_t bits) { return Float16{bits}; }

  // Returns zero
  static constexpr Float16 zero() { return from_bits(0); }

  // Returns the infinity representation
  static constexpr Float16 positive_infinity() { return from_bits(EXP_MASK); }

  // Returns the negative infinity representation
  static constexpr Float16 negative_infinity() {
    return from_bits(static_cast<uint16_t>(SIGN_MASK | EXP_MASK));
  }

  // Minimum normalized number that is greater than 0 (exponent field 1,
  // fraction 0).
  static constexpr Float16 min_normal() {
    Float16 f{0};
    f.set_f(0);
    f.set_e(1);
    return f;
  }

  // Maximum normalized number that's less than infinity (exponent field 30,
  // fraction all ones).
  static constexpr Float16 max_normal() {
    Float16 f{0};
    f.set_f(FRAC_MASK);
    f.set_e(30);
    return f;
  }

  // Smallest positive denormal (exponent field 0, fraction 1).
  static constexpr Float16 min_denormal() {
    Float16 f{0};
    f.set_f(1);
    return f;
  }

  // Converts a 32-bit float to the nearest Float16, rounding ties to even.
  //  - Inf stays Inf; NaN stays NaN (top payload bits kept, quiet bit set).
  //  - Magnitudes too large for a finite Float16 become infinity.
  //  - Magnitudes below the normal range become denormals, or signed zero
  //    when they round to nothing.
  // Not constexpr: reading the bits of a float through memcpy cannot be
  // evaluated at compile time.
  static Float16 from_float32(float r) {
    static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits");
    uint32_t r_bits = 0;
    memcpy(&r_bits, &r, sizeof(r_bits));

    const uint32_t r_frac = r_bits & 0x007FFFFFu;
    const uint32_t r_exp = (r_bits >> 23) & 0xFFu;
    const uint32_t sign = (r_bits >> 31) << 15; // 0 or 0x8000

    if (r_exp == 0xFFu) {
      if (r_frac == 0) {
        return from_bits(static_cast<uint16_t>(sign | EXP_MASK));
      }
      // NaN: OR in the quiet bit so the fraction can never collapse to 0
      // (which would turn the NaN into an infinity).
      return from_bits(
          static_cast<uint16_t>(sign | EXP_MASK | 0x0200u | (r_frac >> 13)));
    }

    // Re-bias the exponent from 127 (float32) to 15 (Float16).
    const int32_t half_exp = static_cast<int32_t>(r_exp) - 127 + 15;

    if (half_exp >= 31) {
      // Too large to represent: overflow to infinity.
      return from_bits(static_cast<uint16_t>(sign | EXP_MASK));
    }

    if (half_exp <= 0) {
      if (half_exp < -10) {
        // Below half of the smallest denormal (this also covers float32
        // zeros and denormals): rounds to signed zero.
        return from_bits(static_cast<uint16_t>(sign));
      }
      // Denormal result: make the implicit leading 1 explicit and shift the
      // 24-bit significand down to the fixed denormal scale.
      const uint32_t mant = r_frac | 0x00800000u;
      const uint32_t shift = static_cast<uint32_t>(14 - half_exp); // 14..24
      uint32_t half_frac = mant >> shift;
      const uint32_t rem = mant & ((uint32_t(1) << shift) - 1);
      const uint32_t halfway = uint32_t(1) << (shift - 1);
      if (rem > halfway || (rem == halfway && (half_frac & 1u) != 0)) {
        // A carry out of the fraction lands in the exponent field and
        // yields the smallest normal, which is the correct result.
        ++half_frac;
      }
      return from_bits(static_cast<uint16_t>(sign | half_frac));
    }

    // Normal result: keep the top 10 fraction bits, round on the 13 dropped.
    uint32_t half = (static_cast<uint32_t>(half_exp) << 10) | (r_frac >> 13);
    const uint32_t rem = r_frac & 0x1FFFu;
    if (rem > 0x1000u || (rem == 0x1000u && (half & 1u) != 0)) {
      // A carry may ripple into the exponent; at the top of the range that
      // produces the infinity pattern, again the correct rounding.
      ++half;
    }
    return from_bits(static_cast<uint16_t>(sign | half));
  }

  // Fractional part
  constexpr uint16_t f() const { return _n & FRAC_MASK; }

  // Exponent part
  constexpr uint16_t e() const { return (_n & EXP_MASK) >> 10; }

  // Sign bit
  constexpr uint16_t s() const { return (_n & SIGN_MASK) >> 15; }

  // Replaces the fraction field. Bits of `f` above the low 10 are ignored so
  // they cannot spill into the exponent or sign.
  constexpr void set_f(uint16_t f) {
    _n = static_cast<uint16_t>((_n & ~FRAC_MASK) | (f & FRAC_MASK));
  }

  // Replaces the exponent field. Only the low 5 bits of `e` are used.
  constexpr void set_e(uint16_t e) {
    _n = static_cast<uint16_t>((_n & ~EXP_MASK) | ((e << 10) & EXP_MASK));
  }

  // Replaces the sign bit. Only the low bit of `s` is used.
  constexpr void set_s(uint16_t s) {
    _n = static_cast<uint16_t>((_n & ~SIGN_MASK) | ((s << 15) & SIGN_MASK));
  }

  // Returns the next representable number. Returns 0 if `this` is max
  // representable number. Only works with positive numbers for now.
  inline constexpr Float16 next() const;

  // Convert to a native float
  inline operator float() const;

  // True if denormal
  inline constexpr bool is_denormal() const;

  // True if NaN
  inline constexpr bool is_nan() const { return e() == 31 && f() != 0; }

// Just keeping this method of comparison here for interest. These work ok
// except two constraints - they treat NaN as being greater than Inf, and
// -0 is strictly less than +0. Second constraint is definitely the more
// unacceptable one. (From the book "Hacker's Delight")
#if 0
  constexpr bool operator==(const Float16 &b) const {
    if (is_nan() || b.is_nan())
      return false;
    return _n == b._n;
  }
  constexpr bool operator!=(const Float16 &b) const { return !(*this == b); }
  constexpr bool operator<(const Float16 &b) const {
    const bool ge0 = s() == 0;
    return (ge0 && int16_t(_n) < int16_t(b._n)) || (!ge0 && uint16_t(_n) > uint16_t(b._n));
  }
  constexpr bool operator<=(const Float16 &b) const {
    const bool ge0 = s() == 0;
    return (ge0 && int16_t(_n) <= int16_t(b._n)) || (!ge0 && uint16_t(_n) >= uint16_t(b._n));
  }
#endif

  // Equal when neither side is NaN and either the bit patterns match or both
  // sides are zeros (of any sign).
  constexpr bool operator==(const Float16 &b) const {
    if (is_nan() || b.is_nan()) {
      return false;
    }
    return _n == b._n || ((_n | b._n) & ~SIGN_MASK) == 0;
  }

  constexpr bool operator!=(const Float16 &b) const { return !(*this == b); }

  // Strict ordering on the raw patterns: for two non-negative values a larger
  // pattern is a larger number, for two negative values it is a smaller one,
  // and a negative value is below any non-negative one.
  constexpr bool operator<(const Float16 &b) const {
    if (is_nan() || b.is_nan()) {
      return false;
    }
    if (((_n | b._n) & ~SIGN_MASK) == 0) {
      return false; // +0 and -0 are equal, neither is less
    }
    const bool a_neg = s() != 0;
    const bool b_neg = b.s() != 0;
    if (a_neg != b_neg) {
      return a_neg;
    }
    return a_neg ? _n > b._n : _n < b._n;
  }

  constexpr bool operator<=(const Float16 &b) const {
    if (is_nan() || b.is_nan()) {
      return false;
    }
    return !(b < *this);
  }

  // Expressed by swapping operands (rather than negating `<=` / `<`) so that
  // comparisons against NaN stay false.
  constexpr bool operator>(const Float16 &b) const { return b < *this; }
  constexpr bool operator>=(const Float16 &b) const { return b <= *this; }

  // Arithmetic operators are only declared here; their definitions are not
  // part of this struct.
  inline constexpr Float16 operator+(const Float16 &b) const;
  inline constexpr Float16 operator-(const Float16 &b) const;
  inline constexpr Float16 operator*(const Float16 &b) const;
  inline constexpr Float16 operator/(const Float16 &b) const;
};
- struct Float16Parts {
- uint16_t f, e, s;
- constexpr Float16Parts(Float16 f16) : f(f16.f()), e(f16.e()), s(f16.s()) {}
- constexpr operator Float16() const {
- Float16 f16{f};
- f16._n |= e << 10;
- f16._n |= s << 15;
- return f16;
- }
- constexpr void increment() {
- constexpr uint16_t MAX_F = ones<uint16_t>(10);
- constexpr uint16_t MAX_E = ones<uint16_t>(5);
- if (f != MAX_F) {
- ++f;
- } else if (e != MAX_E) {
- f = 0;
- ++e;
- } else {
- f = e = s = 0;
- }
- }
- operator float() const {
- const float sign = s == 1 ? -1.0f : 1.0f;
- if (e == 0) {
- if (f == 0) {
- return sign * 0.0f;
- } else {
- // Denormal number
- return sign * std::pow(2.0f, -14.0f) * float(f) / std::pow(2.0f, 10.0f);
- }
- } else if (e == 31) {
- if (f == 0) {
- return sign * std::numeric_limits<float>::infinity();
- } else {
- return std::numeric_limits<float>::quiet_NaN();
- }
- } else {
- return sign * std::pow(2.0, float(e) - 15.0) *
- (1.0 + float(f) / std::pow(2.0f, 10.0f));
- }
- }
- };
- constexpr Float16 Float16::next() const {
- Float16Parts p(*this);
- p.increment();
- return p;
- }
- Float16::operator float() const { return float(Float16Parts(*this)); }
- constexpr bool Float16::is_denormal() const {
- Float16Parts p = Float16Parts(*this);
- return p.e == 0 && p.f != 0;
- }
- int main() {
- Float16 f = Float16::zero();
- uint16_t last_e = f.e();
- printf("e = %u\n", f.e());
- while (f != Float16::max_normal()) {
- printf("%.10f\n", (float)f);
- f = f.next();
- if (f.e() != last_e) {
- printf("e = %u\n", f.e());
- last_e = f.e();
- }
- }
- printf("%.5f\n", (float)f);
- printf("# Min normal = %.10f\n", float(Float16::min_normal()));
- printf("# Max normal = %.10f\n", float(Float16::max_normal()));
- }
Add Comment
Please, Sign In to add comment