GCC Code Coverage Report
Directory: . Exec Total Coverage
File: src/util/floatingpoint.cpp Lines: 164 221 74.2 %
Date: 2021-08-17 Branches: 167 522 32.0 %

Line Exec Source
1
/******************************************************************************
2
 * Top contributors (to current version):
3
 *   Aina Niemetz, Martin Brain, Haniel Barbosa
4
 * Copyright (c) 2013  University of Oxford
5
 *
6
 * This file is part of the cvc5 project.
7
 *
8
 * Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
9
 * in the top-level source directory and their institutional affiliations.
10
 * All rights reserved.  See the file COPYING in the top-level source
11
 * directory for licensing information.
12
 * ****************************************************************************
13
 *
14
 * A floating-point value.
15
 *
16
 * This file contains the data structures used by the constant and parametric
17
 * types of the floating point theory.
18
 */
19
20
#include "util/floatingpoint.h"
21
22
#include <math.h>
23
24
#include <limits>
25
26
#include "base/check.h"
27
#include "util/floatingpoint_literal_symfpu.h"
28
#include "util/integer.h"
29
30
/* -------------------------------------------------------------------------- */
31
32
namespace cvc5 {
33
34
/* -------------------------------------------------------------------------- */
35
36
96
uint32_t FloatingPoint::getUnpackedExponentWidth(FloatingPointSize& size)
37
{
38
96
  return FloatingPointLiteral::getUnpackedExponentWidth(size);
39
}
40
41
96
uint32_t FloatingPoint::getUnpackedSignificandWidth(FloatingPointSize& size)
42
{
43
96
  return FloatingPointLiteral::getUnpackedSignificandWidth(size);
44
}
45
46
261
FloatingPoint::FloatingPoint(uint32_t d_exp_size,
47
                             uint32_t d_sig_size,
48
261
                             const BitVector& bv)
49
261
    : d_fpl(new FloatingPointLiteral(d_exp_size, d_sig_size, bv))
50
{
51
261
}
52
53
64
FloatingPoint::FloatingPoint(const FloatingPointSize& size, const BitVector& bv)
54
64
    : d_fpl(new FloatingPointLiteral(size, bv))
55
{
56
64
}
57
58
68
FloatingPoint::FloatingPoint(const FloatingPointSize& size,
59
                             const RoundingMode& rm,
60
                             const BitVector& bv,
61
68
                             bool signedBV)
62
68
    : d_fpl(new FloatingPointLiteral(size, rm, bv, signedBV))
63
{
64
68
}
65
66
1612
FloatingPoint::FloatingPoint(FloatingPointLiteral* fpl) { d_fpl.reset(fpl); }
67
68
2804
FloatingPoint::FloatingPoint(const FloatingPoint& fp)
69
{
70
2804
  d_fpl.reset(new FloatingPointLiteral(*fp.d_fpl));
71
2804
}
72
73
38
FloatingPoint::FloatingPoint(const FloatingPointSize& size,
74
                             const RoundingMode& rm,
75
38
                             const Rational& r)
76
{
77
76
  Rational two(2, 1);
78
79
38
  if (r.isZero())
80
  {
81
    // In keeping with the SMT-LIB standard
82
20
    d_fpl.reset(new FloatingPointLiteral(
83
10
        size, FloatingPointLiteral::SpecialConstKind::FPZERO, false));
84
  }
85
  else
86
  {
87
28
    uint32_t negative = (r.sgn() < 0) ? 1 : 0;
88
56
    Rational rabs(r.abs());
89
90
    // Compute the exponent
91
56
    Integer exp(0U);
92
56
    Integer inc(1U);
93
56
    Rational working(1, 1);
94
95
28
    if (rabs != working)
96
    {
97
20
      if (rabs < working)
98
      {
99
148
        while (rabs < working)
100
        {
101
64
          exp -= inc;
102
64
          working /= two;
103
        }
104
      }
105
      else
106
      {
107
        while (rabs >= working)
108
        {
109
          exp += inc;
110
          working *= two;
111
        }
112
        exp -= inc;
113
        working /= two;
114
      }
115
    }
116
117
28
    Assert(working <= rabs);
118
28
    Assert(rabs < working * two);
119
120
    // Work out the number of bits required to represent the exponent for a
121
    // normal number
122
28
    uint32_t expBits = 2;  // No point starting with an invalid amount
123
124
56
    Integer doubleInt(2);
125
28
    if (exp.strictlyPositive())
126
    {
127
      // 1 more than exactly representable with expBits
128
      Integer representable(4);
129
      while (representable <= exp)
130
      {  // hence <=
131
        representable *= doubleInt;
132
        ++expBits;
133
      }
134
    }
135
28
    else if (exp.strictlyNegative())
136
    {
137
40
      Integer representable(-4);  // Exactly representable with expBits + sign
138
                                  // but -2^n and -(2^n - 1) are both subnormal
139
44
      while ((representable + doubleInt) > exp)
140
      {
141
12
        representable *= doubleInt;
142
12
        ++expBits;
143
      }
144
    }
145
28
    ++expBits;  // To allow for sign
146
147
56
    BitVector exactExp(expBits, exp);
148
149
    // Compute the significand.
150
28
    uint32_t sigBits = size.significandWidth() + 2;  // guard and sticky bits
151
56
    BitVector sig(sigBits, 0U);
152
56
    BitVector one(sigBits, 1U);
153
56
    Rational workingSig(0, 1);
154
1140
    for (uint32_t i = 0; i < sigBits - 1; ++i)
155
    {
156
2224
      Rational mid(workingSig + working);
157
158
1112
      if (mid <= rabs)
159
      {
160
392
        sig = sig.setBit(0, true);
161
392
        workingSig = mid;
162
      }
163
164
1112
      sig = sig.leftShift(one);
165
1112
      working /= two;
166
    }
167
168
    // Compute the sticky bit
169
56
    Rational remainder(rabs - workingSig);
170
28
    Assert(Rational(0, 1) <= remainder);
171
172
28
    if (!remainder.isZero())
173
    {
174
20
      sig = sig.setBit(0, true);
175
    }
176
177
    // Build an exact float
178
28
    FloatingPointSize exactFormat(expBits, sigBits);
179
180
    // A small subtlety... if the format has expBits the unpacked format
181
    // may have more to allow subnormals to be normalised.
182
    // Thus...
183
    uint32_t extension =
184
28
        FloatingPointLiteral::getUnpackedExponentWidth(exactFormat) - expBits;
185
186
    FloatingPointLiteral exactFloat(
187
56
        exactFormat, negative, exactExp.signExtend(extension), sig);
188
189
    // Then cast...
190
28
    d_fpl.reset(new FloatingPointLiteral(exactFloat.convert(size, rm)));
191
  }
192
38
}
193
194
4847
FloatingPoint::~FloatingPoint()
195
{
196
4847
}
197
198
6387
const FloatingPointSize& FloatingPoint::getSize() const
199
{
200
6387
  return d_fpl->getSize();
201
}
202
203
30
FloatingPoint FloatingPoint::makeNaN(const FloatingPointSize& size)
204
{
205
  return FloatingPoint(new FloatingPointLiteral(
206
30
      size, FloatingPointLiteral::SpecialConstKind::FPNAN));
207
}
208
209
72
FloatingPoint FloatingPoint::makeInf(const FloatingPointSize& size, bool sign)
210
{
211
  return FloatingPoint(new FloatingPointLiteral(
212
72
      size, FloatingPointLiteral::SpecialConstKind::FPINF, sign));
213
}
214
215
29
FloatingPoint FloatingPoint::makeZero(const FloatingPointSize& size, bool sign)
216
{
217
  return FloatingPoint(new FloatingPointLiteral(
218
29
      size, FloatingPointLiteral::SpecialConstKind::FPZERO, sign));
219
}
220
221
16
FloatingPoint FloatingPoint::makeMinSubnormal(const FloatingPointSize& size,
222
                                              bool sign)
223
{
224
32
  BitVector bvsign = sign ? BitVector::mkOne(1) : BitVector::mkZero(1);
225
32
  BitVector bvexp = BitVector::mkZero(size.packedExponentWidth());
226
32
  BitVector bvsig = BitVector::mkOne(size.packedSignificandWidth());
227
32
  return FloatingPoint(size, bvsign.concat(bvexp).concat(bvsig));
228
}
229
230
16
FloatingPoint FloatingPoint::makeMaxSubnormal(const FloatingPointSize& size,
231
                                              bool sign)
232
{
233
32
  BitVector bvsign = sign ? BitVector::mkOne(1) : BitVector::mkZero(1);
234
32
  BitVector bvexp = BitVector::mkZero(size.packedExponentWidth());
235
32
  BitVector bvsig = BitVector::mkOnes(size.packedSignificandWidth());
236
32
  return FloatingPoint(size, bvsign.concat(bvexp).concat(bvsig));
237
}
238
239
16
FloatingPoint FloatingPoint::makeMinNormal(const FloatingPointSize& size,
240
                                           bool sign)
241
{
242
32
  BitVector bvsign = sign ? BitVector::mkOne(1) : BitVector::mkZero(1);
243
32
  BitVector bvexp = BitVector::mkOne(size.packedExponentWidth());
244
32
  BitVector bvsig = BitVector::mkZero(size.packedSignificandWidth());
245
32
  return FloatingPoint(size, bvsign.concat(bvexp).concat(bvsig));
246
}
247
248
16
FloatingPoint FloatingPoint::makeMaxNormal(const FloatingPointSize& size,
249
                                           bool sign)
250
{
251
32
  BitVector bvsign = sign ? BitVector::mkOne(1) : BitVector::mkZero(1);
252
32
  BitVector bvexp = BitVector::mkOnes(size.packedExponentWidth());
253
16
  bvexp.setBit(0, false);
254
32
  BitVector bvsig = BitVector::mkOnes(size.packedSignificandWidth());
255
32
  return FloatingPoint(size, bvsign.concat(bvexp).concat(bvsig));
256
}
257
258
13
FloatingPoint FloatingPoint::absolute(void) const
259
{
260
13
  return FloatingPoint(new FloatingPointLiteral(d_fpl->absolute()));
261
}
262
263
129
FloatingPoint FloatingPoint::negate(void) const
264
{
265
129
  return FloatingPoint(new FloatingPointLiteral(d_fpl->negate()));
266
}
267
268
484
FloatingPoint FloatingPoint::add(const RoundingMode& rm,
269
                                 const FloatingPoint& arg) const
270
{
271
484
  return FloatingPoint(new FloatingPointLiteral(d_fpl->add(rm, *arg.d_fpl)));
272
}
273
274
FloatingPoint FloatingPoint::sub(const RoundingMode& rm,
275
                                 const FloatingPoint& arg) const
276
{
277
  return FloatingPoint(new FloatingPointLiteral(d_fpl->sub(rm, *arg.d_fpl)));
278
}
279
280
145
FloatingPoint FloatingPoint::mult(const RoundingMode& rm,
281
                                  const FloatingPoint& arg) const
282
{
283
145
  return FloatingPoint(new FloatingPointLiteral(d_fpl->mult(rm, *arg.d_fpl)));
284
}
285
286
FloatingPoint FloatingPoint::fma(const RoundingMode& rm,
287
                                 const FloatingPoint& arg1,
288
                                 const FloatingPoint& arg2) const
289
{
290
  return FloatingPoint(
291
      new FloatingPointLiteral(d_fpl->fma(rm, *arg1.d_fpl, *arg2.d_fpl)));
292
}
293
294
196
FloatingPoint FloatingPoint::div(const RoundingMode& rm,
295
                                 const FloatingPoint& arg) const
296
{
297
196
  return FloatingPoint(new FloatingPointLiteral(d_fpl->div(rm, *arg.d_fpl)));
298
}
299
300
30
FloatingPoint FloatingPoint::sqrt(const RoundingMode& rm) const
301
{
302
30
  return FloatingPoint(new FloatingPointLiteral(d_fpl->sqrt(rm)));
303
}
304
305
140
FloatingPoint FloatingPoint::rti(const RoundingMode& rm) const
306
{
307
140
  return FloatingPoint(new FloatingPointLiteral(d_fpl->rti(rm)));
308
}
309
310
109
FloatingPoint FloatingPoint::rem(const FloatingPoint& arg) const
311
{
312
109
  return FloatingPoint(new FloatingPointLiteral(d_fpl->rem(*arg.d_fpl)));
313
}
314
315
216
FloatingPoint FloatingPoint::maxTotal(const FloatingPoint& arg,
316
                                      bool zeroCaseLeft) const
317
{
318
  return FloatingPoint(
319
216
      new FloatingPointLiteral(d_fpl->maxTotal(*arg.d_fpl, zeroCaseLeft)));
320
}
321
322
FloatingPoint FloatingPoint::minTotal(const FloatingPoint& arg,
323
                                      bool zeroCaseLeft) const
324
{
325
  return FloatingPoint(
326
      new FloatingPointLiteral(d_fpl->minTotal(*arg.d_fpl, zeroCaseLeft)));
327
}
328
329
108
FloatingPoint::PartialFloatingPoint FloatingPoint::max(
330
    const FloatingPoint& arg) const
331
{
332
216
  FloatingPoint tmp(maxTotal(arg, true));
333
216
  return PartialFloatingPoint(tmp, tmp == maxTotal(arg, false));
334
}
335
336
FloatingPoint::PartialFloatingPoint FloatingPoint::min(
337
    const FloatingPoint& arg) const
338
{
339
  FloatingPoint tmp(minTotal(arg, true));
340
  return PartialFloatingPoint(tmp, tmp == minTotal(arg, false));
341
}
342
343
2437
bool FloatingPoint::operator==(const FloatingPoint& fp) const
344
{
345
2437
  return *d_fpl == *fp.d_fpl;
346
}
347
348
10
bool FloatingPoint::operator<=(const FloatingPoint& fp) const
349
{
350
10
  return *d_fpl <= *fp.d_fpl;
351
}
352
353
2
bool FloatingPoint::operator<(const FloatingPoint& fp) const
354
{
355
2
  return *d_fpl < *fp.d_fpl;
356
}
357
358
6
BitVector FloatingPoint::getExponent() const { return d_fpl->getExponent(); }
359
360
6
BitVector FloatingPoint::getSignificand() const
361
{
362
6
  return d_fpl->getSignificand();
363
}
364
365
6
bool FloatingPoint::getSign() const { return d_fpl->getSign(); }
366
367
39
bool FloatingPoint::isNormal(void) const { return d_fpl->isNormal(); }
368
369
39
bool FloatingPoint::isSubnormal(void) const { return d_fpl->isSubnormal(); }
370
371
150
bool FloatingPoint::isZero(void) const { return d_fpl->isZero(); }
372
373
19
bool FloatingPoint::isInfinite(void) const { return d_fpl->isInfinite(); }
374
375
41
bool FloatingPoint::isNaN(void) const { return d_fpl->isNaN(); }
376
377
12
bool FloatingPoint::isNegative(void) const { return d_fpl->isNegative(); }
378
379
9
bool FloatingPoint::isPositive(void) const { return d_fpl->isPositive(); }
380
381
19
FloatingPoint FloatingPoint::convert(const FloatingPointSize& target,
382
                                     const RoundingMode& rm) const
383
{
384
19
  return FloatingPoint(new FloatingPointLiteral(d_fpl->convert(target, rm)));
385
}
386
387
BitVector FloatingPoint::convertToBVTotal(BitVectorSize width,
388
                                          const RoundingMode& rm,
389
                                          bool signedBV,
390
                                          BitVector undefinedCase) const
391
{
392
  if (signedBV)
393
  {
394
    return d_fpl->convertToSBVTotal(width, rm, undefinedCase);
395
  }
396
  return d_fpl->convertToUBVTotal(width, rm, undefinedCase);
397
}
398
399
4
Rational FloatingPoint::convertToRationalTotal(Rational undefinedCase) const
400
{
401
8
  PartialRational p(convertToRational());
402
403
8
  return p.second ? p.first : undefinedCase;
404
}
405
406
FloatingPoint::PartialBitVector FloatingPoint::convertToBV(
407
    BitVectorSize width, const RoundingMode& rm, bool signedBV) const
408
{
409
  BitVector tmp(convertToBVTotal(width, rm, signedBV, BitVector(width, 0U)));
410
  BitVector confirm(
411
      convertToBVTotal(width, rm, signedBV, BitVector(width, 1U)));
412
413
  return PartialBitVector(tmp, tmp == confirm);
414
}
415
416
8
FloatingPoint::PartialRational FloatingPoint::convertToRational(void) const
417
{
418
8
  if (isNaN() || isInfinite())
419
  {
420
8
    return PartialRational(Rational(0U, 1U), false);
421
  }
422
  if (isZero())
423
  {
424
    return PartialRational(Rational(0U, 1U), true);
425
  }
426
  Integer sign((d_fpl->getSign()) ? -1 : 1);
427
  Integer exp(
428
      d_fpl->getExponent().toSignedInteger()
429
      - (Integer(d_fpl->getSize().significandWidth()
430
                 - 1)));  // -1 as forcibly normalised into the [1,2) range
431
  Integer significand(d_fpl->getSignificand().toInteger());
432
  Integer signedSignificand(sign * significand);
433
434
  // We only have multiplyByPow(uint32_t) so we can't convert all numbers.
435
  // As we convert Integer -> unsigned int -> uint32_t we need that
436
  // unsigned int is not smaller than uint32_t
437
  static_assert(sizeof(unsigned int) >= sizeof(uint32_t),
438
                "Conversion float -> real could loose data");
439
#ifdef CVC5_ASSERTIONS
440
  // Note that multipling by 2^n requires n bits of space (worst case)
441
  // so, in effect, these tests limit us to cases where the resultant
442
  // number requires up to 2^32 bits = 512 megabyte to represent.
443
  Integer shiftLimit(std::numeric_limits<uint32_t>::max());
444
#endif
445
446
  if (!(exp.strictlyNegative()))
447
  {
448
    Assert(exp <= shiftLimit);
449
    Integer r(signedSignificand.multiplyByPow2(exp.toUnsignedInt()));
450
    return PartialRational(Rational(r), true);
451
  }
452
  Integer one(1U);
453
  Assert((-exp) <= shiftLimit);
454
  Integer q(one.multiplyByPow2((-exp).toUnsignedInt()));
455
  Rational r(signedSignificand, q);
456
  return PartialRational(r, true);
457
}
458
459
3159
BitVector FloatingPoint::pack(void) const { return d_fpl->pack(); }
460
461
2
std::string FloatingPoint::toString(bool printAsIndexed) const
462
{
463
2
  std::string str;
464
  // retrive BV value
465
4
  BitVector bv(pack());
466
  uint32_t largestSignificandBit =
467
2
      getSize().significandWidth() - 2;  // -1 for -inclusive, -1 for hidden
468
  uint32_t largestExponentBit =
469
2
      (getSize().exponentWidth() - 1) + (largestSignificandBit + 1);
470
4
  BitVector v[3];
471
2
  v[0] = bv.extract(largestExponentBit + 1, largestExponentBit + 1);
472
2
  v[1] = bv.extract(largestExponentBit, largestSignificandBit + 1);
473
2
  v[2] = bv.extract(largestSignificandBit, 0);
474
2
  str.append("(fp ");
475
8
  for (uint32_t i = 0; i < 3; ++i)
476
  {
477
6
    if (printAsIndexed)
478
    {
479
      str.append("(_ bv");
480
      str.append(v[i].getValue().toString());
481
      str.append(" ");
482
      str.append(std::to_string(v[i].getSize()));
483
      str.append(")");
484
    }
485
    else
486
    {
487
6
      str.append("#b");
488
6
      str.append(v[i].toString());
489
    }
490
6
    if (i < 2)
491
    {
492
4
      str.append(" ");
493
    }
494
  }
495
2
  str.append(")");
496
4
  return str;
497
}
498
499
std::ostream& operator<<(std::ostream& os, const FloatingPoint& fp)
500
{
501
  // print in binary notation
502
  return os << fp.toString();
503
}
504
505
std::ostream& operator<<(std::ostream& os, const FloatingPointSize& fps)
506
{
507
  return os << "(_ FloatingPoint " << fps.exponentWidth() << " "
508
            << fps.significandWidth() << ")";
509
}
510
511
std::ostream& operator<<(std::ostream& os, const FloatingPointConvertSort& fpcs)
512
{
513
  return os << "(_ to_fp " << fpcs.getSize().exponentWidth() << " "
514
            << fpcs.getSize().significandWidth() << ")";
515
}
516
29337
}  // namespace cvc5