1 |
|
/****************************************************************************** |
2 |
|
* Top contributors (to current version): |
3 |
|
* Andrew Reynolds, Tim King, Tianyi Liang |
4 |
|
* |
5 |
|
* This file is part of the cvc5 project. |
6 |
|
* |
7 |
|
* Copyright (c) 2009-2021 by the authors listed in the file AUTHORS |
8 |
|
* in the top-level source directory and their institutional affiliations. |
9 |
|
* All rights reserved. See the file COPYING in the top-level source |
10 |
|
* directory for licensing information. |
11 |
|
* **************************************************************************** |
12 |
|
* |
13 |
|
* Implementation of the string data type. |
14 |
|
*/ |
15 |
|
|
16 |
|
#include "util/string.h" |
17 |
|
|
18 |
|
#include <algorithm> |
19 |
|
#include <climits> |
20 |
|
#include <iomanip> |
21 |
|
#include <iostream> |
22 |
|
#include <sstream> |
23 |
|
|
24 |
|
#include "base/check.h" |
25 |
|
#include "base/exception.h" |
26 |
|
|
27 |
|
using namespace std; |
28 |
|
|
29 |
|
namespace cvc5 { |
30 |
|
|
31 |
|
static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values."); |
32 |
|
|
33 |
13163 |
String::String(const std::wstring& s) |
34 |
|
{ |
35 |
13163 |
d_str.resize(s.size()); |
36 |
137354 |
for (size_t i = 0, n = s.size(); i < n; ++i) |
37 |
|
{ |
38 |
124191 |
d_str[i] = static_cast<unsigned>(s[i]); |
39 |
|
} |
40 |
13163 |
} |
41 |
|
|
42 |
2918632 |
String::String(const std::vector<unsigned> &s) : d_str(s) |
43 |
|
{ |
44 |
|
#ifdef CVC5_ASSERTIONS |
45 |
7259272 |
for (unsigned u : d_str) |
46 |
|
{ |
47 |
4340640 |
Assert(u < num_codes()); |
48 |
|
} |
49 |
|
#endif |
50 |
2918632 |
} |
51 |
|
|
52 |
2914622 |
int String::cmp(const String &y) const { |
53 |
2914622 |
if (size() != y.size()) { |
54 |
19488 |
return size() < y.size() ? -1 : 1; |
55 |
|
} |
56 |
4826253 |
for (unsigned int i = 0; i < size(); ++i) { |
57 |
1938418 |
if (d_str[i] != y.d_str[i]) { |
58 |
7299 |
unsigned cp = d_str[i]; |
59 |
7299 |
unsigned cpy = y.d_str[i]; |
60 |
7299 |
return cp < cpy ? -1 : 1; |
61 |
|
} |
62 |
|
} |
63 |
2887835 |
return 0; |
64 |
|
} |
65 |
|
|
66 |
24437 |
String String::concat(const String &other) const { |
67 |
48874 |
std::vector<unsigned int> ret_vec(d_str); |
68 |
24437 |
ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end()); |
69 |
48874 |
return String(ret_vec); |
70 |
|
} |
71 |
|
|
72 |
31956 |
bool String::strncmp(const String& y, std::size_t n) const |
73 |
|
{ |
74 |
31956 |
std::size_t b = (size() >= y.size()) ? size() : y.size(); |
75 |
31956 |
std::size_t s = (size() <= y.size()) ? size() : y.size(); |
76 |
31956 |
if (n > s) { |
77 |
|
if (b == s) { |
78 |
|
n = s; |
79 |
|
} else { |
80 |
|
return false; |
81 |
|
} |
82 |
|
} |
83 |
119847 |
for (std::size_t i = 0; i < n; ++i) { |
84 |
92279 |
if (d_str[i] != y.d_str[i]) return false; |
85 |
|
} |
86 |
27568 |
return true; |
87 |
|
} |
88 |
|
|
89 |
27187 |
bool String::rstrncmp(const String& y, std::size_t n) const |
90 |
|
{ |
91 |
27187 |
std::size_t b = (size() >= y.size()) ? size() : y.size(); |
92 |
27187 |
std::size_t s = (size() <= y.size()) ? size() : y.size(); |
93 |
27187 |
if (n > s) { |
94 |
|
if (b == s) { |
95 |
|
n = s; |
96 |
|
} else { |
97 |
|
return false; |
98 |
|
} |
99 |
|
} |
100 |
71462 |
for (std::size_t i = 0; i < n; ++i) { |
101 |
48377 |
if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false; |
102 |
|
} |
103 |
23085 |
return true; |
104 |
|
} |
105 |
|
|
106 |
24812 |
void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str) |
107 |
|
{ |
108 |
|
// if not a printable character |
109 |
24812 |
if (ch > 127 || ch < 32) |
110 |
|
{ |
111 |
2 |
std::stringstream serr; |
112 |
|
serr << "Illegal string character: \"" << ch |
113 |
1 |
<< "\", must use escape sequence"; |
114 |
1 |
throw cvc5::Exception(serr.str()); |
115 |
|
} |
116 |
|
else |
117 |
|
{ |
118 |
24811 |
str.push_back(static_cast<unsigned>(ch)); |
119 |
|
} |
120 |
24811 |
} |
121 |
|
|
122 |
25141 |
std::vector<unsigned> String::toInternal(const std::string& s, |
123 |
|
bool useEscSequences) |
124 |
|
{ |
125 |
25141 |
std::vector<unsigned> str; |
126 |
25141 |
unsigned i = 0; |
127 |
69535 |
while (i < s.size()) |
128 |
|
{ |
129 |
|
// get the current character |
130 |
22198 |
char si = s[i]; |
131 |
43685 |
if (si != '\\' || !useEscSequences) |
132 |
|
{ |
133 |
21488 |
addCharToInternal(si, str); |
134 |
21487 |
++i; |
135 |
21487 |
continue; |
136 |
|
} |
137 |
|
// the vector of characters, in case we fail to read an escape sequence |
138 |
1420 |
std::vector<unsigned> nonEscCache; |
139 |
|
// process the '\' |
140 |
710 |
addCharToInternal(si, nonEscCache); |
141 |
710 |
++i; |
142 |
|
// are we an escape sequence? |
143 |
710 |
bool isEscapeSequence = true; |
144 |
|
// the string corresponding to the hexadecimal code point |
145 |
1420 |
std::stringstream hexString; |
146 |
|
// is the slash followed by a 'u'? Could be last character. |
147 |
710 |
if (i >= s.size() || s[i] != 'u') |
148 |
|
{ |
149 |
180 |
isEscapeSequence = false; |
150 |
|
} |
151 |
|
else |
152 |
|
{ |
153 |
|
// process the 'u' |
154 |
530 |
addCharToInternal(s[i], nonEscCache); |
155 |
530 |
++i; |
156 |
530 |
bool isStart = true; |
157 |
530 |
bool isEnd = false; |
158 |
530 |
bool hasBrace = false; |
159 |
3674 |
while (i < s.size()) |
160 |
|
{ |
161 |
|
// add the next character |
162 |
2090 |
si = s[i]; |
163 |
2090 |
if (isStart) |
164 |
|
{ |
165 |
526 |
isStart = false; |
166 |
|
// possibly read '{' |
167 |
994 |
if (si == '{') |
168 |
|
{ |
169 |
468 |
hasBrace = true; |
170 |
468 |
addCharToInternal(si, nonEscCache); |
171 |
468 |
++i; |
172 |
468 |
continue; |
173 |
|
} |
174 |
|
} |
175 |
1564 |
else if (si == '}') |
176 |
|
{ |
177 |
|
// can only end if we had an open brace and read at least one digit |
178 |
456 |
isEscapeSequence = hasBrace && !hexString.str().empty(); |
179 |
456 |
isEnd = true; |
180 |
456 |
addCharToInternal(si, nonEscCache); |
181 |
456 |
++i; |
182 |
456 |
break; |
183 |
|
} |
184 |
|
// must be a hex digit at this point |
185 |
1166 |
if (!isHexDigit(static_cast<unsigned>(si))) |
186 |
|
{ |
187 |
6 |
isEscapeSequence = false; |
188 |
6 |
break; |
189 |
|
} |
190 |
1160 |
hexString << si; |
191 |
1160 |
addCharToInternal(si, nonEscCache); |
192 |
1160 |
++i; |
193 |
1160 |
if (!hasBrace && hexString.str().size() == 4) |
194 |
|
{ |
195 |
|
// will be finished reading \ u d_3 d_2 d_1 d_0 with no parens |
196 |
54 |
isEnd = true; |
197 |
54 |
break; |
198 |
|
} |
199 |
1106 |
else if (hasBrace && hexString.str().size() > 5) |
200 |
|
{ |
201 |
|
// too many digits enclosed in brace, not an escape sequence |
202 |
2 |
isEscapeSequence = false; |
203 |
2 |
break; |
204 |
|
} |
205 |
|
} |
206 |
530 |
if (!isEnd) |
207 |
|
{ |
208 |
|
// if we were interrupted before ending, then this is not a valid |
209 |
|
// escape sequence |
210 |
20 |
isEscapeSequence = false; |
211 |
|
} |
212 |
|
} |
213 |
710 |
if (isEscapeSequence) |
214 |
|
{ |
215 |
508 |
Assert(!hexString.str().empty() && hexString.str().size() <= 5); |
216 |
|
// Otherwise, we add the escaped character. |
217 |
|
// This is guaranteed not to overflow due to the length of hstr. |
218 |
|
uint32_t val; |
219 |
508 |
hexString >> std::hex >> val; |
220 |
508 |
if (val > num_codes()) |
221 |
|
{ |
222 |
|
// Failed due to being out of range. This can happen for strings of |
223 |
|
// the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexadecimal not |
224 |
|
// in the range [0-2]. |
225 |
2 |
isEscapeSequence = false; |
226 |
|
} |
227 |
|
else |
228 |
|
{ |
229 |
506 |
str.push_back(val); |
230 |
|
} |
231 |
|
} |
232 |
|
// if we did not successfully parse an escape sequence, we add back all |
233 |
|
// characters that we cached |
234 |
710 |
if (!isEscapeSequence) |
235 |
|
{ |
236 |
204 |
str.insert(str.end(), nonEscCache.begin(), nonEscCache.end()); |
237 |
|
} |
238 |
|
} |
239 |
|
#ifdef CVC5_ASSERTIONS |
240 |
47367 |
for (unsigned u : str) |
241 |
|
{ |
242 |
22227 |
Assert(u < num_codes()); |
243 |
|
} |
244 |
|
#endif |
245 |
25140 |
return str; |
246 |
|
} |
247 |
|
|
248 |
75064 |
unsigned String::front() const |
249 |
|
{ |
250 |
75064 |
Assert(!d_str.empty()); |
251 |
75064 |
return d_str.front(); |
252 |
|
} |
253 |
|
|
254 |
37513 |
unsigned String::back() const |
255 |
|
{ |
256 |
37513 |
Assert(!d_str.empty()); |
257 |
37513 |
return d_str.back(); |
258 |
|
} |
259 |
|
|
260 |
2350 |
std::size_t String::overlap(const String &y) const { |
261 |
2350 |
std::size_t i = size() < y.size() ? size() : y.size(); |
262 |
6052 |
for (; i > 0; i--) { |
263 |
4344 |
String s = suffix(i); |
264 |
4344 |
String p = y.prefix(i); |
265 |
2493 |
if (s == p) { |
266 |
642 |
return i; |
267 |
|
} |
268 |
|
} |
269 |
1708 |
return i; |
270 |
|
} |
271 |
|
|
272 |
2195 |
std::size_t String::roverlap(const String &y) const { |
273 |
2195 |
std::size_t i = size() < y.size() ? size() : y.size(); |
274 |
4513 |
for (; i > 0; i--) { |
275 |
2936 |
String s = prefix(i); |
276 |
2936 |
String p = y.suffix(i); |
277 |
1777 |
if (s == p) { |
278 |
618 |
return i; |
279 |
|
} |
280 |
|
} |
281 |
1577 |
return i; |
282 |
|
} |
283 |
|
|
284 |
3016039 |
std::string String::toString(bool useEscSequences) const { |
285 |
6032078 |
std::stringstream str; |
286 |
5724294 |
for (unsigned int i = 0; i < size(); ++i) { |
287 |
|
// we always print backslash as a code point so that it cannot be |
288 |
|
// interpreted as specifying part of a code point, e.g. the string '\' + |
289 |
|
// 'u' + '0' of length three. |
290 |
2708255 |
if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences) |
291 |
|
{ |
292 |
2664474 |
str << static_cast<char>(d_str[i]); |
293 |
|
} |
294 |
|
else |
295 |
|
{ |
296 |
87562 |
std::stringstream ss; |
297 |
43781 |
ss << std::hex << d_str[i]; |
298 |
43781 |
str << "\\u{" << ss.str() << "}"; |
299 |
|
} |
300 |
|
} |
301 |
6032078 |
return str.str(); |
302 |
|
} |
303 |
|
|
304 |
12678 |
std::wstring String::toWString() const |
305 |
|
{ |
306 |
12678 |
std::wstring res(size(), static_cast<wchar_t>(0)); |
307 |
134603 |
for (std::size_t i = 0; i < size(); ++i) |
308 |
|
{ |
309 |
121925 |
res[i] = static_cast<wchar_t>(d_str[i]); |
310 |
|
} |
311 |
12678 |
return res; |
312 |
|
} |
313 |
|
|
314 |
76 |
bool String::isLeq(const String &y) const |
315 |
|
{ |
316 |
145 |
for (unsigned i = 0; i < size(); ++i) |
317 |
|
{ |
318 |
126 |
if (i >= y.size()) |
319 |
|
{ |
320 |
6 |
return false; |
321 |
|
} |
322 |
120 |
unsigned ci = d_str[i]; |
323 |
120 |
unsigned cyi = y.d_str[i]; |
324 |
120 |
if (ci > cyi) |
325 |
|
{ |
326 |
27 |
return false; |
327 |
|
} |
328 |
93 |
if (ci < cyi) |
329 |
|
{ |
330 |
24 |
return true; |
331 |
|
} |
332 |
|
} |
333 |
19 |
return true; |
334 |
|
} |
335 |
|
|
336 |
20 |
bool String::isRepeated() const { |
337 |
20 |
if (size() > 1) { |
338 |
20 |
unsigned int f = d_str[0]; |
339 |
34 |
for (unsigned i = 1; i < size(); ++i) { |
340 |
20 |
if (f != d_str[i]) return false; |
341 |
|
} |
342 |
|
} |
343 |
14 |
return true; |
344 |
|
} |
345 |
|
|
346 |
18 |
bool String::tailcmp(const String &y, int &c) const { |
347 |
18 |
int id_x = size() - 1; |
348 |
18 |
int id_y = y.size() - 1; |
349 |
138 |
while (id_x >= 0 && id_y >= 0) { |
350 |
60 |
if (d_str[id_x] != y.d_str[id_y]) { |
351 |
|
c = id_x; |
352 |
|
return false; |
353 |
|
} |
354 |
60 |
--id_x; |
355 |
60 |
--id_y; |
356 |
|
} |
357 |
18 |
c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1); |
358 |
18 |
return true; |
359 |
|
} |
360 |
|
|
361 |
97424 |
std::size_t String::find(const String &y, const std::size_t start) const { |
362 |
97424 |
if (size() < y.size() + start) return std::string::npos; |
363 |
93822 |
if (y.empty()) return start; |
364 |
93555 |
if (empty()) return std::string::npos; |
365 |
|
|
366 |
|
std::vector<unsigned>::const_iterator itr = std::search( |
367 |
93555 |
d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end()); |
368 |
93555 |
if (itr != d_str.end()) { |
369 |
89472 |
return itr - d_str.begin(); |
370 |
|
} |
371 |
4083 |
return std::string::npos; |
372 |
|
} |
373 |
|
|
374 |
7214 |
std::size_t String::rfind(const String &y, const std::size_t start) const { |
375 |
7214 |
if (size() < y.size() + start) return std::string::npos; |
376 |
5998 |
if (y.empty()) return start; |
377 |
5996 |
if (empty()) return std::string::npos; |
378 |
|
|
379 |
|
std::vector<unsigned>::const_reverse_iterator itr = std::search( |
380 |
5996 |
d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend()); |
381 |
5996 |
if (itr != d_str.rend()) { |
382 |
5337 |
return itr - d_str.rbegin(); |
383 |
|
} |
384 |
659 |
return std::string::npos; |
385 |
|
} |
386 |
|
|
387 |
10298 |
bool String::hasPrefix(const String& y) const |
388 |
|
{ |
389 |
10298 |
size_t s = size(); |
390 |
10298 |
size_t ys = y.size(); |
391 |
10298 |
if (ys > s) |
392 |
|
{ |
393 |
2 |
return false; |
394 |
|
} |
395 |
28077 |
for (size_t i = 0; i < ys; i++) |
396 |
|
{ |
397 |
17931 |
if (d_str[i] != y.d_str[i]) |
398 |
|
{ |
399 |
150 |
return false; |
400 |
|
} |
401 |
|
} |
402 |
10146 |
return true; |
403 |
|
} |
404 |
|
|
405 |
11526 |
bool String::hasSuffix(const String& y) const |
406 |
|
{ |
407 |
11526 |
size_t s = size(); |
408 |
11526 |
size_t ys = y.size(); |
409 |
11526 |
if (ys > s) |
410 |
|
{ |
411 |
2 |
return false; |
412 |
|
} |
413 |
11524 |
size_t idiff = s - ys; |
414 |
34887 |
for (size_t i = 0; i < ys; i++) |
415 |
|
{ |
416 |
23457 |
if (d_str[i + idiff] != y.d_str[i]) |
417 |
|
{ |
418 |
94 |
return false; |
419 |
|
} |
420 |
|
} |
421 |
11430 |
return true; |
422 |
|
} |
423 |
|
|
424 |
92 |
String String::update(std::size_t i, const String& t) const |
425 |
|
{ |
426 |
92 |
if (i < size()) |
427 |
|
{ |
428 |
184 |
std::vector<unsigned> vec(d_str.begin(), d_str.begin() + i); |
429 |
92 |
size_t remNum = size() - i; |
430 |
92 |
size_t tnum = t.d_str.size(); |
431 |
92 |
if (tnum >= remNum) |
432 |
|
{ |
433 |
22 |
vec.insert(vec.end(), t.d_str.begin(), t.d_str.begin() + remNum); |
434 |
|
} |
435 |
|
else |
436 |
|
{ |
437 |
70 |
vec.insert(vec.end(), t.d_str.begin(), t.d_str.end()); |
438 |
70 |
vec.insert(vec.end(), d_str.begin() + i + tnum, d_str.end()); |
439 |
|
} |
440 |
92 |
return String(vec); |
441 |
|
} |
442 |
|
return *this; |
443 |
|
} |
444 |
|
|
445 |
264 |
String String::replace(const String &s, const String &t) const { |
446 |
264 |
std::size_t ret = find(s); |
447 |
264 |
if (ret != std::string::npos) { |
448 |
188 |
std::vector<unsigned> vec; |
449 |
94 |
vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret); |
450 |
94 |
vec.insert(vec.end(), t.d_str.begin(), t.d_str.end()); |
451 |
94 |
vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end()); |
452 |
94 |
return String(vec); |
453 |
|
} else { |
454 |
170 |
return *this; |
455 |
|
} |
456 |
|
} |
457 |
|
|
458 |
23376 |
String String::substr(std::size_t i) const { |
459 |
23376 |
Assert(i <= size()); |
460 |
46752 |
std::vector<unsigned> ret_vec; |
461 |
23376 |
std::vector<unsigned>::const_iterator itr = d_str.begin() + i; |
462 |
23376 |
ret_vec.insert(ret_vec.end(), itr, d_str.end()); |
463 |
46752 |
return String(ret_vec); |
464 |
|
} |
465 |
|
|
466 |
401262 |
String String::substr(std::size_t i, std::size_t j) const { |
467 |
401262 |
Assert(i + j <= size()); |
468 |
802524 |
std::vector<unsigned> ret_vec; |
469 |
401262 |
std::vector<unsigned>::const_iterator itr = d_str.begin() + i; |
470 |
401262 |
ret_vec.insert(ret_vec.end(), itr, itr + j); |
471 |
802524 |
return String(ret_vec); |
472 |
|
} |
473 |
|
|
474 |
195 |
bool String::noOverlapWith(const String& y) const |
475 |
|
{ |
476 |
195 |
return y.find(*this) == std::string::npos |
477 |
123 |
&& this->find(y) == std::string::npos && this->overlap(y) == 0 |
478 |
268 |
&& y.overlap(*this) == 0; |
479 |
|
} |
480 |
|
|
481 |
724 |
bool String::isNumber() const { |
482 |
724 |
if (d_str.empty()) { |
483 |
30 |
return false; |
484 |
|
} |
485 |
1609 |
for (unsigned character : d_str) { |
486 |
1170 |
if (!isDigit(character)) |
487 |
|
{ |
488 |
255 |
return false; |
489 |
|
} |
490 |
|
} |
491 |
439 |
return true; |
492 |
|
} |
493 |
|
|
494 |
2406 |
bool String::isDigit(unsigned character) |
495 |
|
{ |
496 |
|
// '0' to '9' |
497 |
2406 |
return 48 <= character && character <= 57; |
498 |
|
} |
499 |
|
|
500 |
1166 |
bool String::isHexDigit(unsigned character) |
501 |
|
{ |
502 |
|
// '0' to '9' or 'A' to 'F' or 'a' to 'f' |
503 |
1396 |
return isDigit(character) || (65 <= character && character <= 70) |
504 |
1382 |
|| (97 <= character && character <= 102); |
505 |
|
} |
506 |
|
|
507 |
2708255 |
bool String::isPrintable(unsigned character) |
508 |
|
{ |
509 |
|
// Unicode 0x00020 (' ') to 0x0007E ('~') |
510 |
2708255 |
return 32 <= character && character <= 126; |
511 |
|
} |
512 |
|
|
513 |
14655 |
size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); } |
514 |
|
|
515 |
153 |
Rational String::toNumber() const |
516 |
|
{ |
517 |
|
// when smt2 standard for strings is set, this may change, based on the |
518 |
|
// semantics of str.from.int for leading zeros |
519 |
153 |
return Rational(toString()); |
520 |
|
} |
521 |
|
|
522 |
|
std::ostream &operator<<(std::ostream &os, const String &s) { |
523 |
|
return os << "\"" << s.toString() << "\""; |
524 |
|
} |
525 |
|
|
526 |
29511 |
} // namespace cvc5 |