1 |
|
/****************************************************************************** |
2 |
|
* Top contributors (to current version): |
3 |
|
* Andrew Reynolds, Tim King, Tianyi Liang |
4 |
|
* |
5 |
|
* This file is part of the cvc5 project. |
6 |
|
* |
7 |
|
* Copyright (c) 2009-2021 by the authors listed in the file AUTHORS |
8 |
|
* in the top-level source directory and their institutional affiliations. |
9 |
|
* All rights reserved. See the file COPYING in the top-level source |
10 |
|
* directory for licensing information. |
11 |
|
* **************************************************************************** |
12 |
|
* |
13 |
|
* The string data type. |
14 |
|
*/ |
15 |
|
|
16 |
|
#include "cvc5_public.h" |
17 |
|
|
18 |
|
#ifndef CVC5__UTIL__STRING_H |
19 |
|
#define CVC5__UTIL__STRING_H |
20 |
|
|
21 |
|
#include <iosfwd> |
22 |
|
#include <string> |
23 |
|
#include <vector> |
24 |
|
|
25 |
|
#include "util/rational.h" |
26 |
|
|
27 |
|
namespace cvc5 { |
28 |
|
|
29 |
|
/** The cvc5 string class |
30 |
|
* |
31 |
|
* This data structure is the domain of values for the string type. It can also |
32 |
|
* be used as a generic utility for representing strings. |
33 |
|
*/ |
34 |
4952346 |
class String |
35 |
|
{ |
36 |
|
public: |
37 |
|
/** |
38 |
|
* This is the cardinality of the alphabet that is representable by this |
39 |
|
* class. Notice that this must be greater than or equal to the cardinality |
40 |
|
* of the alphabet that the string theory reasons about. |
41 |
|
* |
42 |
|
* This must be strictly less than std::numeric_limits<unsigned>::max(). |
43 |
|
* |
44 |
|
* As per the SMT-LIB standard for strings, we support the first 3 planes of |
45 |
|
* Unicode characters, where 196608 = 3*16^4. |
46 |
|
*/ |
47 |
4329059 |
static inline unsigned num_codes() { return 196608; } |
48 |
|
/** constructors for String |
49 |
|
* |
50 |
|
* Internally, a cvc5::String is represented by a vector of unsigned |
51 |
|
* integers (d_str) representing the code points of the characters. |
52 |
|
* |
53 |
|
* To build a string from a C++ string, we may process escape sequences |
54 |
|
* according to the SMT-LIB standard. In particular, if useEscSequences is |
55 |
|
* true, we convert unicode escape sequences: |
56 |
|
* \u d_3 d_2 d_1 d_0 |
57 |
|
* \u{d_0} |
58 |
|
* \u{d_1 d_0} |
59 |
|
* \u{d_2 d_1 d_0} |
60 |
|
* \u{d_3 d_2 d_1 d_0} |
61 |
|
* \u{d_4 d_3 d_2 d_1 d_0} |
62 |
|
* where d_0 ... d_4 are hexadecimal digits, to the appropriate character. |
63 |
|
* |
64 |
|
* If useEscSequences is false, then the characters of the constructed |
65 |
|
* cvc5::String correspond one-to-one with the input string. |
66 |
|
*/ |
67 |
583711 |
String() = default; |
68 |
8799 |
explicit String(const std::string& s, bool useEscSequences = false) |
69 |
8799 |
: d_str(toInternal(s, useEscSequences)) |
70 |
|
{ |
71 |
8798 |
} |
72 |
|
explicit String(const std::wstring& s); |
73 |
15292 |
explicit String(const char* s, bool useEscSequences = false) |
74 |
15292 |
: d_str(toInternal(std::string(s), useEscSequences)) |
75 |
|
{ |
76 |
15292 |
} |
77 |
|
explicit String(const std::vector<unsigned>& s); |
78 |
|
|
79 |
608101 |
String& operator=(const String& y) { |
80 |
608101 |
if (this != &y) { |
81 |
608101 |
d_str = y.d_str; |
82 |
|
} |
83 |
608101 |
return *this; |
84 |
|
} |
85 |
|
|
86 |
|
String concat(const String& other) const; |
87 |
|
|
88 |
2492631 |
bool operator==(const String& y) const { return cmp(y) == 0; } |
89 |
65 |
bool operator!=(const String& y) const { return cmp(y) != 0; } |
90 |
4078 |
bool operator<(const String& y) const { return cmp(y) < 0; } |
91 |
|
bool operator>(const String& y) const { return cmp(y) > 0; } |
92 |
|
bool operator<=(const String& y) const { return cmp(y) <= 0; } |
93 |
|
bool operator>=(const String& y) const { return cmp(y) >= 0; } |
94 |
|
|
95 |
|
/** |
96 |
|
* Returns true if this string is equal to y for their first n characters. |
97 |
|
* If n is larger than the length of this string or y, this method returns |
98 |
|
* true if and only if this string is equal to y. |
99 |
|
*/ |
100 |
|
bool strncmp(const String& y, std::size_t n) const; |
101 |
|
/** |
102 |
|
* Returns true if this string is equal to y for their last n characters. |
103 |
|
* Similar to strncmp, if n is larger than the length of this string or y, |
104 |
|
* this method returns true if and only if this string is equal to y. |
105 |
|
*/ |
106 |
|
bool rstrncmp(const String& y, std::size_t n) const; |
107 |
|
|
108 |
|
/* toString |
109 |
|
* Converts this string to a std::string. |
110 |
|
* |
111 |
|
* The unprintable characters are converted to unicode escape sequences as |
112 |
|
* described above. |
113 |
|
* |
114 |
|
* If useEscSequences is false, the string's printable characters are |
115 |
|
* printed as characters. Notice that for all std::string s having only |
116 |
|
* printable characters, we have that |
117 |
|
* cvc5::String( s ).toString() = s. |
118 |
|
*/ |
119 |
|
std::string toString(bool useEscSequences = false) const; |
120 |
|
/* toWString |
121 |
|
* Converts this string to a std::wstring. |
122 |
|
* |
123 |
|
* Unlike toString(), this method uses no escape sequences as both this class |
124 |
|
* and std::wstring use 32bit characters. |
125 |
|
*/ |
126 |
|
std::wstring toWString() const; |
127 |
|
/** is this the empty string? */ |
128 |
189339 |
bool empty() const { return d_str.empty(); } |
129 |
|
/** is less than or equal to string y */ |
130 |
|
bool isLeq(const String& y) const; |
131 |
|
/** Return the length of the string */ |
132 |
18279483 |
std::size_t size() const { return d_str.size(); } |
133 |
|
|
134 |
|
bool isRepeated() const; |
135 |
|
bool tailcmp(const String& y, int& c) const; |
136 |
|
|
137 |
|
/** |
138 |
|
* Return the first position y occurs in this string, or std::string::npos |
139 |
|
* otherwise. |
140 |
|
*/ |
141 |
|
std::size_t find(const String& y, const std::size_t start = 0) const; |
142 |
|
/** |
143 |
|
* Return the first position y occurs in this string searching from the end, |
144 |
|
* or std::string::npos otherwise. |
145 |
|
*/ |
146 |
|
std::size_t rfind(const String& y, const std::size_t start = 0) const; |
147 |
|
/** Returns true if y is a prefix of this */ |
148 |
|
bool hasPrefix(const String& y) const; |
149 |
|
/** Returns true if y is a suffix of this */ |
150 |
|
bool hasSuffix(const String& y) const; |
151 |
|
/** Replace the character at index i in this string with t */ |
152 |
|
String update(std::size_t i, const String& t) const; |
153 |
|
/** Replace the first occurrence of s in this string with t */ |
154 |
|
String replace(const String& s, const String& t) const; |
155 |
|
/** Return the substring of this string starting at index i */ |
156 |
|
String substr(std::size_t i) const; |
157 |
|
/** Return the substring of this string starting at index i with size at most |
158 |
|
* j */ |
159 |
|
String substr(std::size_t i, std::size_t j) const; |
160 |
|
/** Return the prefix of this string of size at most i */ |
161 |
4363 |
String prefix(std::size_t i) const { return substr(0, i); } |
162 |
|
/** Return the suffix of this string of size at most i */ |
163 |
19569 |
String suffix(std::size_t i) const { return substr(size() - i, i); } |
164 |
|
|
165 |
|
/** |
166 |
|
* Checks if there is any overlap between this string and another string. This |
167 |
|
* corresponds to checking whether one string contains the other and whether a |
168 |
|
* substring of one is a prefix of the other and vice-versa. |
169 |
|
* |
170 |
|
* @param y The other string |
171 |
|
* @return True if there is an overlap, false otherwise |
172 |
|
*/ |
173 |
|
bool noOverlapWith(const String& y) const; |
174 |
|
|
175 |
|
/** string overlap |
176 |
|
* |
177 |
|
* if overlap returns m>0, |
178 |
|
* then the maximal suffix of this string that is a prefix of y is of length m. |
179 |
|
* |
180 |
|
* For example, if x is "abcdef", then: |
181 |
|
* x.overlap("defg") = 3 |
182 |
|
* x.overlap("ab") = 0 |
183 |
|
* x.overlap("d") = 0 |
184 |
|
* x.overlap("bcdefdef") = 5 |
185 |
|
*/ |
186 |
|
std::size_t overlap(const String& y) const; |
187 |
|
/** string reverse overlap |
188 |
|
* |
189 |
|
* if roverlap returns m>0, |
190 |
|
* then the maximal prefix of this string that is a suffix of y is of length m. |
191 |
|
* |
192 |
|
* For example, if x is "abcdef", then: |
193 |
|
* x.roverlap("aaabc") = 3 |
194 |
|
* x.roverlap("def") = 0 |
195 |
|
* x.roverlap("d") = 0 |
196 |
|
* x.roverlap("defabcde") = 5 |
197 |
|
* |
198 |
|
* Notice that x.overlap(y) = y.roverlap(x) |
199 |
|
*/ |
200 |
|
std::size_t roverlap(const String& y) const; |
201 |
|
|
202 |
|
/** |
203 |
|
* Returns true if this string corresponds in text to a number, for example |
204 |
|
* this returns true for strings "7", "12", "004", "0" and false for strings |
205 |
|
* "abc", "4a", "-4", "". |
206 |
|
*/ |
207 |
|
bool isNumber() const; |
208 |
|
/** Returns the corresponding rational for the text of this string. */ |
209 |
|
Rational toNumber() const; |
210 |
|
/** Get the unsigned representation (code points) of this string */ |
211 |
260209 |
const std::vector<unsigned>& getVec() const { return d_str; } |
212 |
|
/** |
213 |
|
* Get the unsigned (code point) value of the first character in this string |
214 |
|
*/ |
215 |
|
unsigned front() const; |
216 |
|
/** |
217 |
|
* Get the unsigned (code point) value of the last character in this string |
218 |
|
*/ |
219 |
|
unsigned back() const; |
220 |
|
/** is the unsigned a digit? |
221 |
|
* |
222 |
|
* This is true for code points between 48 ('0') and 57 ('9'). |
223 |
|
*/ |
224 |
|
static bool isDigit(unsigned character); |
225 |
|
/** is the unsigned a hexadecimal digit? |
226 |
|
* |
227 |
|
* This is true for code points between 48 ('0') and 57 ('9'), code points |
228 |
|
* between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f). |
229 |
|
*/ |
230 |
|
static bool isHexDigit(unsigned character); |
231 |
|
/** is the unsigned a printable code point? |
232 |
|
* |
233 |
|
* This is true for Unicode 32 (' ') to 126 ('~'). |
234 |
|
*/ |
235 |
|
static bool isPrintable(unsigned character); |
236 |
|
|
237 |
|
/** |
238 |
|
* Returns the maximum length of string representable by this class. |
239 |
|
* Corresponds to the maximum size of d_str. |
240 |
|
*/ |
241 |
|
static size_t maxSize(); |
242 |
|
private: |
243 |
|
/** |
244 |
|
* Helper for toInternal: add character ch to vector vec, storing a string in |
245 |
|
* internal format. This throws an error if ch is not a printable character, |
246 |
|
* since non-printable characters must be escaped in SMT-LIB. |
247 |
|
*/ |
248 |
|
static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec); |
249 |
|
/** |
250 |
|
* Convert the string s to the internal format (vector of code points). |
251 |
|
* The argument useEscSequences is whether to process unicode escape |
252 |
|
* sequences. |
253 |
|
*/ |
254 |
|
static std::vector<unsigned> toInternal(const std::string& s, |
255 |
|
bool useEscSequences); |
256 |
|
|
257 |
|
/** |
258 |
|
* Returns a negative number if *this < y, 0 if *this and y are equal and a |
259 |
|
* positive number if *this > y. |
260 |
|
*/ |
261 |
|
int cmp(const String& y) const; |
262 |
|
|
263 |
|
std::vector<unsigned> d_str; |
264 |
|
}; /* class String */ |
265 |
|
|
266 |
|
namespace strings { |
267 |
|
|
268 |
|
struct StringHashFunction |
269 |
|
{ |
270 |
2565504 |
size_t operator()(const ::cvc5::String& s) const |
271 |
|
{ |
272 |
2565504 |
return std::hash<std::string>()(s.toString()); |
273 |
|
} |
274 |
|
}; /* struct StringHashFunction */ |
275 |
|
|
276 |
|
} // namespace strings |
277 |
|
|
278 |
|
std::ostream& operator<<(std::ostream& os, const String& s); |
279 |
|
|
280 |
|
} // namespace cvc5 |
281 |
|
|
282 |
|
#endif /* CVC5__UTIL__STRING_H */ |