1 |
|
/****************************************************************************** |
2 |
|
* Top contributors (to current version): |
3 |
|
* Andrew Reynolds, Mathias Preiner, Andres Noetzli |
4 |
|
* |
5 |
|
* This file is part of the cvc5 project. |
6 |
|
* |
7 |
|
* Copyright (c) 2009-2021 by the authors listed in the file AUTHORS |
8 |
|
* in the top-level source directory and their institutional affiliations. |
9 |
|
* All rights reserved. See the file COPYING in the top-level source |
10 |
|
* directory for licensing information. |
11 |
|
* **************************************************************************** |
12 |
|
* |
13 |
|
* Implementation of techniques for eliminating regular expressions. |
14 |
|
*/ |
15 |
|
|
16 |
|
#include "theory/strings/regexp_elim.h" |
17 |
|
|
18 |
|
#include "expr/bound_var_manager.h" |
19 |
|
#include "options/strings_options.h" |
20 |
|
#include "proof/proof_node_manager.h" |
21 |
|
#include "theory/rewriter.h" |
22 |
|
#include "theory/strings/regexp_entail.h" |
23 |
|
#include "theory/strings/theory_strings_utils.h" |
24 |
|
#include "util/rational.h" |
25 |
|
#include "util/string.h" |
26 |
|
|
27 |
|
using namespace cvc5::kind; |
28 |
|
|
29 |
|
namespace cvc5 { |
30 |
|
namespace theory { |
31 |
|
namespace strings { |
32 |
|
|
33 |
|
/** |
34 |
|
* Attributes used for constructing unique bound variables. The following |
35 |
|
* attributes are used to construct (deterministic) bound variables for |
36 |
|
* eliminations within eliminateConcat and eliminateStar respectively. |
37 |
|
*/ |
38 |
|
struct ReElimConcatIndexAttributeId |
39 |
|
{ |
40 |
|
}; |
41 |
|
typedef expr::Attribute<ReElimConcatIndexAttributeId, Node> |
42 |
|
ReElimConcatIndexAttribute; |
43 |
|
struct ReElimStarIndexAttributeId |
44 |
|
{ |
45 |
|
}; |
46 |
|
typedef expr::Attribute<ReElimStarIndexAttributeId, Node> |
47 |
|
ReElimStarIndexAttribute; |
48 |
|
|
49 |
15271 |
RegExpElimination::RegExpElimination(bool isAgg, |
50 |
|
ProofNodeManager* pnm, |
51 |
15271 |
context::Context* c) |
52 |
|
: d_isAggressive(isAgg), |
53 |
|
d_pnm(pnm), |
54 |
|
d_epg(pnm == nullptr |
55 |
|
? nullptr |
56 |
15271 |
: new EagerProofGenerator(pnm, c, "RegExpElimination::epg")) |
57 |
|
{ |
58 |
15271 |
} |
59 |
|
|
60 |
144 |
Node RegExpElimination::eliminate(Node atom, bool isAgg) |
61 |
|
{ |
62 |
144 |
Assert(atom.getKind() == STRING_IN_REGEXP); |
63 |
144 |
if (atom[1].getKind() == REGEXP_CONCAT) |
64 |
|
{ |
65 |
98 |
return eliminateConcat(atom, isAgg); |
66 |
|
} |
67 |
46 |
else if (atom[1].getKind() == REGEXP_STAR) |
68 |
|
{ |
69 |
46 |
return eliminateStar(atom, isAgg); |
70 |
|
} |
71 |
|
return Node::null(); |
72 |
|
} |
73 |
|
|
74 |
140 |
TrustNode RegExpElimination::eliminateTrusted(Node atom) |
75 |
|
{ |
76 |
280 |
Node eatom = eliminate(atom, d_isAggressive); |
77 |
140 |
if (!eatom.isNull()) |
78 |
|
{ |
79 |
|
// Currently aggressive doesnt work due to fresh bound variables |
80 |
98 |
if (isProofEnabled() && !d_isAggressive) |
81 |
|
{ |
82 |
8 |
Node eq = atom.eqNode(eatom); |
83 |
8 |
Node aggn = NodeManager::currentNM()->mkConst(d_isAggressive); |
84 |
|
std::shared_ptr<ProofNode> pn = |
85 |
8 |
d_pnm->mkNode(PfRule::RE_ELIM, {}, {atom, aggn}, eq); |
86 |
4 |
d_epg->setProofFor(eq, pn); |
87 |
4 |
return TrustNode::mkTrustRewrite(atom, eatom, d_epg.get()); |
88 |
|
} |
89 |
94 |
return TrustNode::mkTrustRewrite(atom, eatom, nullptr); |
90 |
|
} |
91 |
42 |
return TrustNode::null(); |
92 |
|
} |
93 |
|
|
94 |
98 |
Node RegExpElimination::eliminateConcat(Node atom, bool isAgg) |
95 |
|
{ |
96 |
98 |
NodeManager* nm = NodeManager::currentNM(); |
97 |
98 |
BoundVarManager* bvm = nm->getBoundVarManager(); |
98 |
196 |
Node x = atom[0]; |
99 |
196 |
Node lenx = nm->mkNode(STRING_LENGTH, x); |
100 |
196 |
Node re = atom[1]; |
101 |
196 |
Node zero = nm->mkConst(Rational(0)); |
102 |
196 |
std::vector<Node> children; |
103 |
98 |
utils::getConcat(re, children); |
104 |
|
|
105 |
|
// If it can be reduced to memberships in fixed length regular expressions. |
106 |
|
// This includes concatenations where at most one child is of the form |
107 |
|
// (re.* re.allchar), which we abbreviate _* below, and all other children |
108 |
|
// have a fixed length. |
109 |
|
// The intuition why this is a "non-aggressive" rewrite is that membership |
110 |
|
// into fixed length regular expressions are easy to handle. |
111 |
|
// the index of _* in re |
112 |
98 |
unsigned pivotIndex = 0; |
113 |
98 |
bool hasPivotIndex = false; |
114 |
98 |
bool hasFixedLength = true; |
115 |
196 |
std::vector<Node> childLengths; |
116 |
196 |
std::vector<Node> childLengthsPostPivot; |
117 |
490 |
for (unsigned i = 0, size = children.size(); i < size; i++) |
118 |
|
{ |
119 |
784 |
Node c = children[i]; |
120 |
784 |
Node fl = RegExpEntail::getFixedLengthForRegexp(c); |
121 |
392 |
if (fl.isNull()) |
122 |
|
{ |
123 |
298 |
if (!hasPivotIndex && c.getKind() == REGEXP_STAR |
124 |
273 |
&& c[0].getKind() == REGEXP_SIGMA) |
125 |
|
{ |
126 |
21 |
hasPivotIndex = true; |
127 |
21 |
pivotIndex = i; |
128 |
|
// zero is used in sum below and is used for concat-fixed-len |
129 |
21 |
fl = zero; |
130 |
|
} |
131 |
|
else |
132 |
|
{ |
133 |
85 |
hasFixedLength = false; |
134 |
|
} |
135 |
|
} |
136 |
392 |
if (!fl.isNull()) |
137 |
|
{ |
138 |
307 |
childLengths.push_back(fl); |
139 |
307 |
if (hasPivotIndex) |
140 |
|
{ |
141 |
131 |
childLengthsPostPivot.push_back(fl); |
142 |
|
} |
143 |
|
} |
144 |
|
} |
145 |
98 |
Node lenSum = childLengths.size() > 1 |
146 |
|
? nm->mkNode(PLUS, childLengths) |
147 |
196 |
: (childLengths.empty() ? zero : childLengths[0]); |
148 |
|
// if we have a fixed length |
149 |
98 |
if (hasFixedLength) |
150 |
|
{ |
151 |
29 |
Assert(re.getNumChildren() == children.size()); |
152 |
58 |
std::vector<Node> conc; |
153 |
29 |
conc.push_back(nm->mkNode(hasPivotIndex ? GEQ : EQUAL, lenx, lenSum)); |
154 |
58 |
Node currEnd = zero; |
155 |
204 |
for (unsigned i = 0, size = childLengths.size(); i < size; i++) |
156 |
|
{ |
157 |
175 |
if (hasPivotIndex && i == pivotIndex) |
158 |
|
{ |
159 |
6 |
Node ppSum = childLengthsPostPivot.size() == 1 |
160 |
|
? childLengthsPostPivot[0] |
161 |
12 |
: nm->mkNode(PLUS, childLengthsPostPivot); |
162 |
12 |
currEnd = nm->mkNode(MINUS, lenx, ppSum); |
163 |
|
} |
164 |
|
else |
165 |
|
{ |
166 |
338 |
Node curr = nm->mkNode(STRING_SUBSTR, x, currEnd, childLengths[i]); |
167 |
|
// We do not need to include memberships of the form |
168 |
|
// (str.substr x n 1) in re.allchar |
169 |
|
// since we know that by construction, n < len( x ). |
170 |
169 |
if (re[i].getKind() != REGEXP_SIGMA) |
171 |
|
{ |
172 |
184 |
Node currMem = nm->mkNode(STRING_IN_REGEXP, curr, re[i]); |
173 |
92 |
conc.push_back(currMem); |
174 |
|
} |
175 |
169 |
currEnd = nm->mkNode(PLUS, currEnd, childLengths[i]); |
176 |
169 |
currEnd = Rewriter::rewrite(currEnd); |
177 |
|
} |
178 |
|
} |
179 |
58 |
Node res = nm->mkNode(AND, conc); |
180 |
|
// For example: |
181 |
|
// x in re.++(re.union(re.range("A", "J"), re.range("N", "Z")), "AB") --> |
182 |
|
// len( x ) = 3 ^ |
183 |
|
// substr(x,0,1) in re.union(re.range("A", "J"), re.range("N", "Z")) ^ |
184 |
|
// substr(x,1,2) in "AB" |
185 |
|
// An example with a pivot index: |
186 |
|
// x in re.++( "AB" ++ _* ++ "C" ) --> |
187 |
|
// len( x ) >= 3 ^ |
188 |
|
// substr( x, 0, 2 ) in "AB" ^ |
189 |
|
// substr( x, len( x ) - 1, 1 ) in "C" |
190 |
29 |
return returnElim(atom, res, "concat-fixed-len"); |
191 |
|
} |
192 |
|
|
193 |
|
// memberships of the form x in re.++ * s1 * ... * sn *, where * are |
194 |
|
// any number of repetitions (exact or indefinite) of re.allchar. |
195 |
69 |
Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl; |
196 |
69 |
bool onlySigmasAndConsts = true; |
197 |
138 |
std::vector<Node> sep_children; |
198 |
138 |
std::vector<unsigned> gap_minsize; |
199 |
138 |
std::vector<bool> gap_exact; |
200 |
|
// the first gap is initially strict zero |
201 |
69 |
gap_minsize.push_back(0); |
202 |
69 |
gap_exact.push_back(true); |
203 |
184 |
for (const Node& c : children) |
204 |
|
{ |
205 |
169 |
Trace("re-elim-debug") << " " << c << std::endl; |
206 |
169 |
onlySigmasAndConsts = false; |
207 |
169 |
if (c.getKind() == STRING_TO_REGEXP) |
208 |
|
{ |
209 |
75 |
onlySigmasAndConsts = true; |
210 |
75 |
sep_children.push_back(c[0]); |
211 |
|
// the next gap is initially strict zero |
212 |
75 |
gap_minsize.push_back(0); |
213 |
75 |
gap_exact.push_back(true); |
214 |
|
} |
215 |
94 |
else if (c.getKind() == REGEXP_STAR && c[0].getKind() == REGEXP_SIGMA) |
216 |
|
{ |
217 |
|
// found a gap of any size |
218 |
35 |
onlySigmasAndConsts = true; |
219 |
35 |
gap_exact[gap_exact.size() - 1] = false; |
220 |
|
} |
221 |
59 |
else if (c.getKind() == REGEXP_SIGMA) |
222 |
|
{ |
223 |
|
// add one to the minimum size of the gap |
224 |
5 |
onlySigmasAndConsts = true; |
225 |
5 |
gap_minsize[gap_minsize.size() - 1]++; |
226 |
|
} |
227 |
169 |
if (!onlySigmasAndConsts) |
228 |
|
{ |
229 |
54 |
Trace("re-elim-debug") << "...cannot handle " << c << std::endl; |
230 |
54 |
break; |
231 |
|
} |
232 |
|
} |
233 |
|
// we should always rewrite concatenations that are purely re.allchar |
234 |
|
// and re.*( re.allchar ). |
235 |
69 |
Assert(!onlySigmasAndConsts || !sep_children.empty()); |
236 |
69 |
if (onlySigmasAndConsts && !sep_children.empty()) |
237 |
|
{ |
238 |
15 |
bool canProcess = true; |
239 |
15 |
std::vector<Node> conj; |
240 |
|
// The following constructs a set of constraints that encodes that a |
241 |
|
// set of string terms are found, in order, in string x. |
242 |
|
// prev_end stores the current (symbolic) index in x that we are |
243 |
|
// searching. |
244 |
15 |
Node prev_end = zero; |
245 |
|
// the symbolic index we start searching, for each child in sep_children. |
246 |
15 |
std::vector<Node> prev_ends; |
247 |
15 |
unsigned gap_minsize_end = gap_minsize.back(); |
248 |
15 |
bool gap_exact_end = gap_exact.back(); |
249 |
15 |
std::vector<Node> non_greedy_find_vars; |
250 |
60 |
for (unsigned i = 0, size = sep_children.size(); i < size; i++) |
251 |
|
{ |
252 |
45 |
if (gap_minsize[i] > 0) |
253 |
|
{ |
254 |
|
// the gap to this child is at least gap_minsize[i] |
255 |
5 |
prev_end = |
256 |
10 |
nm->mkNode(PLUS, prev_end, nm->mkConst(Rational(gap_minsize[i]))); |
257 |
|
} |
258 |
45 |
prev_ends.push_back(prev_end); |
259 |
90 |
Node sc = sep_children[i]; |
260 |
90 |
Node lensc = nm->mkNode(STRING_LENGTH, sc); |
261 |
45 |
if (gap_exact[i]) |
262 |
|
{ |
263 |
|
// if the gap is exact, it is a substring constraint |
264 |
30 |
Node curr = prev_end; |
265 |
30 |
Node ss = nm->mkNode(STRING_SUBSTR, x, curr, lensc); |
266 |
15 |
conj.push_back(ss.eqNode(sc)); |
267 |
15 |
prev_end = nm->mkNode(PLUS, curr, lensc); |
268 |
|
} |
269 |
|
else |
270 |
|
{ |
271 |
|
// otherwise, we can use indexof to represent some next occurrence |
272 |
30 |
if (gap_exact[i + 1] && i + 1 != size) |
273 |
|
{ |
274 |
|
if (!isAgg) |
275 |
|
{ |
276 |
|
canProcess = false; |
277 |
|
break; |
278 |
|
} |
279 |
|
// if the gap after this one is strict, we need a non-greedy find |
280 |
|
// thus, we add a symbolic constant |
281 |
|
Node cacheVal = |
282 |
|
BoundVarManager::getCacheValue(atom, nm->mkConst(Rational(i))); |
283 |
|
TypeNode intType = nm->integerType(); |
284 |
|
Node k = |
285 |
|
bvm->mkBoundVar<ReElimConcatIndexAttribute>(cacheVal, intType); |
286 |
|
non_greedy_find_vars.push_back(k); |
287 |
|
prev_end = nm->mkNode(PLUS, prev_end, k); |
288 |
|
} |
289 |
60 |
Node curr = nm->mkNode(STRING_INDEXOF, x, sc, prev_end); |
290 |
60 |
Node idofFind = curr.eqNode(nm->mkConst(Rational(-1))).negate(); |
291 |
30 |
conj.push_back(idofFind); |
292 |
30 |
prev_end = nm->mkNode(PLUS, curr, lensc); |
293 |
|
} |
294 |
|
} |
295 |
|
|
296 |
15 |
if (canProcess) |
297 |
|
{ |
298 |
|
// since sep_children is non-empty, conj is non-empty |
299 |
15 |
Assert(!conj.empty()); |
300 |
|
// Process the last gap, if necessary. |
301 |
|
// Notice that if the last gap is not exact and its minsize is zero, |
302 |
|
// then the last indexof/substr constraint entails the following |
303 |
|
// constraint, so it is not necessary to add. |
304 |
|
// Below, we may write "A" for (str.to.re "A") and _ for re.allchar: |
305 |
30 |
Node cEnd = nm->mkConst(Rational(gap_minsize_end)); |
306 |
15 |
if (gap_exact_end) |
307 |
|
{ |
308 |
10 |
Assert(!sep_children.empty()); |
309 |
|
// if it is strict, it corresponds to a substr case. |
310 |
|
// For example: |
311 |
|
// x in (re.++ "A" (re.* _) "B" _ _) ---> |
312 |
|
// ... ^ "B" = substr( x, len( x ) - 3, 1 ) ^ ... |
313 |
20 |
Node sc = sep_children.back(); |
314 |
20 |
Node lenSc = nm->mkNode(STRING_LENGTH, sc); |
315 |
20 |
Node loc = nm->mkNode(MINUS, lenx, nm->mkNode(PLUS, lenSc, cEnd)); |
316 |
20 |
Node scc = sc.eqNode(nm->mkNode(STRING_SUBSTR, x, loc, lenSc)); |
317 |
|
// We also must ensure that we fit. This constraint is necessary in |
318 |
|
// addition to the constraint above. Take this example: |
319 |
|
// x in (re.++ "A" _ (re.* _) "B" _) ---> |
320 |
|
// substr( x, 0, 1 ) = "A" ^ // find "A" |
321 |
|
// indexof( x, "B", 2 ) != -1 ^ // find "B" >=1 after "A" |
322 |
|
// substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2. |
323 |
|
// indexof( x, "B", 2 ) <= len( x ) - 2 |
324 |
|
// The last constaint ensures that the second and third constraints |
325 |
|
// may refer to the same "B". If it were not for the last constraint, it |
326 |
|
// would have been the case than "ABB" would be a model for x, where |
327 |
|
// the second constraint refers to the third position, and the third |
328 |
|
// constraint refers to the second position. |
329 |
|
// |
330 |
|
// With respect to the above example, the following is an optimization. |
331 |
|
// For that example, we instead produce: |
332 |
|
// x in (re.++ "A" _ (re.* _) "B" _) ---> |
333 |
|
// substr( x, 0, 1 ) = "A" ^ // find "A" |
334 |
|
// substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2 |
335 |
|
// 2 <= len( x ) - 2 |
336 |
|
// The intuition is that above, there are two constraints that insist |
337 |
|
// that "B" is found, whereas we only need one. The last constraint |
338 |
|
// above says that the "B" we find at end-2 can be found >=1 after |
339 |
|
// the "A". |
340 |
10 |
conj.pop_back(); |
341 |
20 |
Node fit = nm->mkNode(gap_exact[sep_children.size() - 1] ? EQUAL : LEQ, |
342 |
10 |
prev_ends.back(), |
343 |
50 |
loc); |
344 |
|
|
345 |
10 |
conj.push_back(scc); |
346 |
10 |
conj.push_back(fit); |
347 |
|
} |
348 |
5 |
else if (gap_minsize_end > 0) |
349 |
|
{ |
350 |
|
// if it is non-strict, we are in a "greedy find" situtation where |
351 |
|
// we just need to ensure that the next occurrence fits. |
352 |
|
// For example: |
353 |
|
// x in (re.++ "A" (re.* _) "B" _ _ (re.* _)) ---> |
354 |
|
// ... ^ indexof( x, "B", 1 ) + 2 <= len( x ) |
355 |
|
Node fit = nm->mkNode(LEQ, nm->mkNode(PLUS, prev_end, cEnd), lenx); |
356 |
|
conj.push_back(fit); |
357 |
|
} |
358 |
30 |
Node res = nm->mkAnd(conj); |
359 |
|
// process the non-greedy find variables |
360 |
15 |
if (!non_greedy_find_vars.empty()) |
361 |
|
{ |
362 |
|
std::vector<Node> children2; |
363 |
|
for (const Node& v : non_greedy_find_vars) |
364 |
|
{ |
365 |
|
Node bound = nm->mkNode( |
366 |
|
AND, nm->mkNode(LEQ, zero, v), nm->mkNode(LT, v, lenx)); |
367 |
|
children2.push_back(bound); |
368 |
|
} |
369 |
|
children2.push_back(res); |
370 |
|
Node body = nm->mkNode(AND, children2); |
371 |
|
Node bvl = nm->mkNode(BOUND_VAR_LIST, non_greedy_find_vars); |
372 |
|
res = utils::mkForallInternal(bvl, body.negate()).negate(); |
373 |
|
} |
374 |
|
// must also give a minimum length requirement |
375 |
15 |
res = nm->mkNode(AND, res, nm->mkNode(GEQ, lenx, lenSum)); |
376 |
|
// Examples of this elimination: |
377 |
|
// x in (re.++ "A" (re.* _) "B" (re.* _)) ---> |
378 |
|
// substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1 |
379 |
|
// x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) ---> |
380 |
|
// indexof(x,"A",0)!=-1 ^ |
381 |
|
// indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^ |
382 |
|
// indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x) ^ |
383 |
|
// len(x) >= 7 |
384 |
|
|
385 |
|
// An example of a non-greedy find: |
386 |
|
// x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) ---> |
387 |
|
// (exists k. 0 <= k < len( x ) ^ |
388 |
|
// indexof( x, "A", k ) != -1 ^ |
389 |
|
// substr( x, indexof( x, "A", k )+2, 1 ) = "B") ^ |
390 |
|
// len(x) >= 3 |
391 |
15 |
return returnElim(atom, res, "concat-with-gaps"); |
392 |
|
} |
393 |
|
} |
394 |
|
|
395 |
54 |
if (!isAgg) |
396 |
|
{ |
397 |
20 |
return Node::null(); |
398 |
|
} |
399 |
|
// only aggressive rewrites below here |
400 |
|
|
401 |
|
// if the first or last child is constant string, we can split the membership |
402 |
|
// into a conjunction of two memberships. |
403 |
68 |
Node sStartIndex = zero; |
404 |
68 |
Node sLength = lenx; |
405 |
68 |
std::vector<Node> sConstraints; |
406 |
68 |
std::vector<Node> rexpElimChildren; |
407 |
34 |
unsigned nchildren = children.size(); |
408 |
34 |
Assert(nchildren > 1); |
409 |
102 |
for (unsigned r = 0; r < 2; r++) |
410 |
|
{ |
411 |
68 |
unsigned index = r == 0 ? 0 : nchildren - 1; |
412 |
136 |
Node c = children[index]; |
413 |
68 |
if (c.getKind() == STRING_TO_REGEXP) |
414 |
|
{ |
415 |
30 |
Assert(children[index + (r == 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP); |
416 |
60 |
Node s = c[0]; |
417 |
60 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
418 |
60 |
Node sss = r == 0 ? zero : nm->mkNode(MINUS, lenx, lens); |
419 |
60 |
Node ss = nm->mkNode(STRING_SUBSTR, x, sss, lens); |
420 |
30 |
sConstraints.push_back(ss.eqNode(s)); |
421 |
30 |
if (r == 0) |
422 |
|
{ |
423 |
26 |
sStartIndex = lens; |
424 |
|
} |
425 |
4 |
else if (r == 1 && sConstraints.size() == 2) |
426 |
|
{ |
427 |
|
// first and last children cannot overlap |
428 |
4 |
Node bound = nm->mkNode(GEQ, sss, sStartIndex); |
429 |
2 |
sConstraints.push_back(bound); |
430 |
|
} |
431 |
30 |
sLength = nm->mkNode(MINUS, sLength, lens); |
432 |
|
} |
433 |
68 |
if (r == 1 && !sConstraints.empty()) |
434 |
|
{ |
435 |
|
// add the middle children |
436 |
30 |
for (unsigned i = 1; i < (nchildren - 1); i++) |
437 |
|
{ |
438 |
2 |
rexpElimChildren.push_back(children[i]); |
439 |
|
} |
440 |
|
} |
441 |
68 |
if (c.getKind() != STRING_TO_REGEXP) |
442 |
|
{ |
443 |
38 |
rexpElimChildren.push_back(c); |
444 |
|
} |
445 |
|
} |
446 |
34 |
if (!sConstraints.empty()) |
447 |
|
{ |
448 |
56 |
Node ss = nm->mkNode(STRING_SUBSTR, x, sStartIndex, sLength); |
449 |
28 |
Assert(!rexpElimChildren.empty()); |
450 |
56 |
Node regElim = utils::mkConcat(rexpElimChildren, nm->regExpType()); |
451 |
28 |
sConstraints.push_back(nm->mkNode(STRING_IN_REGEXP, ss, regElim)); |
452 |
56 |
Node ret = nm->mkNode(AND, sConstraints); |
453 |
|
// e.g. |
454 |
|
// x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R |
455 |
28 |
return returnElim(atom, ret, "concat-splice"); |
456 |
|
} |
457 |
6 |
Assert(nchildren > 1); |
458 |
12 |
for (unsigned i = 0; i < nchildren; i++) |
459 |
|
{ |
460 |
12 |
if (children[i].getKind() == STRING_TO_REGEXP) |
461 |
|
{ |
462 |
12 |
Node s = children[i][0]; |
463 |
12 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
464 |
|
// there exists an index in this string such that the substring is this |
465 |
12 |
Node k; |
466 |
12 |
std::vector<Node> echildren; |
467 |
6 |
if (i == 0) |
468 |
|
{ |
469 |
|
k = zero; |
470 |
|
} |
471 |
6 |
else if (i + 1 == nchildren) |
472 |
|
{ |
473 |
|
k = nm->mkNode(MINUS, lenx, lens); |
474 |
|
} |
475 |
|
else |
476 |
|
{ |
477 |
|
Node cacheVal = |
478 |
12 |
BoundVarManager::getCacheValue(atom, nm->mkConst(Rational(i))); |
479 |
12 |
TypeNode intType = nm->integerType(); |
480 |
6 |
k = bvm->mkBoundVar<ReElimConcatIndexAttribute>(cacheVal, intType); |
481 |
|
Node bound = |
482 |
|
nm->mkNode(AND, |
483 |
12 |
nm->mkNode(LEQ, zero, k), |
484 |
24 |
nm->mkNode(LEQ, k, nm->mkNode(MINUS, lenx, lens))); |
485 |
6 |
echildren.push_back(bound); |
486 |
|
} |
487 |
12 |
Node substrEq = nm->mkNode(STRING_SUBSTR, x, k, lens).eqNode(s); |
488 |
6 |
echildren.push_back(substrEq); |
489 |
6 |
if (i > 0) |
490 |
|
{ |
491 |
12 |
std::vector<Node> rprefix; |
492 |
6 |
rprefix.insert(rprefix.end(), children.begin(), children.begin() + i); |
493 |
12 |
Node rpn = utils::mkConcat(rprefix, nm->regExpType()); |
494 |
|
Node substrPrefix = nm->mkNode( |
495 |
12 |
STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, zero, k), rpn); |
496 |
6 |
echildren.push_back(substrPrefix); |
497 |
|
} |
498 |
6 |
if (i + 1 < nchildren) |
499 |
|
{ |
500 |
12 |
std::vector<Node> rsuffix; |
501 |
6 |
rsuffix.insert(rsuffix.end(), children.begin() + i + 1, children.end()); |
502 |
12 |
Node rps = utils::mkConcat(rsuffix, nm->regExpType()); |
503 |
12 |
Node ks = nm->mkNode(PLUS, k, lens); |
504 |
|
Node substrSuffix = nm->mkNode( |
505 |
|
STRING_IN_REGEXP, |
506 |
12 |
nm->mkNode(STRING_SUBSTR, x, ks, nm->mkNode(MINUS, lenx, ks)), |
507 |
24 |
rps); |
508 |
6 |
echildren.push_back(substrSuffix); |
509 |
|
} |
510 |
12 |
Node body = nm->mkNode(AND, echildren); |
511 |
6 |
if (k.getKind() == BOUND_VARIABLE) |
512 |
|
{ |
513 |
12 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, k); |
514 |
6 |
body = utils::mkForallInternal(bvl, body.negate()).negate(); |
515 |
|
} |
516 |
|
// e.g. x in re.++( R1, "AB", R2 ) ---> |
517 |
|
// exists k. |
518 |
|
// 0 <= k <= (len(x)-2) ^ |
519 |
|
// substr( x, k, 2 ) = "AB" ^ |
520 |
|
// substr( x, 0, k ) in R1 ^ |
521 |
|
// substr( x, k+2, len(x)-(k+2) ) in R2 |
522 |
6 |
return returnElim(atom, body, "concat-find"); |
523 |
|
} |
524 |
|
} |
525 |
|
return Node::null(); |
526 |
|
} |
527 |
|
|
528 |
46 |
Node RegExpElimination::eliminateStar(Node atom, bool isAgg) |
529 |
|
{ |
530 |
46 |
if (!isAgg) |
531 |
|
{ |
532 |
14 |
return Node::null(); |
533 |
|
} |
534 |
|
// only aggressive rewrites below here |
535 |
|
|
536 |
32 |
NodeManager* nm = NodeManager::currentNM(); |
537 |
32 |
BoundVarManager* bvm = nm->getBoundVarManager(); |
538 |
64 |
Node x = atom[0]; |
539 |
64 |
Node lenx = nm->mkNode(STRING_LENGTH, x); |
540 |
64 |
Node re = atom[1]; |
541 |
64 |
Node zero = nm->mkConst(Rational(0)); |
542 |
|
// for regular expression star, |
543 |
|
// if the period is a fixed constant, we can turn it into a bounded |
544 |
|
// quantifier |
545 |
64 |
std::vector<Node> disj; |
546 |
32 |
if (re[0].getKind() == REGEXP_UNION) |
547 |
|
{ |
548 |
12 |
for (const Node& r : re[0]) |
549 |
|
{ |
550 |
8 |
disj.push_back(r); |
551 |
|
} |
552 |
|
} |
553 |
|
else |
554 |
|
{ |
555 |
28 |
disj.push_back(re[0]); |
556 |
|
} |
557 |
32 |
bool lenOnePeriod = true; |
558 |
64 |
std::vector<Node> char_constraints; |
559 |
64 |
TypeNode intType = nm->integerType(); |
560 |
64 |
Node index = bvm->mkBoundVar<ReElimStarIndexAttribute>(atom, intType); |
561 |
|
Node substr_ch = |
562 |
64 |
nm->mkNode(STRING_SUBSTR, x, index, nm->mkConst(Rational(1))); |
563 |
32 |
substr_ch = Rewriter::rewrite(substr_ch); |
564 |
|
// handle the case where it is purely characters |
565 |
58 |
for (const Node& r : disj) |
566 |
|
{ |
567 |
36 |
Assert(r.getKind() != REGEXP_UNION); |
568 |
36 |
Assert(r.getKind() != REGEXP_SIGMA); |
569 |
36 |
lenOnePeriod = false; |
570 |
|
// lenOnePeriod is true if this regular expression is a single character |
571 |
|
// regular expression |
572 |
36 |
if (r.getKind() == STRING_TO_REGEXP) |
573 |
|
{ |
574 |
54 |
Node s = r[0]; |
575 |
27 |
if (s.isConst() && s.getConst<String>().size() == 1) |
576 |
|
{ |
577 |
21 |
lenOnePeriod = true; |
578 |
|
} |
579 |
|
} |
580 |
9 |
else if (r.getKind() == REGEXP_RANGE) |
581 |
|
{ |
582 |
5 |
lenOnePeriod = true; |
583 |
|
} |
584 |
36 |
if (!lenOnePeriod) |
585 |
|
{ |
586 |
10 |
break; |
587 |
|
} |
588 |
|
else |
589 |
|
{ |
590 |
52 |
Node regexp_ch = nm->mkNode(STRING_IN_REGEXP, substr_ch, r); |
591 |
26 |
regexp_ch = Rewriter::rewrite(regexp_ch); |
592 |
26 |
Assert(regexp_ch.getKind() != STRING_IN_REGEXP); |
593 |
26 |
char_constraints.push_back(regexp_ch); |
594 |
|
} |
595 |
|
} |
596 |
32 |
if (lenOnePeriod) |
597 |
|
{ |
598 |
22 |
Assert(!char_constraints.empty()); |
599 |
|
Node bound = nm->mkNode( |
600 |
44 |
AND, nm->mkNode(LEQ, zero, index), nm->mkNode(LT, index, lenx)); |
601 |
44 |
Node conc = char_constraints.size() == 1 ? char_constraints[0] |
602 |
66 |
: nm->mkNode(OR, char_constraints); |
603 |
44 |
Node body = nm->mkNode(OR, bound.negate(), conc); |
604 |
44 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, index); |
605 |
44 |
Node res = utils::mkForallInternal(bvl, body); |
606 |
|
// e.g. |
607 |
|
// x in (re.* (re.union "A" "B" )) ---> |
608 |
|
// forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B") |
609 |
22 |
return returnElim(atom, res, "star-char"); |
610 |
|
} |
611 |
|
// otherwise, for stars of constant length these are periodic |
612 |
10 |
if (disj.size() == 1) |
613 |
|
{ |
614 |
10 |
Node r = disj[0]; |
615 |
6 |
if (r.getKind() == STRING_TO_REGEXP) |
616 |
|
{ |
617 |
10 |
Node s = r[0]; |
618 |
6 |
if (s.isConst()) |
619 |
|
{ |
620 |
4 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
621 |
2 |
lens = Rewriter::rewrite(lens); |
622 |
2 |
Assert(lens.isConst()); |
623 |
2 |
Assert(lens.getConst<Rational>().sgn() > 0); |
624 |
4 |
std::vector<Node> conj; |
625 |
|
// lens is a positive constant, so it is safe to use total div/mod here. |
626 |
|
Node bound = nm->mkNode( |
627 |
|
AND, |
628 |
4 |
nm->mkNode(LEQ, zero, index), |
629 |
8 |
nm->mkNode(LT, index, nm->mkNode(INTS_DIVISION_TOTAL, lenx, lens))); |
630 |
|
Node conc = |
631 |
4 |
nm->mkNode(STRING_SUBSTR, x, nm->mkNode(MULT, index, lens), lens) |
632 |
4 |
.eqNode(s); |
633 |
4 |
Node body = nm->mkNode(OR, bound.negate(), conc); |
634 |
4 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, index); |
635 |
4 |
Node res = utils::mkForallInternal(bvl, body); |
636 |
6 |
res = nm->mkNode( |
637 |
4 |
AND, nm->mkNode(INTS_MODULUS_TOTAL, lenx, lens).eqNode(zero), res); |
638 |
|
// e.g. |
639 |
|
// x in ("abc")* ---> |
640 |
|
// forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^ |
641 |
|
// len(x) mod 3 = 0 |
642 |
2 |
return returnElim(atom, res, "star-constant"); |
643 |
|
} |
644 |
|
} |
645 |
|
} |
646 |
8 |
return Node::null(); |
647 |
|
} |
648 |
|
|
649 |
102 |
Node RegExpElimination::returnElim(Node atom, Node atomElim, const char* id) |
650 |
|
{ |
651 |
204 |
Trace("re-elim") << "re-elim: " << atom << " to " << atomElim << " by " << id |
652 |
102 |
<< "." << std::endl; |
653 |
102 |
return atomElim; |
654 |
|
} |
655 |
98 |
bool RegExpElimination::isProofEnabled() const { return d_pnm != nullptr; } |
656 |
|
|
657 |
|
} // namespace strings |
658 |
|
} // namespace theory |
659 |
31125 |
} // namespace cvc5 |