1 |
|
/****************************************************************************** |
2 |
|
* Top contributors (to current version): |
3 |
|
* Andrew Reynolds, Mathias Preiner, Andres Noetzli |
4 |
|
* |
5 |
|
* This file is part of the cvc5 project. |
6 |
|
* |
7 |
|
* Copyright (c) 2009-2021 by the authors listed in the file AUTHORS |
8 |
|
* in the top-level source directory and their institutional affiliations. |
9 |
|
* All rights reserved. See the file COPYING in the top-level source |
10 |
|
* directory for licensing information. |
11 |
|
* **************************************************************************** |
12 |
|
* |
13 |
|
* Implementation of techniques for eliminating regular expressions. |
14 |
|
*/ |
15 |
|
|
16 |
|
#include "theory/strings/regexp_elim.h" |
17 |
|
|
18 |
|
#include "options/strings_options.h" |
19 |
|
#include "proof/proof_node_manager.h" |
20 |
|
#include "theory/rewriter.h" |
21 |
|
#include "theory/strings/regexp_entail.h" |
22 |
|
#include "theory/strings/theory_strings_utils.h" |
23 |
|
#include "util/rational.h" |
24 |
|
#include "util/string.h" |
25 |
|
|
26 |
|
using namespace cvc5::kind; |
27 |
|
|
28 |
|
namespace cvc5 { |
29 |
|
namespace theory { |
30 |
|
namespace strings { |
31 |
|
|
32 |
9913 |
RegExpElimination::RegExpElimination(bool isAgg, |
33 |
|
ProofNodeManager* pnm, |
34 |
9913 |
context::Context* c) |
35 |
|
: d_isAggressive(isAgg), |
36 |
|
d_pnm(pnm), |
37 |
|
d_epg(pnm == nullptr |
38 |
|
? nullptr |
39 |
9913 |
: new EagerProofGenerator(pnm, c, "RegExpElimination::epg")) |
40 |
|
{ |
41 |
9913 |
} |
42 |
|
|
43 |
138 |
Node RegExpElimination::eliminate(Node atom, bool isAgg) |
44 |
|
{ |
45 |
138 |
Assert(atom.getKind() == STRING_IN_REGEXP); |
46 |
138 |
if (atom[1].getKind() == REGEXP_CONCAT) |
47 |
|
{ |
48 |
96 |
return eliminateConcat(atom, isAgg); |
49 |
|
} |
50 |
42 |
else if (atom[1].getKind() == REGEXP_STAR) |
51 |
|
{ |
52 |
42 |
return eliminateStar(atom, isAgg); |
53 |
|
} |
54 |
|
return Node::null(); |
55 |
|
} |
56 |
|
|
57 |
134 |
TrustNode RegExpElimination::eliminateTrusted(Node atom) |
58 |
|
{ |
59 |
268 |
Node eatom = eliminate(atom, d_isAggressive); |
60 |
134 |
if (!eatom.isNull()) |
61 |
|
{ |
62 |
|
// Currently aggressive doesnt work due to fresh bound variables |
63 |
92 |
if (isProofEnabled() && !d_isAggressive) |
64 |
|
{ |
65 |
8 |
Node eq = atom.eqNode(eatom); |
66 |
8 |
Node aggn = NodeManager::currentNM()->mkConst(d_isAggressive); |
67 |
|
std::shared_ptr<ProofNode> pn = |
68 |
8 |
d_pnm->mkNode(PfRule::RE_ELIM, {}, {atom, aggn}, eq); |
69 |
4 |
d_epg->setProofFor(eq, pn); |
70 |
4 |
return TrustNode::mkTrustRewrite(atom, eatom, d_epg.get()); |
71 |
|
} |
72 |
88 |
return TrustNode::mkTrustRewrite(atom, eatom, nullptr); |
73 |
|
} |
74 |
42 |
return TrustNode::null(); |
75 |
|
} |
76 |
|
|
77 |
96 |
Node RegExpElimination::eliminateConcat(Node atom, bool isAgg) |
78 |
|
{ |
79 |
96 |
NodeManager* nm = NodeManager::currentNM(); |
80 |
192 |
Node x = atom[0]; |
81 |
192 |
Node lenx = nm->mkNode(STRING_LENGTH, x); |
82 |
192 |
Node re = atom[1]; |
83 |
192 |
Node zero = nm->mkConst(Rational(0)); |
84 |
192 |
std::vector<Node> children; |
85 |
96 |
utils::getConcat(re, children); |
86 |
|
|
87 |
|
// If it can be reduced to memberships in fixed length regular expressions. |
88 |
|
// This includes concatenations where at most one child is of the form |
89 |
|
// (re.* re.allchar), which we abbreviate _* below, and all other children |
90 |
|
// have a fixed length. |
91 |
|
// The intuition why this is a "non-aggressive" rewrite is that membership |
92 |
|
// into fixed length regular expressions are easy to handle. |
93 |
|
// the index of _* in re |
94 |
96 |
unsigned pivotIndex = 0; |
95 |
96 |
bool hasPivotIndex = false; |
96 |
96 |
bool hasFixedLength = true; |
97 |
192 |
std::vector<Node> childLengths; |
98 |
192 |
std::vector<Node> childLengthsPostPivot; |
99 |
484 |
for (unsigned i = 0, size = children.size(); i < size; i++) |
100 |
|
{ |
101 |
776 |
Node c = children[i]; |
102 |
776 |
Node fl = RegExpEntail::getFixedLengthForRegexp(c); |
103 |
388 |
if (fl.isNull()) |
104 |
|
{ |
105 |
292 |
if (!hasPivotIndex && c.getKind() == REGEXP_STAR |
106 |
267 |
&& c[0].getKind() == REGEXP_SIGMA) |
107 |
|
{ |
108 |
21 |
hasPivotIndex = true; |
109 |
21 |
pivotIndex = i; |
110 |
|
// zero is used in sum below and is used for concat-fixed-len |
111 |
21 |
fl = zero; |
112 |
|
} |
113 |
|
else |
114 |
|
{ |
115 |
83 |
hasFixedLength = false; |
116 |
|
} |
117 |
|
} |
118 |
388 |
if (!fl.isNull()) |
119 |
|
{ |
120 |
305 |
childLengths.push_back(fl); |
121 |
305 |
if (hasPivotIndex) |
122 |
|
{ |
123 |
131 |
childLengthsPostPivot.push_back(fl); |
124 |
|
} |
125 |
|
} |
126 |
|
} |
127 |
96 |
Node lenSum = childLengths.size() > 1 |
128 |
|
? nm->mkNode(PLUS, childLengths) |
129 |
192 |
: (childLengths.empty() ? zero : childLengths[0]); |
130 |
|
// if we have a fixed length |
131 |
96 |
if (hasFixedLength) |
132 |
|
{ |
133 |
29 |
Assert(re.getNumChildren() == children.size()); |
134 |
58 |
std::vector<Node> conc; |
135 |
29 |
conc.push_back(nm->mkNode(hasPivotIndex ? GEQ : EQUAL, lenx, lenSum)); |
136 |
58 |
Node currEnd = zero; |
137 |
204 |
for (unsigned i = 0, size = childLengths.size(); i < size; i++) |
138 |
|
{ |
139 |
175 |
if (hasPivotIndex && i == pivotIndex) |
140 |
|
{ |
141 |
6 |
Node ppSum = childLengthsPostPivot.size() == 1 |
142 |
|
? childLengthsPostPivot[0] |
143 |
12 |
: nm->mkNode(PLUS, childLengthsPostPivot); |
144 |
12 |
currEnd = nm->mkNode(MINUS, lenx, ppSum); |
145 |
|
} |
146 |
|
else |
147 |
|
{ |
148 |
338 |
Node curr = nm->mkNode(STRING_SUBSTR, x, currEnd, childLengths[i]); |
149 |
|
// We do not need to include memberships of the form |
150 |
|
// (str.substr x n 1) in re.allchar |
151 |
|
// since we know that by construction, n < len( x ). |
152 |
169 |
if (re[i].getKind() != REGEXP_SIGMA) |
153 |
|
{ |
154 |
184 |
Node currMem = nm->mkNode(STRING_IN_REGEXP, curr, re[i]); |
155 |
92 |
conc.push_back(currMem); |
156 |
|
} |
157 |
169 |
currEnd = nm->mkNode(PLUS, currEnd, childLengths[i]); |
158 |
169 |
currEnd = Rewriter::rewrite(currEnd); |
159 |
|
} |
160 |
|
} |
161 |
58 |
Node res = nm->mkNode(AND, conc); |
162 |
|
// For example: |
163 |
|
// x in re.++(re.union(re.range("A", "J"), re.range("N", "Z")), "AB") --> |
164 |
|
// len( x ) = 3 ^ |
165 |
|
// substr(x,0,1) in re.union(re.range("A", "J"), re.range("N", "Z")) ^ |
166 |
|
// substr(x,1,2) in "AB" |
167 |
|
// An example with a pivot index: |
168 |
|
// x in re.++( "AB" ++ _* ++ "C" ) --> |
169 |
|
// len( x ) >= 3 ^ |
170 |
|
// substr( x, 0, 2 ) in "AB" ^ |
171 |
|
// substr( x, len( x ) - 1, 1 ) in "C" |
172 |
29 |
return returnElim(atom, res, "concat-fixed-len"); |
173 |
|
} |
174 |
|
|
175 |
|
// memberships of the form x in re.++ * s1 * ... * sn *, where * are |
176 |
|
// any number of repetitions (exact or indefinite) of re.allchar. |
177 |
67 |
Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl; |
178 |
67 |
bool onlySigmasAndConsts = true; |
179 |
134 |
std::vector<Node> sep_children; |
180 |
134 |
std::vector<unsigned> gap_minsize; |
181 |
134 |
std::vector<bool> gap_exact; |
182 |
|
// the first gap is initially strict zero |
183 |
67 |
gap_minsize.push_back(0); |
184 |
67 |
gap_exact.push_back(true); |
185 |
182 |
for (const Node& c : children) |
186 |
|
{ |
187 |
167 |
Trace("re-elim-debug") << " " << c << std::endl; |
188 |
167 |
onlySigmasAndConsts = false; |
189 |
167 |
if (c.getKind() == STRING_TO_REGEXP) |
190 |
|
{ |
191 |
75 |
onlySigmasAndConsts = true; |
192 |
75 |
sep_children.push_back(c[0]); |
193 |
|
// the next gap is initially strict zero |
194 |
75 |
gap_minsize.push_back(0); |
195 |
75 |
gap_exact.push_back(true); |
196 |
|
} |
197 |
92 |
else if (c.getKind() == REGEXP_STAR && c[0].getKind() == REGEXP_SIGMA) |
198 |
|
{ |
199 |
|
// found a gap of any size |
200 |
35 |
onlySigmasAndConsts = true; |
201 |
35 |
gap_exact[gap_exact.size() - 1] = false; |
202 |
|
} |
203 |
57 |
else if (c.getKind() == REGEXP_SIGMA) |
204 |
|
{ |
205 |
|
// add one to the minimum size of the gap |
206 |
5 |
onlySigmasAndConsts = true; |
207 |
5 |
gap_minsize[gap_minsize.size() - 1]++; |
208 |
|
} |
209 |
167 |
if (!onlySigmasAndConsts) |
210 |
|
{ |
211 |
52 |
Trace("re-elim-debug") << "...cannot handle " << c << std::endl; |
212 |
52 |
break; |
213 |
|
} |
214 |
|
} |
215 |
|
// we should always rewrite concatenations that are purely re.allchar |
216 |
|
// and re.*( re.allchar ). |
217 |
67 |
Assert(!onlySigmasAndConsts || !sep_children.empty()); |
218 |
67 |
if (onlySigmasAndConsts && !sep_children.empty()) |
219 |
|
{ |
220 |
15 |
bool canProcess = true; |
221 |
15 |
std::vector<Node> conj; |
222 |
|
// The following constructs a set of constraints that encodes that a |
223 |
|
// set of string terms are found, in order, in string x. |
224 |
|
// prev_end stores the current (symbolic) index in x that we are |
225 |
|
// searching. |
226 |
15 |
Node prev_end = zero; |
227 |
|
// the symbolic index we start searching, for each child in sep_children. |
228 |
15 |
std::vector<Node> prev_ends; |
229 |
15 |
unsigned gap_minsize_end = gap_minsize.back(); |
230 |
15 |
bool gap_exact_end = gap_exact.back(); |
231 |
15 |
std::vector<Node> non_greedy_find_vars; |
232 |
60 |
for (unsigned i = 0, size = sep_children.size(); i < size; i++) |
233 |
|
{ |
234 |
45 |
if (gap_minsize[i] > 0) |
235 |
|
{ |
236 |
|
// the gap to this child is at least gap_minsize[i] |
237 |
5 |
prev_end = |
238 |
10 |
nm->mkNode(PLUS, prev_end, nm->mkConst(Rational(gap_minsize[i]))); |
239 |
|
} |
240 |
45 |
prev_ends.push_back(prev_end); |
241 |
90 |
Node sc = sep_children[i]; |
242 |
90 |
Node lensc = nm->mkNode(STRING_LENGTH, sc); |
243 |
45 |
if (gap_exact[i]) |
244 |
|
{ |
245 |
|
// if the gap is exact, it is a substring constraint |
246 |
30 |
Node curr = prev_end; |
247 |
30 |
Node ss = nm->mkNode(STRING_SUBSTR, x, curr, lensc); |
248 |
15 |
conj.push_back(ss.eqNode(sc)); |
249 |
15 |
prev_end = nm->mkNode(PLUS, curr, lensc); |
250 |
|
} |
251 |
|
else |
252 |
|
{ |
253 |
|
// otherwise, we can use indexof to represent some next occurrence |
254 |
30 |
if (gap_exact[i + 1] && i + 1 != size) |
255 |
|
{ |
256 |
|
if (!isAgg) |
257 |
|
{ |
258 |
|
canProcess = false; |
259 |
|
break; |
260 |
|
} |
261 |
|
// if the gap after this one is strict, we need a non-greedy find |
262 |
|
// thus, we add a symbolic constant |
263 |
|
Node k = nm->mkBoundVar(nm->integerType()); |
264 |
|
non_greedy_find_vars.push_back(k); |
265 |
|
prev_end = nm->mkNode(PLUS, prev_end, k); |
266 |
|
} |
267 |
60 |
Node curr = nm->mkNode(STRING_INDEXOF, x, sc, prev_end); |
268 |
60 |
Node idofFind = curr.eqNode(nm->mkConst(Rational(-1))).negate(); |
269 |
30 |
conj.push_back(idofFind); |
270 |
30 |
prev_end = nm->mkNode(PLUS, curr, lensc); |
271 |
|
} |
272 |
|
} |
273 |
|
|
274 |
15 |
if (canProcess) |
275 |
|
{ |
276 |
|
// since sep_children is non-empty, conj is non-empty |
277 |
15 |
Assert(!conj.empty()); |
278 |
|
// Process the last gap, if necessary. |
279 |
|
// Notice that if the last gap is not exact and its minsize is zero, |
280 |
|
// then the last indexof/substr constraint entails the following |
281 |
|
// constraint, so it is not necessary to add. |
282 |
|
// Below, we may write "A" for (str.to.re "A") and _ for re.allchar: |
283 |
30 |
Node cEnd = nm->mkConst(Rational(gap_minsize_end)); |
284 |
15 |
if (gap_exact_end) |
285 |
|
{ |
286 |
10 |
Assert(!sep_children.empty()); |
287 |
|
// if it is strict, it corresponds to a substr case. |
288 |
|
// For example: |
289 |
|
// x in (re.++ "A" (re.* _) "B" _ _) ---> |
290 |
|
// ... ^ "B" = substr( x, len( x ) - 3, 1 ) ^ ... |
291 |
20 |
Node sc = sep_children.back(); |
292 |
20 |
Node lenSc = nm->mkNode(STRING_LENGTH, sc); |
293 |
20 |
Node loc = nm->mkNode(MINUS, lenx, nm->mkNode(PLUS, lenSc, cEnd)); |
294 |
20 |
Node scc = sc.eqNode(nm->mkNode(STRING_SUBSTR, x, loc, lenSc)); |
295 |
|
// We also must ensure that we fit. This constraint is necessary in |
296 |
|
// addition to the constraint above. Take this example: |
297 |
|
// x in (re.++ "A" _ (re.* _) "B" _) ---> |
298 |
|
// substr( x, 0, 1 ) = "A" ^ // find "A" |
299 |
|
// indexof( x, "B", 2 ) != -1 ^ // find "B" >=1 after "A" |
300 |
|
// substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2. |
301 |
|
// indexof( x, "B", 2 ) <= len( x ) - 2 |
302 |
|
// The last constaint ensures that the second and third constraints |
303 |
|
// may refer to the same "B". If it were not for the last constraint, it |
304 |
|
// would have been the case than "ABB" would be a model for x, where |
305 |
|
// the second constraint refers to the third position, and the third |
306 |
|
// constraint refers to the second position. |
307 |
|
// |
308 |
|
// With respect to the above example, the following is an optimization. |
309 |
|
// For that example, we instead produce: |
310 |
|
// x in (re.++ "A" _ (re.* _) "B" _) ---> |
311 |
|
// substr( x, 0, 1 ) = "A" ^ // find "A" |
312 |
|
// substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2 |
313 |
|
// 2 <= len( x ) - 2 |
314 |
|
// The intuition is that above, there are two constraints that insist |
315 |
|
// that "B" is found, whereas we only need one. The last constraint |
316 |
|
// above says that the "B" we find at end-2 can be found >=1 after |
317 |
|
// the "A". |
318 |
10 |
conj.pop_back(); |
319 |
20 |
Node fit = nm->mkNode(gap_exact[sep_children.size() - 1] ? EQUAL : LEQ, |
320 |
10 |
prev_ends.back(), |
321 |
50 |
loc); |
322 |
|
|
323 |
10 |
conj.push_back(scc); |
324 |
10 |
conj.push_back(fit); |
325 |
|
} |
326 |
5 |
else if (gap_minsize_end > 0) |
327 |
|
{ |
328 |
|
// if it is non-strict, we are in a "greedy find" situtation where |
329 |
|
// we just need to ensure that the next occurrence fits. |
330 |
|
// For example: |
331 |
|
// x in (re.++ "A" (re.* _) "B" _ _ (re.* _)) ---> |
332 |
|
// ... ^ indexof( x, "B", 1 ) + 2 <= len( x ) |
333 |
|
Node fit = nm->mkNode(LEQ, nm->mkNode(PLUS, prev_end, cEnd), lenx); |
334 |
|
conj.push_back(fit); |
335 |
|
} |
336 |
30 |
Node res = nm->mkAnd(conj); |
337 |
|
// process the non-greedy find variables |
338 |
15 |
if (!non_greedy_find_vars.empty()) |
339 |
|
{ |
340 |
|
std::vector<Node> children2; |
341 |
|
for (const Node& v : non_greedy_find_vars) |
342 |
|
{ |
343 |
|
Node bound = nm->mkNode( |
344 |
|
AND, nm->mkNode(LEQ, zero, v), nm->mkNode(LT, v, lenx)); |
345 |
|
children2.push_back(bound); |
346 |
|
} |
347 |
|
children2.push_back(res); |
348 |
|
Node body = nm->mkNode(AND, children2); |
349 |
|
Node bvl = nm->mkNode(BOUND_VAR_LIST, non_greedy_find_vars); |
350 |
|
res = utils::mkForallInternal(bvl, body.negate()).negate(); |
351 |
|
} |
352 |
|
// must also give a minimum length requirement |
353 |
15 |
res = nm->mkNode(AND, res, nm->mkNode(GEQ, lenx, lenSum)); |
354 |
|
// Examples of this elimination: |
355 |
|
// x in (re.++ "A" (re.* _) "B" (re.* _)) ---> |
356 |
|
// substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1 |
357 |
|
// x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) ---> |
358 |
|
// indexof(x,"A",0)!=-1 ^ |
359 |
|
// indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^ |
360 |
|
// indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x) ^ |
361 |
|
// len(x) >= 7 |
362 |
|
|
363 |
|
// An example of a non-greedy find: |
364 |
|
// x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) ---> |
365 |
|
// (exists k. 0 <= k < len( x ) ^ |
366 |
|
// indexof( x, "A", k ) != -1 ^ |
367 |
|
// substr( x, indexof( x, "A", k )+2, 1 ) = "B") ^ |
368 |
|
// len(x) >= 3 |
369 |
15 |
return returnElim(atom, res, "concat-with-gaps"); |
370 |
|
} |
371 |
|
} |
372 |
|
|
373 |
52 |
if (!isAgg) |
374 |
|
{ |
375 |
20 |
return Node::null(); |
376 |
|
} |
377 |
|
// only aggressive rewrites below here |
378 |
|
|
379 |
|
// if the first or last child is constant string, we can split the membership |
380 |
|
// into a conjunction of two memberships. |
381 |
64 |
Node sStartIndex = zero; |
382 |
64 |
Node sLength = lenx; |
383 |
64 |
std::vector<Node> sConstraints; |
384 |
64 |
std::vector<Node> rexpElimChildren; |
385 |
32 |
unsigned nchildren = children.size(); |
386 |
32 |
Assert(nchildren > 1); |
387 |
96 |
for (unsigned r = 0; r < 2; r++) |
388 |
|
{ |
389 |
64 |
unsigned index = r == 0 ? 0 : nchildren - 1; |
390 |
128 |
Node c = children[index]; |
391 |
64 |
if (c.getKind() == STRING_TO_REGEXP) |
392 |
|
{ |
393 |
28 |
Assert(children[index + (r == 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP); |
394 |
56 |
Node s = c[0]; |
395 |
56 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
396 |
56 |
Node sss = r == 0 ? zero : nm->mkNode(MINUS, lenx, lens); |
397 |
56 |
Node ss = nm->mkNode(STRING_SUBSTR, x, sss, lens); |
398 |
28 |
sConstraints.push_back(ss.eqNode(s)); |
399 |
28 |
if (r == 0) |
400 |
|
{ |
401 |
26 |
sStartIndex = lens; |
402 |
|
} |
403 |
2 |
else if (r == 1 && sConstraints.size() == 2) |
404 |
|
{ |
405 |
|
// first and last children cannot overlap |
406 |
4 |
Node bound = nm->mkNode(GEQ, sss, sStartIndex); |
407 |
2 |
sConstraints.push_back(bound); |
408 |
|
} |
409 |
28 |
sLength = nm->mkNode(MINUS, sLength, lens); |
410 |
|
} |
411 |
64 |
if (r == 1 && !sConstraints.empty()) |
412 |
|
{ |
413 |
|
// add the middle children |
414 |
28 |
for (unsigned i = 1; i < (nchildren - 1); i++) |
415 |
|
{ |
416 |
2 |
rexpElimChildren.push_back(children[i]); |
417 |
|
} |
418 |
|
} |
419 |
64 |
if (c.getKind() != STRING_TO_REGEXP) |
420 |
|
{ |
421 |
36 |
rexpElimChildren.push_back(c); |
422 |
|
} |
423 |
|
} |
424 |
32 |
if (!sConstraints.empty()) |
425 |
|
{ |
426 |
52 |
Node ss = nm->mkNode(STRING_SUBSTR, x, sStartIndex, sLength); |
427 |
26 |
Assert(!rexpElimChildren.empty()); |
428 |
52 |
Node regElim = utils::mkConcat(rexpElimChildren, nm->regExpType()); |
429 |
26 |
sConstraints.push_back(nm->mkNode(STRING_IN_REGEXP, ss, regElim)); |
430 |
52 |
Node ret = nm->mkNode(AND, sConstraints); |
431 |
|
// e.g. |
432 |
|
// x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R |
433 |
26 |
return returnElim(atom, ret, "concat-splice"); |
434 |
|
} |
435 |
6 |
Assert(nchildren > 1); |
436 |
12 |
for (unsigned i = 0; i < nchildren; i++) |
437 |
|
{ |
438 |
12 |
if (children[i].getKind() == STRING_TO_REGEXP) |
439 |
|
{ |
440 |
12 |
Node s = children[i][0]; |
441 |
12 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
442 |
|
// there exists an index in this string such that the substring is this |
443 |
12 |
Node k; |
444 |
12 |
std::vector<Node> echildren; |
445 |
6 |
if (i == 0) |
446 |
|
{ |
447 |
|
k = zero; |
448 |
|
} |
449 |
6 |
else if (i + 1 == nchildren) |
450 |
|
{ |
451 |
|
k = nm->mkNode(MINUS, lenx, lens); |
452 |
|
} |
453 |
|
else |
454 |
|
{ |
455 |
6 |
k = nm->mkBoundVar(nm->integerType()); |
456 |
|
Node bound = |
457 |
|
nm->mkNode(AND, |
458 |
12 |
nm->mkNode(LEQ, zero, k), |
459 |
24 |
nm->mkNode(LEQ, k, nm->mkNode(MINUS, lenx, lens))); |
460 |
6 |
echildren.push_back(bound); |
461 |
|
} |
462 |
12 |
Node substrEq = nm->mkNode(STRING_SUBSTR, x, k, lens).eqNode(s); |
463 |
6 |
echildren.push_back(substrEq); |
464 |
6 |
if (i > 0) |
465 |
|
{ |
466 |
12 |
std::vector<Node> rprefix; |
467 |
6 |
rprefix.insert(rprefix.end(), children.begin(), children.begin() + i); |
468 |
12 |
Node rpn = utils::mkConcat(rprefix, nm->regExpType()); |
469 |
|
Node substrPrefix = nm->mkNode( |
470 |
12 |
STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, zero, k), rpn); |
471 |
6 |
echildren.push_back(substrPrefix); |
472 |
|
} |
473 |
6 |
if (i + 1 < nchildren) |
474 |
|
{ |
475 |
12 |
std::vector<Node> rsuffix; |
476 |
6 |
rsuffix.insert(rsuffix.end(), children.begin() + i + 1, children.end()); |
477 |
12 |
Node rps = utils::mkConcat(rsuffix, nm->regExpType()); |
478 |
12 |
Node ks = nm->mkNode(PLUS, k, lens); |
479 |
|
Node substrSuffix = nm->mkNode( |
480 |
|
STRING_IN_REGEXP, |
481 |
12 |
nm->mkNode(STRING_SUBSTR, x, ks, nm->mkNode(MINUS, lenx, ks)), |
482 |
24 |
rps); |
483 |
6 |
echildren.push_back(substrSuffix); |
484 |
|
} |
485 |
12 |
Node body = nm->mkNode(AND, echildren); |
486 |
6 |
if (k.getKind() == BOUND_VARIABLE) |
487 |
|
{ |
488 |
12 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, k); |
489 |
6 |
body = utils::mkForallInternal(bvl, body.negate()).negate(); |
490 |
|
} |
491 |
|
// e.g. x in re.++( R1, "AB", R2 ) ---> |
492 |
|
// exists k. |
493 |
|
// 0 <= k <= (len(x)-2) ^ |
494 |
|
// substr( x, k, 2 ) = "AB" ^ |
495 |
|
// substr( x, 0, k ) in R1 ^ |
496 |
|
// substr( x, k+2, len(x)-(k+2) ) in R2 |
497 |
6 |
return returnElim(atom, body, "concat-find"); |
498 |
|
} |
499 |
|
} |
500 |
|
return Node::null(); |
501 |
|
} |
502 |
|
|
503 |
42 |
Node RegExpElimination::eliminateStar(Node atom, bool isAgg) |
504 |
|
{ |
505 |
42 |
if (!isAgg) |
506 |
|
{ |
507 |
14 |
return Node::null(); |
508 |
|
} |
509 |
|
// only aggressive rewrites below here |
510 |
|
|
511 |
28 |
NodeManager* nm = NodeManager::currentNM(); |
512 |
56 |
Node x = atom[0]; |
513 |
56 |
Node lenx = nm->mkNode(STRING_LENGTH, x); |
514 |
56 |
Node re = atom[1]; |
515 |
56 |
Node zero = nm->mkConst(Rational(0)); |
516 |
|
// for regular expression star, |
517 |
|
// if the period is a fixed constant, we can turn it into a bounded |
518 |
|
// quantifier |
519 |
56 |
std::vector<Node> disj; |
520 |
28 |
if (re[0].getKind() == REGEXP_UNION) |
521 |
|
{ |
522 |
12 |
for (const Node& r : re[0]) |
523 |
|
{ |
524 |
8 |
disj.push_back(r); |
525 |
|
} |
526 |
|
} |
527 |
|
else |
528 |
|
{ |
529 |
24 |
disj.push_back(re[0]); |
530 |
|
} |
531 |
28 |
bool lenOnePeriod = true; |
532 |
56 |
std::vector<Node> char_constraints; |
533 |
56 |
Node index = nm->mkBoundVar(nm->integerType()); |
534 |
|
Node substr_ch = |
535 |
56 |
nm->mkNode(STRING_SUBSTR, x, index, nm->mkConst(Rational(1))); |
536 |
28 |
substr_ch = Rewriter::rewrite(substr_ch); |
537 |
|
// handle the case where it is purely characters |
538 |
50 |
for (const Node& r : disj) |
539 |
|
{ |
540 |
32 |
Assert(r.getKind() != REGEXP_UNION); |
541 |
32 |
Assert(r.getKind() != REGEXP_SIGMA); |
542 |
32 |
lenOnePeriod = false; |
543 |
|
// lenOnePeriod is true if this regular expression is a single character |
544 |
|
// regular expression |
545 |
32 |
if (r.getKind() == STRING_TO_REGEXP) |
546 |
|
{ |
547 |
46 |
Node s = r[0]; |
548 |
23 |
if (s.isConst() && s.getConst<String>().size() == 1) |
549 |
|
{ |
550 |
17 |
lenOnePeriod = true; |
551 |
|
} |
552 |
|
} |
553 |
9 |
else if (r.getKind() == REGEXP_RANGE) |
554 |
|
{ |
555 |
5 |
lenOnePeriod = true; |
556 |
|
} |
557 |
32 |
if (!lenOnePeriod) |
558 |
|
{ |
559 |
10 |
break; |
560 |
|
} |
561 |
|
else |
562 |
|
{ |
563 |
44 |
Node regexp_ch = nm->mkNode(STRING_IN_REGEXP, substr_ch, r); |
564 |
22 |
regexp_ch = Rewriter::rewrite(regexp_ch); |
565 |
22 |
Assert(regexp_ch.getKind() != STRING_IN_REGEXP); |
566 |
22 |
char_constraints.push_back(regexp_ch); |
567 |
|
} |
568 |
|
} |
569 |
28 |
if (lenOnePeriod) |
570 |
|
{ |
571 |
18 |
Assert(!char_constraints.empty()); |
572 |
|
Node bound = nm->mkNode( |
573 |
36 |
AND, nm->mkNode(LEQ, zero, index), nm->mkNode(LT, index, lenx)); |
574 |
36 |
Node conc = char_constraints.size() == 1 ? char_constraints[0] |
575 |
54 |
: nm->mkNode(OR, char_constraints); |
576 |
36 |
Node body = nm->mkNode(OR, bound.negate(), conc); |
577 |
36 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, index); |
578 |
36 |
Node res = utils::mkForallInternal(bvl, body); |
579 |
|
// e.g. |
580 |
|
// x in (re.* (re.union "A" "B" )) ---> |
581 |
|
// forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B") |
582 |
18 |
return returnElim(atom, res, "star-char"); |
583 |
|
} |
584 |
|
// otherwise, for stars of constant length these are periodic |
585 |
10 |
if (disj.size() == 1) |
586 |
|
{ |
587 |
10 |
Node r = disj[0]; |
588 |
6 |
if (r.getKind() == STRING_TO_REGEXP) |
589 |
|
{ |
590 |
10 |
Node s = r[0]; |
591 |
6 |
if (s.isConst()) |
592 |
|
{ |
593 |
4 |
Node lens = nm->mkNode(STRING_LENGTH, s); |
594 |
2 |
lens = Rewriter::rewrite(lens); |
595 |
2 |
Assert(lens.isConst()); |
596 |
2 |
Assert(lens.getConst<Rational>().sgn() > 0); |
597 |
4 |
std::vector<Node> conj; |
598 |
|
// lens is a positive constant, so it is safe to use total div/mod here. |
599 |
|
Node bound = nm->mkNode( |
600 |
|
AND, |
601 |
4 |
nm->mkNode(LEQ, zero, index), |
602 |
8 |
nm->mkNode(LT, index, nm->mkNode(INTS_DIVISION_TOTAL, lenx, lens))); |
603 |
|
Node conc = |
604 |
4 |
nm->mkNode(STRING_SUBSTR, x, nm->mkNode(MULT, index, lens), lens) |
605 |
4 |
.eqNode(s); |
606 |
4 |
Node body = nm->mkNode(OR, bound.negate(), conc); |
607 |
4 |
Node bvl = nm->mkNode(BOUND_VAR_LIST, index); |
608 |
4 |
Node res = utils::mkForallInternal(bvl, body); |
609 |
6 |
res = nm->mkNode( |
610 |
4 |
AND, nm->mkNode(INTS_MODULUS_TOTAL, lenx, lens).eqNode(zero), res); |
611 |
|
// e.g. |
612 |
|
// x in ("abc")* ---> |
613 |
|
// forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^ |
614 |
|
// len(x) mod 3 = 0 |
615 |
2 |
return returnElim(atom, res, "star-constant"); |
616 |
|
} |
617 |
|
} |
618 |
|
} |
619 |
8 |
return Node::null(); |
620 |
|
} |
621 |
|
|
622 |
96 |
Node RegExpElimination::returnElim(Node atom, Node atomElim, const char* id) |
623 |
|
{ |
624 |
192 |
Trace("re-elim") << "re-elim: " << atom << " to " << atomElim << " by " << id |
625 |
96 |
<< "." << std::endl; |
626 |
96 |
return atomElim; |
627 |
|
} |
628 |
92 |
bool RegExpElimination::isProofEnabled() const { return d_pnm != nullptr; } |
629 |
|
|
630 |
|
} // namespace strings |
631 |
|
} // namespace theory |
632 |
29502 |
} // namespace cvc5 |