1 

/********************* */ 
2 

/*! \file core_solver.h 
3 

** \verbatim 
4 

** Top contributors (to current version): 
5 

** Andrew Reynolds, Andres Noetzli, Tianyi Liang 
6 

** This file is part of the CVC4 project. 
7 

** Copyright (c) 20092021 by the authors listed in the file AUTHORS 
8 

** in the toplevel source directory and their institutional affiliations. 
9 

** All rights reserved. See the file COPYING in the toplevel source 
10 

** directory for licensing information.\endverbatim 
11 

** 
12 

** \brief Core solver for the theory of strings, responsible for reasoning 
13 

** string concatenation plus length constraints. 
14 

**/ 
15 


16 

#include "cvc4_private.h" 
17 


18 

#ifndef CVC4__THEORY__STRINGS__CORE_SOLVER_H 
19 

#define CVC4__THEORY__STRINGS__CORE_SOLVER_H 
20 


21 

#include "context/cdhashset.h" 
22 

#include "context/cdlist.h" 
23 

#include "theory/strings/base_solver.h" 
24 

#include "theory/strings/infer_info.h" 
25 

#include "theory/strings/inference_manager.h" 
26 

#include "theory/strings/normal_form.h" 
27 

#include "theory/strings/solver_state.h" 
28 

#include "theory/strings/term_registry.h" 
29 


30 

namespace CVC4 { 
31 

namespace theory { 
32 

namespace strings { 
33 


34 

/** 
35 

* This data structure encapsulates an inference for the core solver of the 
36 

* theory of strings. This includes the form of the inference to be processed 
37 

* by the inference manager, the side effects it generates for the core solver, 
38 

* and information used for heuristics and debugging. 
39 

*/ 
40 
25522 
class CoreInferInfo 
41 

{ 
42 

public: 
43 

CoreInferInfo(InferenceId id); 
44 
38994 
~CoreInferInfo() {} 
45 

/** The infer info of this class */ 
46 

InferInfo d_infer; 
47 

/** 
48 

* The pending phase requirements, see InferenceManager::sendPhaseRequirement. 
49 

*/ 
50 

std::map<Node, bool> d_pendingPhase; 
51 

/** 
52 

* The index in the normal forms under which this inference is addressing. 
53 

* For example, if the inference is inferring x = y from x=y and 
54 

* w ++ x ++ ... = w ++ y ++ ... 
55 

* then d_index is 1, since x and y are at index 1 in these concat terms. 
56 

*/ 
57 

unsigned d_index; 
58 

/** 
59 

* The normal form pair that is cached as a result of this inference. 
60 

*/ 
61 

Node d_nfPair[2]; 
62 

/** for debugging 
63 

* 
64 

* The base pair of strings d_i/d_j that led to the inference, and whether 
65 

* (d_rev) we were processing the normal forms of these strings in reverse 
66 

* direction. 
67 

*/ 
68 

Node d_i; 
69 

Node d_j; 
70 

bool d_rev; 
71 

}; 
72 


73 

/** The core solver for the theory of strings 
74 

* 
75 

* This implements techniques for handling (dis)equalities involving 
76 

* string concatenation terms based on the procedure by Liang et al CAV 2014. 
77 

*/ 
78 

class CoreSolver 
79 

{ 
80 

friend class InferenceManager; 
81 

using NodeIntMap = context::CDHashMap<Node, int, NodeHashFunction>; 
82 


83 

public: 
84 

CoreSolver(SolverState& s, 
85 

InferenceManager& im, 
86 

TermRegistry& tr, 
87 

BaseSolver& bs); 
88 

~CoreSolver(); 
89 


90 

//inference steps 
91 

/** check cycles 
92 

* 
93 

* This inference schema ensures that a containment ordering < over the 
94 

* string equivalence classes is acyclic. We define this ordering < such that 
95 

* for equivalence classes e1 = { t1...tn } and e2 = { s1...sm }, e1 < e2 
96 

* if there exists a ti whose flat form (see below) is [w1...sj...wk] for 
97 

* some i,j. If e1 < ... < en < e1 for some chain, we infer that the flat 
98 

* form components that do not constitute this chain, e.g. (w1...wk) \ sj 
99 

* in the flat form above, must be empty. 
100 

* 
101 

* For more details, see the inference SCycle in Liang et al CAV 2014. 
102 

*/ 
103 

void checkCycles(); 
104 

/** check flat forms 
105 

* 
106 

* This applies an inference schema based on "flat forms". The flat form of a 
107 

* string term t is a vector of representative terms [r1, ..., rn] such that 
108 

* t is of the form t1 ++ ... ++ tm and r1 ++ ... ++ rn is equivalent to 
109 

* rewrite( [t1] ++ ... ++ [tm] ), where [t1] denotes the representative of 
110 

* the equivalence class containing t1. For example, if t is y ++ z ++ z, 
111 

* E is { y = "", w = z }, and w is the representative of the equivalence 
112 

* class { w, z }, then the flat form of t is [w, w]. Say t1 and t2 are terms 
113 

* in the same equivalence classes with flat forms [r1...rn] and [s1...sm]. 
114 

* We may infer various facts based on this pair of terms. For example: 
115 

* ri = si, if ri != si, rj == sj for each j < i, and len(ri)=len(si), 
116 

* rn = sn, if n=m and rj == sj for each j < n, 
117 

* ri = empty, if n=m+1 and ri == rj for each i=1,...,m. 
118 

* We refer to these as "unify", "endpointeq" and "endpointemp" inferences 
119 

* respectively. 
120 

* 
121 

* Notice that this inference scheme is an optimization and not needed for 
122 

* modelsoundness. The motivation for this schema is that it is simpler than 
123 

* checkNormalFormsEq, which can be seen as a recursive version of this 
124 

* schema (see difference of "normal form" vs "flat form" below), and 
125 

* checkNormalFormsEq is complete, in the sense that if it passes with no 
126 

* inferences, we are ensured that all string equalities in the current 
127 

* context are satisfied. 
128 

* 
129 

* Must call checkCycles before this function in a strategy. 
130 

*/ 
131 

void checkFlatForms(); 
132 

/** check normal forms equalities 
133 

* 
134 

* This applies an inference schema based on "normal forms". The normal form 
135 

* of an equivalence class of string terms e = {t1, ..., tn} union {x1....xn}, 
136 

* where t1...tn are concatenation terms is a vector of representative terms 
137 

* [r1, ..., rm] such that: 
138 

* (1) if n=0, then m=1 and r1 is the representative of e, 
139 

* (2) if n>0, say 
140 

* t1 = t^1_1 ++ ... ++ t^1_m_1 
141 

* ... 
142 

* tn = t^1_n ++ ... ++ t^_m_n 
143 

* for *each* i=1, ..., n, the result of concenating the normal forms of 
144 

* t^1_1 ++ ... ++ t^1_m_1 is equal to [r1, ..., rm]. If an equivalence class 
145 

* can be assigned a normal form, then all equalities between ti and tj are 
146 

* satisfied by all models that correspond to extensions of the current 
147 

* assignment. For further detail on this terminology, see Liang et al 
148 

* CAV 2014. 
149 

* 
150 

* Notice that all constant words are implicitly considered concatentation 
151 

* of their characters, e.g. "abc" is treated as "a" ++ "b" ++ "c". 
152 

* 
153 

* At a high level, we build normal forms for equivalence classes bottomup, 
154 

* starting with equivalence classes that are minimal with respect to the 
155 

* containment ordering < computed during checkCycles. While computing a 
156 

* normal form for an equivalence class, we may infer equalities between 
157 

* components of strings that must be equal (e.g. x=y when x++z == y++w when 
158 

* len(x)==len(y) is asserted), derive conflicts if two strings have disequal 
159 

* prefixes/suffixes (e.g. "a" ++ x == "b" ++ y is a conflict), or split 
160 

* string terms into smaller components using fresh skolem variables (see 
161 

* Inference values with names "SPLIT"). We also may introduce regular 
162 

* expression constraints in this method for looping word equations (see 
163 

* the Inference INFER_FLOOP). 
164 

* 
165 

* If this inference schema returns no facts, lemmas, or conflicts, then 
166 

* we have successfully assigned normal forms for all equivalence classes, as 
167 

* stored in d_normal_forms. Otherwise, this method may add a fact, lemma, or 
168 

* conflict based on inferences in the Inference enumeration above. 
169 

*/ 
170 

void checkNormalFormsEq(); 
171 

/** check normal forms disequalities 
172 

* 
173 

* This inference schema can be seen as the converse of the above schema. In 
174 

* particular, it ensures that each pair of distinct equivalence classes 
175 

* e1 and e2 have distinct normal forms. 
176 

* 
177 

* This method considers all pairs of distinct equivalence classes (e1,e2) 
178 

* such that len(x1)==len(x2) is asserted for some x1 in e1 and x2 in e2. It 
179 

* then traverses the normal forms of x1 and x2, say they are [r1, ..., rn] 
180 

* and [s1, ..., sm]. For the minimial i such that ri!=si, if ri and si are 
181 

* disequal and have the same length, then x1 and x2 have distinct normal 
182 

* forms. Otherwise, we may add splitting lemmas on the length of ri and si, 
183 

* or split on an equality between ri and si. 
184 

* 
185 

* If this inference schema returns no facts, lemmas, or conflicts, then all 
186 

* disequalities between string terms are satisfied by all models that are 
187 

* extensions of the current assignment. 
188 

*/ 
189 

void checkNormalFormsDeq(); 
190 

/** check lengths for equivalence classes 
191 

* 
192 

* This inference schema adds lemmas of the form: 
193 

* E => len( x ) = rewrite( len( r1 ++ ... ++ rn ) ) 
194 

* where [r1, ..., rn] is the normal form of the equivalence class containing 
195 

* x. This schema is not required for correctness but experimentally has 
196 

* shown to be helpful. 
197 

*/ 
198 

void checkLengthsEqc(); 
199 

//end inference steps 
200 


201 

// query functions 
202 

/** 
203 

* Get normal form for string term n. For details on this data structure, 
204 

* see theory/strings/normal_form.h. 
205 

* 
206 

* This query is valid after a successful call to checkNormalFormsEq, e.g. 
207 

* a call where the inference manager was not given any lemmas or inferences. 
208 

*/ 
209 

NormalForm& getNormalForm(Node n); 
210 

/** get normal string 
211 

* 
212 

* This method returns the node that is equivalent to the normal form of x, 
213 

* and adds the corresponding explanation to nf_exp. 
214 

* 
215 

* For example, if x = y ++ z is an assertion in the current context, then 
216 

* this method returns the term y ++ z and adds x = y ++ z to nf_exp. 
217 

*/ 
218 

Node getNormalString(Node x, std::vector<Node>& nf_exp); 
219 

// end query functions 
220 


221 

/** 
222 

* This returns the conclusion of the proof rule corresponding to splitting 
223 

* on the arrangement of terms x and y appearing in an equation of the form 
224 

* x ++ x' = y ++ y' or x' ++ x = y' ++ y 
225 

* where we are in the second case if isRev is true. This method is called 
226 

* both by the core solver and by the strings proof checker. 
227 

* 
228 

* @param x The first term 
229 

* @param y The second term 
230 

* @param rule The proof rule whose conclusion we are asking for 
231 

* @param isRev Whether the equation is in a reverse direction 
232 

* @param skc The skolem cache (to allocate fresh variables if necessary) 
233 

* @param newSkolems The vector to add new variables to 
234 

* @return The conclusion of the inference. 
235 

*/ 
236 

static Node getConclusion(Node x, 
237 

Node y, 
238 

PfRule rule, 
239 

bool isRev, 
240 

SkolemCache* skc, 
241 

std::vector<Node>& newSkolems); 
242 

/** 
243 

* Get sufficient nonempty overlap of string constants c and d. 
244 

* 
245 

* This is called when handling equations of the form: 
246 

* x ++ d ++ ... = c ++ ... 
247 

* when x is nonempty and nonconstant. 
248 

* 
249 

* This returns the maximal index in c which x must have as a prefix, which 
250 

* notice is an integer >= 1 since x is nonempty. 
251 

* 
252 

* @param c The first constant 
253 

* @param d The second constant 
254 

* @param isRev Whether the equation is in the reverse direction 
255 

* @return The position in c. 
256 

*/ 
257 

static size_t getSufficientNonEmptyOverlap(Node c, Node d, bool isRev); 
258 

/** 
259 

* This returns the conclusion of the decompose proof rule. This returns 
260 

* a conjunction of splitting string x into pieces based on length l, e.g.: 
261 

* x = k_1 ++ k_2 
262 

* where k_1 (resp. k_2) is a skolem corresponding to a substring of x of 
263 

* length l if isRev is false (resp. true). The function also adds a 
264 

* length constraint len(k_1) = l (resp. len(k_2) = l). Note that adding this 
265 

* constraint to the conclusion is *not* optional, since the skolems k_1 and 
266 

* k_2 may be shared, hence their length constraint must be guarded by the 
267 

* premises of this inference. 
268 

* 
269 

* @param x The string term 
270 

* @param l The length term 
271 

* @param isRev Whether the equation is in a reverse direction 
272 

* @param skc The skolem cache (to allocate fresh variables if necessary) 
273 

* @param newSkolems The vector to add new variables to 
274 

* @return The conclusion of the inference. 
275 

*/ 
276 

static Node getDecomposeConclusion(Node x, 
277 

Node l, 
278 

bool isRev, 
279 

SkolemCache* skc, 
280 

std::vector<Node>& newSkolems); 
281 


282 

private: 
283 

/** 
284 

* This processes the infer info ii as an inference. In more detail, it calls 
285 

* the inference manager to process the inference, and updates the set of 
286 

* normal form pairs. Returns true if the conclusion of ii was not true 
287 

* after rewriting. If the conclusion is true, this method does nothing. 
288 

*/ 
289 

bool processInferInfo(CoreInferInfo& ii); 
290 

/** Add that (n1,n2) is a normal form pair in the current context. */ 
291 

void addNormalFormPair(Node n1, Node n2); 
292 

/** Is (n1,n2) a normal form pair in the current context? */ 
293 

bool isNormalFormPair(Node n1, Node n2); 
294 


295 

//for checkFlatForm 
296 

/** 
297 

* This checks whether there are flat form inferences between eqc[start] and 
298 

* eqc[j] for some j>start. If the flag isRev is true, we check for flat form 
299 

* interferences in the reverse direction of the flat forms (note: 
300 

* `d_flat_form` and `d_flat_form_index` must be in reverse order if `isRev` 
301 

* is true). For more details, see checkFlatForms below. 
302 

*/ 
303 

void checkFlatForm(std::vector<Node>& eqc, size_t start, bool isRev); 
304 

/** debug print current flat forms on trace tc */ 
305 

void debugPrintFlatForms(const char* tc); 
306 

//end for checkFlatForm 
307 


308 

//for checkCycles 
309 

Node checkCycles(Node eqc, std::vector<Node>& curr, std::vector<Node>& exp); 
310 

//end for checkCycles 
311 


312 

//for checkNormalFormsEq 
313 

/** normalize equivalence class 
314 

* 
315 

* This method attempts to build a "normal form" for the equivalence class 
316 

* of string term n (for more details on normal forms, see normal_form.h 
317 

* or see Liang et al CAV 2014). In particular, this method checks whether the 
318 

* current normal form for each term in this equivalence class is identical. 
319 

* If it is not, then we add an inference via sendInference and abort the 
320 

* call. 
321 

* 
322 

* stype is the stringlike type of the equivalence class we are processing. 
323 

*/ 
324 

void normalizeEquivalenceClass(Node n, TypeNode stype); 
325 

/** 
326 

* For each term in the equivalence class of eqc, this adds data regarding its 
327 

* normal form to normal_forms. The map term_to_nf_index maps terms to the 
328 

* index in normal_forms where their normal form data is located. 
329 

* 
330 

* stype is the stringlike type of the equivalence class we are processing. 
331 

*/ 
332 

void getNormalForms(Node eqc, 
333 

std::vector<NormalForm>& normal_forms, 
334 

std::map<Node, unsigned>& term_to_nf_index, 
335 

TypeNode stype); 
336 

/** process normalize equivalence class 
337 

* 
338 

* This is called when an equivalence class eqc contains a set of terms that 
339 

* have normal forms given by the argument normal_forms. It either 
340 

* verifies that all normal forms in this vector are identical, or otherwise 
341 

* adds a conflict, lemma, or inference via the sendInference method. 
342 

* 
343 

* To prioritize one inference versus another, it builds a set of possible 
344 

* inferences, at most two for each pair of distinct normal forms, 
345 

* corresponding to processing the normal form pair in the (forward, reverse) 
346 

* directions. Once all possible inferences are recorded, it executes the 
347 

* one with highest priority based on the enumeration type Inference. 
348 

* 
349 

* stype is the stringlike type of the equivalence class we are processing. 
350 

*/ 
351 

void processNEqc(Node eqc, 
352 

std::vector<NormalForm>& normal_forms, 
353 

TypeNode stype); 
354 

/** process simple normal equality 
355 

* 
356 

* This method is called when two equal terms have normal forms nfi and nfj. 
357 

* It adds (typically at most one) possible inference to the vector pinfer. 
358 

* This inference is in the form of an CoreInferInfo object, which stores the 
359 

* necessary information regarding how to process the inference. 
360 

* 
361 

* index: The index in the normal form vectors (nfi.d_nf and nfj.d_nf) that 
362 

* we are currently checking. This method will increment this index until 
363 

* it finds an index where these vectors differ, or until it reaches the 
364 

* end of these vectors. 
365 

* isRev: Whether we are processing the normal forms in reverse direction. 
366 

* Notice in this case the normal form vectors have been reversed, hence, 
367 

* many operations are identical to the forward case, e.g. index is 
368 

* incremented not decremented, while others require special care, e.g. 
369 

* constant strings "ABC" in the normal form vectors are not reversed to 
370 

* "CBA" and hence all operations should assume a flipped semantics for 
371 

* constants when isRev is true, 
372 

* rproc: the number of string components on the suffix of the normal form of 
373 

* nfi and nfj that were already processed. This is used when using 
374 

* fowards/backwards traversals of normal forms to ensure that duplicate 
375 

* inferences are not processed. 
376 

* pinfer: the set of possible inferences we add to. 
377 

* 
378 

* stype is the stringlike type of the equivalence class we are processing. 
379 

*/ 
380 

void processSimpleNEq(NormalForm& nfi, 
381 

NormalForm& nfj, 
382 

unsigned& index, 
383 

bool isRev, 
384 

unsigned rproc, 
385 

std::vector<CoreInferInfo>& pinfer, 
386 

TypeNode stype); 
387 

//end for checkNormalFormsEq 
388 


389 

//for checkNormalFormsEq with loops 
390 

bool detectLoop(NormalForm& nfi, 
391 

NormalForm& nfj, 
392 

int index, 
393 

int& loop_in_i, 
394 

int& loop_in_j, 
395 

unsigned rproc); 
396 


397 

/** 
398 

* Result of processLoop() below. 
399 

*/ 
400 

enum class ProcessLoopResult 
401 

{ 
402 

/** Loop processing made an inference */ 
403 

INFERENCE, 
404 

/** Loop processing detected a conflict */ 
405 

CONFLICT, 
406 

/** Loop not processed or no loop detected */ 
407 

SKIPPED, 
408 

}; 
409 


410 

ProcessLoopResult processLoop(NormalForm& nfi, 
411 

NormalForm& nfj, 
412 

int loop_index, 
413 

int index, 
414 

CoreInferInfo& info); 
415 

//end for checkNormalFormsEq with loops 
416 


417 

//for checkNormalFormsDeq 
418 


419 

/** 
420 

* Given a pair of disequal strings with the same length, checks whether the 
421 

* disequality holds. This may result in inferences or conflicts. 
422 

* 
423 

* @param n1 The first string in the disequality 
424 

* @param n2 The second string in the disequality 
425 

*/ 
426 

void processDeq(Node n1, Node n2); 
427 


428 

/** 
429 

* Given a pair of disequal strings with the same length and their normal 
430 

* forms, checks whether the disequality holds. This may result in 
431 

* inferences. 
432 

* 
433 

* @param nfi The normal form for the first string in the disequality 
434 

* @param nfj The normal form for the second string in the disequality 
435 

* @param ni The first string in the disequality 
436 

* @param nj The second string in the disequality 
437 

* @return true if the disequality is satisfied, false otherwise 
438 

*/ 
439 

bool processReverseDeq(std::vector<Node>& nfi, 
440 

std::vector<Node>& nfj, 
441 

Node ni, 
442 

Node nj); 
443 


444 

/** 
445 

* Given a pair of disequal strings with the same length and their normal 
446 

* forms, performs some simple checks whether the disequality holds. The 
447 

* check is done starting from a given index and can either be performed on 
448 

* reversed normal forms or the original normal forms. If the function cannot 
449 

* show that a disequality holds, it updates the index to point to the first 
450 

* element in the normal forms for which the relationship is unclear. 
451 

* 
452 

* @param nfi The normal form for the first string in the disequality 
453 

* @param nfj The normal form for the second string in the disequality 
454 

* @param ni The first string in the disequality 
455 

* @param nj The second string in the disequality 
456 

* @param index The index to start at. If this function returns false, the 
457 

* index points to the first index in the normal forms for which 
458 

* it is not known whether they are equal or disequal 
459 

* @param isRev This should be true if the normal forms are reversed, false 
460 

* otherwise 
461 

* @return true if the disequality is satisfied, false otherwise 
462 

*/ 
463 

bool processSimpleDeq(std::vector<Node>& nfi, 
464 

std::vector<Node>& nfj, 
465 

Node ni, 
466 

Node nj, 
467 

size_t& index, 
468 

bool isRev); 
469 

//end for checkNormalFormsDeq 
470 


471 

/** The solver state object */ 
472 

SolverState& d_state; 
473 

/** The (custom) output channel of the theory of strings */ 
474 

InferenceManager& d_im; 
475 

/** Reference to the term registry of theory of strings */ 
476 

TermRegistry& d_termReg; 
477 

/** reference to the base solver, used for certain queries */ 
478 

BaseSolver& d_bsolver; 
479 

/** Commonly used constants */ 
480 

Node d_true; 
481 

Node d_false; 
482 

Node d_zero; 
483 

Node d_one; 
484 

Node d_neg_one; 
485 

/** empty vector (used for trivial explanations) */ 
486 

std::vector<Node> d_emptyVec; 
487 

/** 
488 

* The list of equivalence classes of type string. These are ordered based 
489 

* on the ordering described in checkCycles. 
490 

*/ 
491 

std::vector<Node> d_strings_eqc; 
492 

/** map from terms to their normal forms */ 
493 

std::map<Node, NormalForm> d_normal_form; 
494 

/** 
495 

* In certain cases, we know that two terms are equivalent despite 
496 

* not having to verify their normal forms are identical. For example, 
497 

* after applying the RLoop rule to two terms a and b, we know they 
498 

* are entailed to be equal in the current context without having 
499 

* to look at their normal forms. We call (a,b) a normal form pair. 
500 

* 
501 

* We map representative terms a to a list of nodes b1,...,bn such that 
502 

* (a,b1) ... (a, bn) are normal form pairs. This list is SATcontext 
503 

* dependent. We use a contextdependent integer along with a context 
504 

* indepedent map from nodes to lists of nodes to model this, given by 
505 

* the two data members below. 
506 

*/ 
507 

NodeIntMap d_nfPairs; 
508 

std::map<Node, std::vector<Node> > d_nf_pairs_data; 
509 

/** list of noncongruent concat terms in each equivalence class */ 
510 

std::map<Node, std::vector<Node> > d_eqc; 
511 

/** 
512 

* The flat form for each equivalence class. For a term (str.++ t1 ... tn), 
513 

* this is a list of the form (r_{i1} ... r_{im}) where each empty t1...tn 
514 

* is dropped and the others are replaced in order with their representative. 
515 

*/ 
516 

std::map<Node, std::vector<Node> > d_flat_form; 
517 

/** 
518 

* For each component r_{i1} ... r_{im} in a flat form, this stores 
519 

* the argument number of the t1 ... tn they were generated from. 
520 

*/ 
521 

std::map<Node, std::vector<int> > d_flat_form_index; 
522 

}; /* class CoreSolver */ 
523 


524 

} // namespace strings 
525 

} // namespace theory 
526 

} // namespace CVC4 
527 


528 

#endif /* CVC4__THEORY__STRINGS__CORE_SOLVER_H */ 