1 |
|
/****************************************************************************** |
2 |
|
* Top contributors (to current version): |
3 |
|
* Morgan Deters, Andres Noetzli, Mathias Preiner |
4 |
|
* |
5 |
|
* This file is part of the cvc5 project. |
6 |
|
* |
7 |
|
* Copyright (c) 2009-2021 by the authors listed in the file AUTHORS |
8 |
|
* in the top-level source directory and their institutional affiliations. |
9 |
|
* All rights reserved. See the file COPYING in the top-level source |
10 |
|
* directory for licensing information. |
11 |
|
* **************************************************************************** |
12 |
|
* |
13 |
|
* A custom ANTLR input stream that reads from the input stream lazily |
14 |
|
* |
15 |
|
* WARNING: Edits to this and related files should be done carefully due to the |
16 |
|
* interaction with ANTLR internals. |
17 |
|
* |
18 |
|
* This overwrites the _LA and the consume functions of the ANTLR input stream |
19 |
|
* to use a LineBuffer instead of accessing a buffer. The lines are kept in |
20 |
|
* memory to make sure that existing tokens remain valid (tokens store pointers |
21 |
|
* to the corresponding input). We do not overwrite mark(), etc. because |
22 |
|
* we can use the line number and the position within that line to index into |
23 |
|
* the line buffer and the default markers already store and restore that |
24 |
|
* information. The line buffer guarantees that lines are consecutive in |
25 |
|
* memory, so ANTLR3_INPUT_STREAM::getLineBuf() should work as intended and |
26 |
|
* tokens themselves are consecutive in memory (we are assuming that tokens |
27 |
|
* are not split across multiple lines). |
28 |
|
*/ |
29 |
|
|
30 |
|
#include "parser/antlr_line_buffered_input.h" |
31 |
|
|
32 |
|
#include <antlr3.h> |
33 |
|
|
34 |
|
#include <iostream> |
35 |
|
#include <string> |
36 |
|
|
37 |
|
#include "base/check.h" |
38 |
|
#include "base/output.h" |
39 |
|
|
40 |
|
namespace cvc5 { |
41 |
|
namespace parser { |
42 |
|
|
43 |
|
/**
 * Forward declaration of the helper that allocates and initializes the raw
 * line-buffered ANTLR input stream structure; it is defined later in this
 * file and used by antlr3LineBufferedStreamNew().
 *
 * @param in the C++ input stream the characters originate from
 * @param line_buffer the line buffer used to index into the input
 * @return the new input stream, or NULL on failure
 */
static pANTLR3_INPUT_STREAM antlr3CreateLineBufferedStream(
    std::istream& in,
    LineBuffer* line_buffer);
45 |
|
|
46 |
|
static void |
47 |
4 |
setupInputStream(pANTLR3_INPUT_STREAM input) |
48 |
|
{ |
49 |
|
#if 0 |
50 |
|
ANTLR3_BOOLEAN isBigEndian; |
51 |
|
|
52 |
|
// Used to determine the endianness of the machine we are currently |
53 |
|
// running on. |
54 |
|
// |
55 |
|
ANTLR3_UINT16 bomTest = 0xFEFF; |
56 |
|
|
57 |
|
// What endianess is the machine we are running on? If the incoming |
58 |
|
// encoding endianess is the same as this machine's natural byte order |
59 |
|
// then we can use more efficient API calls. |
60 |
|
// |
61 |
|
if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE) |
62 |
|
{ |
63 |
|
isBigEndian = ANTLR3_TRUE; |
64 |
|
} |
65 |
|
else |
66 |
|
{ |
67 |
|
isBigEndian = ANTLR3_FALSE; |
68 |
|
} |
69 |
|
|
70 |
|
// What encoding did the user tell us {s}he thought it was? I am going |
71 |
|
// to get sick of the questions on antlr-interest, I know I am. |
72 |
|
// |
73 |
|
switch (input->encoding) |
74 |
|
{ |
75 |
|
case ANTLR3_ENC_UTF8: |
76 |
|
|
77 |
|
// See if there is a BOM at the start of this UTF-8 sequence |
78 |
|
// and just eat it if there is. Windows .TXT files have this for instance |
79 |
|
// as it identifies UTF-8 even though it is of no consequence for byte order |
80 |
|
// as UTF-8 does not have a byte order. |
81 |
|
// |
82 |
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xEF |
83 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xBB |
84 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xBF |
85 |
|
) |
86 |
|
{ |
87 |
|
// The UTF8 BOM is present so skip it |
88 |
|
// |
89 |
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3); |
90 |
|
} |
91 |
|
|
92 |
|
// Install the UTF8 input routines |
93 |
|
// |
94 |
|
antlr3UTF8SetupStream(input); |
95 |
|
break; |
96 |
|
|
97 |
|
case ANTLR3_ENC_UTF16: |
98 |
|
|
99 |
|
// See if there is a BOM at the start of the input. If not then |
100 |
|
// we assume that the byte order is the natural order of this |
101 |
|
// machine (or it is really UCS2). If there is a BOM we determine if the encoding |
102 |
|
// is the same as the natural order of this machine. |
103 |
|
// |
104 |
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFE |
105 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFF |
106 |
|
) |
107 |
|
{ |
108 |
|
// BOM Present, indicates Big Endian |
109 |
|
// |
110 |
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2); |
111 |
|
|
112 |
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE); |
113 |
|
} |
114 |
|
else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF |
115 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE |
116 |
|
) |
117 |
|
{ |
118 |
|
// BOM present, indicates Little Endian |
119 |
|
// |
120 |
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2); |
121 |
|
|
122 |
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE); |
123 |
|
} |
124 |
|
else |
125 |
|
{ |
126 |
|
// No BOM present, assume local computer byte order |
127 |
|
// |
128 |
|
antlr3UTF16SetupStream(input, isBigEndian, isBigEndian); |
129 |
|
} |
130 |
|
break; |
131 |
|
|
132 |
|
case ANTLR3_ENC_UTF32: |
133 |
|
|
134 |
|
// See if there is a BOM at the start of the input. If not then |
135 |
|
// we assume that the byte order is the natural order of this |
136 |
|
// machine. If there is we determine if the encoding |
137 |
|
// is the same as the natural order of this machine. |
138 |
|
// |
139 |
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0x00 |
140 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 |
141 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xFE |
142 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3)) == 0xFF |
143 |
|
) |
144 |
|
{ |
145 |
|
// BOM Present, indicates Big Endian |
146 |
|
// |
147 |
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4); |
148 |
|
|
149 |
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE); |
150 |
|
} |
151 |
|
else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF |
152 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE |
153 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 |
154 |
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00 |
155 |
|
) |
156 |
|
{ |
157 |
|
// BOM present, indicates Little Endian |
158 |
|
// |
159 |
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4); |
160 |
|
|
161 |
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE); |
162 |
|
} |
163 |
|
else |
164 |
|
{ |
165 |
|
// No BOM present, assume local computer byte order |
166 |
|
// |
167 |
|
antlr3UTF32SetupStream(input, isBigEndian, isBigEndian); |
168 |
|
} |
169 |
|
break; |
170 |
|
|
171 |
|
case ANTLR3_ENC_UTF16BE: |
172 |
|
|
173 |
|
// Encoding is definately Big Endian with no BOM |
174 |
|
// |
175 |
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE); |
176 |
|
break; |
177 |
|
|
178 |
|
case ANTLR3_ENC_UTF16LE: |
179 |
|
|
180 |
|
// Encoding is definately Little Endian with no BOM |
181 |
|
// |
182 |
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE); |
183 |
|
break; |
184 |
|
|
185 |
|
case ANTLR3_ENC_UTF32BE: |
186 |
|
|
187 |
|
// Encoding is definately Big Endian with no BOM |
188 |
|
// |
189 |
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE); |
190 |
|
break; |
191 |
|
|
192 |
|
case ANTLR3_ENC_UTF32LE: |
193 |
|
|
194 |
|
// Encoding is definately Little Endian with no BOM |
195 |
|
// |
196 |
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE); |
197 |
|
break; |
198 |
|
|
199 |
|
case ANTLR3_ENC_EBCDIC: |
200 |
|
|
201 |
|
// EBCDIC is basically the same as ASCII but with an on the |
202 |
|
// fly translation to ASCII |
203 |
|
// |
204 |
|
antlr3EBCDICSetupStream(input); |
205 |
|
break; |
206 |
|
|
207 |
|
case ANTLR3_ENC_8BIT: |
208 |
|
default: |
209 |
|
|
210 |
|
// Standard 8bit/ASCII |
211 |
|
// |
212 |
|
antlr38BitSetupStream(input); |
213 |
|
break; |
214 |
|
} |
215 |
|
#endif /* 0 */ |
216 |
4 |
} |
217 |
|
|
218 |
54 |
/**
 * Look-ahead routine installed as `_LA` on the line-buffered stream.
 *
 * Asks the line buffer for the character located `la - 1` positions past
 * the stream's current (line, charPositionInLine) location.
 *
 * @param is the ANTLR int stream whose `super` is the input stream
 * @param la the look-ahead distance requested by ANTLR
 * @return the character found, or ANTLR3_CHARSTREAM_EOF when the line buffer
 *         yields no pointer for that position
 */
static ANTLR3_UCHAR bufferedInputLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
{
  pANTLR3_INPUT_STREAM stream = static_cast<pANTLR3_INPUT_STREAM>(is->super);
  cvc5::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM buffered =
      reinterpret_cast<cvc5::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM>(
          stream);

  const uint8_t* peeked = buffered->line_buffer->getPtrWithOffset(
      stream->line, stream->charPositionInLine, la - 1);

  if (peeked == nullptr)
  {
    return ANTLR3_CHARSTREAM_EOF;
  }
  return *peeked;
}
226 |
|
|
227 |
|
/**
 * Rewind routine installed on the line-buffered stream.
 *
 * Releases `mark`, looks up the lexer state stored for it and, if one is
 * found, copies the saved position fields back into the input stream. Unlike
 * the stock antlr38BitRewind(), no seek is performed: the seek there has no
 * effect and it would invoke antlr38BitSeek() rather than the seek() that is
 * overridden for this stream, which leads to subtle bugs.
 *
 * @param is the ANTLR int stream whose `super` is the input stream
 * @param mark the marker to rewind to
 */
static void bufferedInputRewind(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
{
  pANTLR3_INPUT_STREAM stream = static_cast<pANTLR3_INPUT_STREAM>(is->super);

  // Let the stream clean up its marks first.
  stream->istream->release(stream->istream, mark);

  // Markers are 1-based, the marker vector is indexed from 0.
  const ANTLR3_UINT32 slot = static_cast<ANTLR3_UINT32>(mark - 1);
  pANTLR3_LEX_STATE saved = static_cast<pANTLR3_LEX_STATE>(
      stream->markers->get(stream->markers, slot));

  if (saved != nullptr)
  {
    // Restore the position recorded when the mark was taken.
    stream->charPositionInLine = saved->charPositionInLine;
    stream->currentLine = saved->currentLine;
    stream->line = saved->line;
    stream->nextChar = saved->nextChar;
  }
}
254 |
|
|
255 |
14 |
/**
 * Consume routine installed on the line-buffered stream.
 *
 * Advances the stream by one character. When the consumed character is
 * LineBuffer::NewLineChar, the stream moves to column 0 of the next line and
 * `currentLine` is refreshed; otherwise only the column advances. In both
 * cases `nextChar` is updated from the line buffer. If the line buffer has
 * no pointer for the current position, nothing is changed.
 *
 * @param is the ANTLR int stream whose `super` is the input stream
 */
static void bufferedInputConsume(pANTLR3_INT_STREAM is)
{
  pANTLR3_INPUT_STREAM stream = static_cast<pANTLR3_INPUT_STREAM>(is->super);
  LineBuffer* lines =
      reinterpret_cast<cvc5::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM>(
          stream)
          ->line_buffer;

  const uint8_t* here =
      lines->getPtr(stream->line, stream->charPositionInLine);
  if (here == nullptr)
  {
    // No character available at the current position.
    return;
  }

  if (*here == LineBuffer::NewLineChar)
  {
    // Start of a new line of input.
    ++stream->line;
    stream->charPositionInLine = 0;
    stream->currentLine =
        lines->getPtr(stream->line, stream->charPositionInLine);
    Debug("pipe") << "-- newline!" << std::endl;
  }
  else
  {
    ++stream->charPositionInLine;
  }

  stream->nextChar = lines->getPtr(stream->line, stream->charPositionInLine);
}
278 |
|
|
279 |
|
/**
 * Seek routine installed on the line-buffered stream.
 *
 * Only forward seeks are supported, in contrast to the stock
 * antlr38BitSeek(). Backward seeks are only needed there for rewinding, and
 * rewinding on this stream does not seek. The stream is advanced by
 * repeatedly calling consume() until `nextChar` equals `seekPoint`.
 *
 * @param is the ANTLR int stream whose `super` is the input stream
 * @param seekPoint the target position, a character pointer encoded as a
 *                  marker
 */
static void bufferedInputSeek(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
{
  pANTLR3_INPUT_STREAM stream = static_cast<pANTLR3_INPUT_STREAM>(is->super);

  // The target must not lie before the current position.
  Assert(!reinterpret_cast<cvc5::parser::pANTLR3_LINE_BUFFERED_INPUT_STREAM>(
              stream)
              ->line_buffer->isPtrBefore(reinterpret_cast<uint8_t*>(seekPoint),
                                         stream->line,
                                         stream->charPositionInLine));

  for (;;)
  {
    if (reinterpret_cast<ANTLR3_MARKER>(stream->nextChar) == seekPoint)
    {
      break;
    }
    is->consume(is);
  }
}
295 |
|
|
296 |
|
/**
 * Size routine installed on the line-buffered stream.
 *
 * Querying the size is not supported for this kind of stream, so the call
 * trips an assertion; 0 is returned if execution continues past it.
 *
 * @return always 0
 */
static ANTLR3_UINT32 bufferedInputSize(pANTLR3_INPUT_STREAM /* input */)
{
  // Unsupported operation for a lazily read stream.
  Assert(false);
  return 0;
}
301 |
|
|
302 |
|
/**
 * SetNewLineChar routine installed on the line-buffered stream.
 *
 * Changing the newline character is not supported for this kind of stream,
 * so the call trips an assertion and otherwise does nothing.
 */
static void bufferedInputSetNewLineChar(pANTLR3_INPUT_STREAM /* input */,
                                        ANTLR3_UINT32 /* newlineChar */)
{
  // Unsupported operation for this stream type.
  Assert(false);
}
307 |
|
|
308 |
|
/**
 * setUcaseLA routine installed on the line-buffered stream.
 *
 * Upper-case look-ahead is not supported for this kind of stream, so the
 * call trips an assertion and otherwise does nothing.
 */
static void bufferedInputSetUcaseLA(pANTLR3_INPUT_STREAM /* input */,
                                    ANTLR3_BOOLEAN /* flag */)
{
  // Unsupported operation for this stream type.
  Assert(false);
}
313 |
|
|
314 |
4 |
/**
 * Creates an ANTLR input stream that reads through a line buffer.
 *
 * The stream structure is created first; then the look-ahead, consume, seek,
 * rewind, size, newline-char and upper-case-LA entry points are replaced by
 * the line-buffered versions defined in this file. Finally the encoding
 * (unless CVC5_ANTLR3_OLD_INPUT_STREAM is defined) and the stream/file name
 * are recorded.
 *
 * @param in the C++ input stream to read from
 * @param encoding the ANTLR encoding identifier to store on the stream
 * @param name the name to use as stream name and file name
 * @param line_buffer the line buffer backing the stream
 * @return the configured input stream, or NULL if `in` is in a failed state
 *         or the stream structure could not be created
 */
pANTLR3_INPUT_STREAM antlr3LineBufferedStreamNew(std::istream& in,
                                                 ANTLR3_UINT32 encoding,
                                                 pANTLR3_UINT8 name,
                                                 LineBuffer* line_buffer)
{
  if (!in)
  {
    return NULL;
  }

  // Set up the stream structure and install the data pointer; the API
  // functions installed for the default 8-bit stream are adjusted below.
  pANTLR3_INPUT_STREAM input = antlr3CreateLineBufferedStream(in, line_buffer);
  if (input == NULL)
  {
    return NULL;
  }

  // Route character access through the line buffer.
  input->istream->_LA = bufferedInputLA;
  input->istream->consume = bufferedInputConsume;
  input->istream->seek = bufferedInputSeek;
  input->istream->rewind = bufferedInputRewind;

  // Operations that are not available on this stream type.
  input->size = bufferedInputSize;
  input->SetNewLineChar = bufferedInputSetNewLineChar;
  input->setUcaseLA = bufferedInputSetUcaseLA;

#ifndef CVC5_ANTLR3_OLD_INPUT_STREAM
  // Remember the encoding scheme we were given by the user.
  input->encoding = encoding;
#endif /* ! CVC5_ANTLR3_OLD_INPUT_STREAM */

  // Work out the endian type and install any API functions that differ
  // from 8-bit.
  setupInputStream(input);

  // Record the name both as stream name and as file name.
  input->fileName = input->istream->streamName =
      input->strFactory->newStr8(input->strFactory, name);

  return input;
}
363 |
|
|
364 |
4 |
/**
 * Allocates and initializes the raw line-buffered ANTLR input stream.
 *
 * A zero-initialized ANTLR3_LINE_BUFFERED_INPUT_STREAM is allocated, the
 * address of `in` and the `line_buffer` pointer are stored in it, and the
 * common ANTLR stream setup routine(s) are run. The position is then set to
 * line 0, column 0, with `nextChar` and `currentLine` pointing at the start
 * of the line buffer.
 *
 * @param in the C++ input stream; its address is stored in the new structure
 * @param line_buffer the line buffer backing the stream; it is dereferenced
 *                    here and must not be null
 * @return the new input stream, or NULL if `in` is in a failed state or the
 *         allocation fails
 */
static pANTLR3_INPUT_STREAM antlr3CreateLineBufferedStream(
    std::istream& in, LineBuffer* line_buffer)
{
  if (!in)
  {
    return NULL;
  }

  // Allocate memory for the input stream structure
  //
  pANTLR3_INPUT_STREAM input = (pANTLR3_INPUT_STREAM)ANTLR3_CALLOC(
      1, sizeof(ANTLR3_LINE_BUFFERED_INPUT_STREAM));

  if (input == NULL)
  {
    return NULL;
  }

  // Structure was allocated correctly, now we can install the pointers.
  // There is no in-memory data block: characters come from the line buffer.
  //
  input->data = NULL;
  input->isAllocated = ANTLR3_FALSE;

  ((pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->in = &in;
  ((pANTLR3_LINE_BUFFERED_INPUT_STREAM)input)->line_buffer = line_buffer;

  // Call the common 8 bit input stream handler
  // initialization.
  //
#ifdef CVC5_ANTLR3_OLD_INPUT_STREAM
  antlr3AsciiSetupStream(input, ANTLR3_CHARSTREAM);
#else  /* CVC5_ANTLR3_OLD_INPUT_STREAM */
  antlr38BitSetupStream(input);
  // In some libantlr3c 3.4-beta versions, this call is not included in the
  // above.
  // This is probably an erroneously-deleted line in the libantlr3c source since
  // 3.2.
  antlr3GenericSetupStream(input);
#endif /* CVC5_ANTLR3_OLD_INPUT_STREAM */

  // Start at the very beginning of the line buffer.
  input->sizeBuf = 0;
  input->newlineChar = LineBuffer::NewLineChar;
  input->charPositionInLine = 0;
  input->line = 0;
  input->nextChar = line_buffer->getPtr(0, 0);
  input->currentLine = line_buffer->getPtr(0, 0);
  return input;
}
412 |
|
|
413 |
|
} // namespace parser |
414 |
29322 |
} // namespace cvc5 |