1 /* ------------------------------------------------------------------------- */
2 /* "lexer" : Lexical analyser */
4 /* Copyright (c) Graham Nelson 1993 - 2018 */
6 /* This file is part of Inform. */
8 /* Inform is free software: you can redistribute it and/or modify */
9 /* it under the terms of the GNU General Public License as published by */
10 /* the Free Software Foundation, either version 3 of the License, or */
11 /* (at your option) any later version. */
13 /* Inform is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16 /* GNU General Public License for more details. */
18 /* You should have received a copy of the GNU General Public License */
19 /* along with Inform. If not, see https://gnu.org/licenses/ */
21 /* ------------------------------------------------------------------------- */
25 int total_source_line_count, /* Number of source lines so far */
27 no_hash_printed_yet, /* Have not yet printed the first # */
28 hash_printed_since_newline, /* A hash has been printed since the
29 most recent new-line was printed
30 (generally as a result of an error
31 message or the start of pass) */
32 dont_enter_into_symbol_table, /* Return names as text (with
33 token type DQ_TT, i.e., as if
34 they had double-quotes around)
35 and not as entries in the symbol
36 table, when TRUE. If -2, only the
37 keyword table is searched. */
38 return_sp_as_variable; /* When TRUE, the word "sp" denotes
39 the stack pointer variable
40 (used in assembly language only) */
41 int next_token_begins_syntax_line; /* When TRUE, start a new syntax
42 line (for error reporting, etc.)
43 on the source code line where
44 the next token appears */
46 int32 last_mapped_line; /* Last syntax line reported to debugging file */
48 /* ------------------------------------------------------------------------- */
49 /* The lexer's output is a sequence of triples, each called a "token", */
50 /* representing one lexical unit (or "lexeme") each. Instead of providing */
51 /* "lookahead" (that is, always having available the next token after the */
52 /* current one, so that syntax analysers higher up in Inform can have */
53 /* advance knowledge of what is coming), the lexer instead has a system */
54 /* where tokens can be read in and then "put back again". */
55 /* The meaning of the number (and to some extent the text) supplied with */
56 /* a token depends on its type: see "header.h" for the list of types. */
57 /* For example, the lexeme "$1e3" is understood by Inform as a hexadecimal */
58 /* number, and translated to the token: */
59 /* type NUMBER_TT, value 483, text "$1e3" */
60 /* ------------------------------------------------------------------------- */
61 /* These three variables are set to the current token on a call to */
62 /* get_next_token() (but are not changed by a call to put_token_back()). */
63 /* ------------------------------------------------------------------------- */
69 /* ------------------------------------------------------------------------- */
70 /* The next two variables are the head and tail of a singly linked list. */
71 /* The tail stores the portion most recently read from the current */
72 /* lexical block; its end values therefore describe the location of the */
73 /* current token, and are updated whenever the three variables above are */
74 /* via set_token_location(...). Earlier vertices, if any, represent the */
75 /* regions of lexical blocks read beforehand, where new vertices are */
76 /* only introduced by interruptions like a file inclusion or an EOF. */
77 /* Vertices are deleted off of the front of the list once they are no */
78 /* longer referenced by pending debug information records. */
79 /* ------------------------------------------------------------------------- */
/* Head of the singly linked list of token locations: the oldest vertex,
   deleted from the front once no debug record references it. */
81 static debug_locations *first_token_locations;
/* Tail of the list: its end fields describe the location of the token
   most recently read (see the comment block above). */
82 static debug_locations *last_token_location;
/* Build a zero-width debug_location positioned at the end of the most
   recently read token: every "beginning" field of the result is copied
   from the corresponding "end" field of the tail vertex of the list.
   NOTE(review): interior lines of this function (presumably its return
   statement and closing brace) are not visible in this view of the file. */
84 extern debug_location get_token_location(void)
85 { debug_location result;
86 debug_location *location = &(last_token_location->location);
87 result.file_index = location->file_index;
/* Collapse the range to a point: beginning == end for every coordinate. */
88 result.beginning_byte_index = location->end_byte_index;
89 result.end_byte_index = location->end_byte_index;
90 result.beginning_line_number = location->end_line_number;
91 result.end_line_number = location->end_line_number;
92 result.beginning_character_number = location->end_character_number;
93 result.end_character_number = location->end_character_number;
/* "orig" fields track positions in the pre-#origsource file. */
94 result.orig_file_index = location->orig_file_index;
95 result.orig_beg_line_number = location->orig_beg_line_number;
96 result.orig_beg_char_number = location->orig_beg_char_number;
/* Wrap the current (zero-width) token location in a debug_locations
   node with a zero reference count.
   NOTE(review): the assignment of result.next and the return statement
   are not visible in this view of the file. */
100 extern debug_locations get_token_locations(void)
101 { debug_locations result;
102 result.location = get_token_location();
104 result.reference_count = 0;
/* Record the location of the token just read.  If it lies in the same
   lexical block as the tail vertex (same file_index), simply advance the
   tail's end coordinates; otherwise a fresh vertex is allocated and
   appended, becoming the new tail.
   NOTE(review): the "else" branch header and the allocator call (between
   original lines 121-124) are not visible in this view of the file. */
108 static void set_token_location(debug_location location)
109 { if (location.file_index == last_token_location->location.file_index)
110 { last_token_location->location.end_byte_index =
111 location.end_byte_index;
112 last_token_location->location.end_line_number =
113 location.end_line_number;
114 last_token_location->location.end_character_number =
115 location.end_character_number;
116 last_token_location->location.orig_file_index =
117 location.orig_file_index;
118 last_token_location->location.orig_beg_line_number =
119 location.orig_beg_line_number;
120 last_token_location->location.orig_beg_char_number =
121 location.orig_beg_char_number;
/* New lexical block: append a fresh vertex to the list. */
123 { debug_locations*successor =
125 (sizeof(debug_locations),
126 "debug locations of recent tokens");
127 successor->location = location;
128 successor->next = NULL;
129 successor->reference_count = 0;
130 last_token_location->next = successor;
131 last_token_location = successor;
/* Snapshot the current end position of the tail vertex as a "beginning",
   bumping its reference count so that cleanup_token_locations() will not
   free the vertex while this beginning is outstanding.
   NOTE(review): the return statement and closing brace are not visible
   in this view of the file. */
135 extern debug_location_beginning get_token_location_beginning(void)
136 { debug_location_beginning result;
/* Pin the tail vertex: released by discard/get_token_location_end. */
137 ++(last_token_location->reference_count);
138 result.head = last_token_location;
139 result.beginning_byte_index =
140 last_token_location->location.end_byte_index;
141 result.beginning_line_number =
142 last_token_location->location.end_line_number;
143 result.beginning_character_number =
144 last_token_location->location.end_character_number;
145 result.orig_file_index = last_token_location->location.orig_file_index;
146 result.orig_beg_line_number = last_token_location->location.orig_beg_line_number;
147 result.orig_beg_char_number = last_token_location->location.orig_beg_char_number;
/* Delete unreferenced vertices from the front of the token-location list.
   If "beginning" is non-NULL it is sanity-checked: reaching or emptying
   the list past its head indicates an invalid reuse of a beginning, which
   is reported as an error.
   NOTE(review): several interior lines (the conditionals guarding the two
   error messages, the name of the error-reporting routine, and closing
   braces) are not visible in this view of the file. */
152 static void cleanup_token_locations(debug_location_beginning*beginning)
153 { if (first_token_locations)
/* Pop vertices whose reference_count has dropped to zero. */
154 { while (first_token_locations &&
155 !first_token_locations->reference_count)
156 { debug_locations*moribund = first_token_locations;
157 first_token_locations = moribund->next;
158 my_free(&moribund, "debug locations of recent tokens");
160 (beginning->head == moribund || !first_token_locations))
162 ("Records needed by a debug_location_beginning are no "
163 "longer allocated, perhaps because of an invalid reuse "
164 "of this or an earlier beginning");
170 ("Attempt to use a debug_location_beginning when no token "
171 "locations are defined");
174 ("Attempt to clean up token locations when no token locations "
/* Release the reference taken by get_token_location_beginning() without
   producing a location range.
   NOTE(review): the closing brace is not visible in this view of the file. */
180 extern void discard_token_location(debug_location_beginning beginning)
181 { --(beginning.head->reference_count);
/* Close off a location range opened by get_token_location_beginning():
   releases the reference, then combines the stored beginning coordinates
   with the head vertex's current end coordinates into a debug_locations
   result.  Zero-length prefixes left in a prior lexical block are skipped
   (see the comment in the body).
   NOTE(review): the "else" keyword before original line 202 and the
   return statement are not visible in this view of the file. */
184 extern debug_locations get_token_location_end
185 (debug_location_beginning beginning)
186 { debug_locations result;
187 cleanup_token_locations(&beginning);
188 --(beginning.head->reference_count);
189 /* Sometimes we know what we'll read before we switch to the lexical block
190 where we'll read it. In such cases the beginning will be placed in the
191 prior block and last exactly zero bytes there. It's misleading to
192 include such ranges, so we gobble them. */
193 if (beginning.head->location.end_byte_index ==
194 beginning.beginning_byte_index &&
195 beginning.head->next)
/* Zero-byte prefix: advance to the next block and start the range at
   its very beginning (byte 0, line 1, character 1). */
196 { beginning.head = beginning.head->next;
197 result.location = beginning.head->location;
198 result.location.beginning_byte_index = 0;
199 result.location.beginning_line_number = 1;
200 result.location.beginning_character_number = 1;
/* Normal case: the range begins where the beginning was recorded. */
202 { result.location = beginning.head->location;
203 result.location.beginning_byte_index =
204 beginning.beginning_byte_index;
205 result.location.beginning_line_number =
206 beginning.beginning_line_number;
207 result.location.beginning_character_number =
208 beginning.beginning_character_number;
211 result.location.orig_file_index =
212 beginning.orig_file_index;
213 result.location.orig_beg_line_number =
214 beginning.orig_beg_line_number;
215 result.location.orig_beg_char_number =
216 beginning.orig_beg_char_number;
218 result.next = beginning.head->next;
219 result.reference_count = 0;
223 /* ------------------------------------------------------------------------- */
224 /* In order to be able to put tokens back efficiently, the lexer stores */
225 /* tokens in a "circle": the variable circle_position ranges between */
226 /* 0 and CIRCLE_SIZE-1. We only need a circle size as large as the */
227 /* maximum number of tokens ever put back at once, plus 1 (in effect, the */
228 /* maximum token lookahead ever needed in syntax analysis, plus 1). */
230 /* Unlike some compilers, Inform does not have a context-free lexer: in */
231 /* fact it has 12288 different possible states. However, the context only */
232 /* affects the interpretation of "identifiers": lexemes beginning with a */
233 /* letter and containing up to 32 chars of alphanumeric and underscore */
234 /* chars. (For example, "default" may refer to the directive or statement */
235 /* of that name, and which token values are returned depends on the */
236 /* current lexical context.) */
238 /* Along with each token, we also store the lexical context it was */
239 /* translated under; because if it is called for again, there may need */
240 /* to be a fresh interpretation of it if the context has changed. */
241 /* ------------------------------------------------------------------------- */
243 #define CIRCLE_SIZE 6
245 /* (The worst case for token lookahead is distinguishing between an
246 old-style "objectloop (a in b)" and a new "objectloop (a in b ...)".) */
248 static int circle_position;
249 static token_data circle[CIRCLE_SIZE];
251 static int token_contexts[CIRCLE_SIZE];
253 /* ------------------------------------------------------------------------- */
254 /* A complication, however, is that the text of some lexemes needs to be */
255 /* held in Inform's memory for much longer periods: for example, a */
256 /* dictionary word lexeme (like "'south'") must have its text preserved */
257 /* until the code generation time for the expression it occurs in, when */
258 /* the dictionary reference is actually made. Code generation in general */
259 /* occurs as early as possible in Inform: pending some better method of */
260 /* garbage collection, we simply use a buffer so large that unless */
261 /* expressions spread across 10K of source code are found, there can be */
263 /* ------------------------------------------------------------------------- */
265 static char *lexeme_memory;
266 static char *lex_p; /* Current write position */
268 /* ------------------------------------------------------------------------- */
269 /* The lexer itself needs up to 3 characters of lookahead (it uses an */
270 /* LR(3) grammar to translate characters into tokens). */
271 /* ------------------------------------------------------------------------- */
273 #define LOOKAHEAD_SIZE 3
275 static int current, lookahead, /* The latest character read, and */
276 lookahead2, lookahead3; /* the three characters following it */
278 static int pipeline_made; /* Whether or not the pipeline of
279 characters has been constructed
282 static int (* get_next_char)(void); /* Routine for reading the stream of
283 characters: the lexer does not
284 need any "ungetc" routine for
285 putting them back again. End of
286 stream is signalled by returning
289 static char *source_to_analyse; /* The current lexical source:
290 NULL for "load from source files",
291 otherwise this points to a string
292 containing Inform code */
294 static int tokens_put_back; /* Count of the number of backward
295 moves made from the last-read
/* Print a human-readable description of token t to stdout (used for
   lexer tracing and error diagnostics).  Covers every token type,
   including the "etoken" types assigned later by the expression
   evaluator, with a fall-through message for unknown types.
   NOTE(review): the switch header, the break statements between cases,
   and the closing braces are not visible in this view of the file. */
298 extern void describe_token(token_data t)
300 /* Many of the token types are not set in this file, but later on in
301 Inform's higher stages (for example, in the expression evaluator);
302 but this routine describes them all. */
308 /* The following token types occur in lexer output: */
310 case SYMBOL_TT: printf("symbol ");
311 describe_symbol(t.value);
313 case NUMBER_TT: printf("literal number %d", t.value);
315 case DQ_TT: printf("string \"%s\"", t.text);
317 case SQ_TT: printf("string '%s'", t.text);
319 case SEP_TT: printf("separator '%s'", t.text);
321 case EOF_TT: printf("end of file");
324 case STATEMENT_TT: printf("statement name '%s'", t.text);
326 case SEGMENT_MARKER_TT: printf("object segment marker '%s'", t.text);
328 case DIRECTIVE_TT: printf("directive name '%s'", t.text);
330 case CND_TT: printf("textual conditional '%s'", t.text);
332 case OPCODE_NAME_TT: printf("opcode name '%s'", t.text);
334 case SYSFUN_TT: printf("built-in function name '%s'", t.text);
336 case LOCAL_VARIABLE_TT: printf("local variable name '%s'", t.text);
338 case MISC_KEYWORD_TT: printf("statement keyword '%s'", t.text);
340 case DIR_KEYWORD_TT: printf("directive keyword '%s'", t.text);
342 case TRACE_KEYWORD_TT: printf("'trace' keyword '%s'", t.text);
344 case SYSTEM_CONSTANT_TT: printf("system constant name '%s'", t.text);
347 /* The remaining are etoken types, not set by the lexer */
349 case OP_TT: printf("operator '%s'",
350 operators[t.value].description);
352 case ENDEXP_TT: printf("end of expression");
354 case SUBOPEN_TT: printf("open bracket");
356 case SUBCLOSE_TT: printf("close bracket");
358 case LARGE_NUMBER_TT: printf("large number: '%s'=%d",t.text,t.value);
360 case SMALL_NUMBER_TT: printf("small number: '%s'=%d",t.text,t.value);
362 case VARIABLE_TT: printf("variable '%s'=%d", t.text, t.value);
364 case DICTWORD_TT: printf("dictionary word '%s'", t.text);
366 case ACTION_TT: printf("action name '%s'", t.text);
/* Default case: token type unknown to this routine. */
370 printf("** unknown token type %d, text='%s', value=%d **",
371 t.type, t.text, t.value);
376 /* ------------------------------------------------------------------------- */
377 /* All but one of the 280 Inform keywords (118 of them opcode names used */
378 /* only by the assembler). (The one left over is "sp", a keyword used in */
379 /* assembly language only.) */
381 /* A "keyword group" is a set of keywords to be searched for. If a match */
382 /* is made on an identifier, the token type becomes that given in the KG */
383 /* and the token value is its index in the KG. */
385 /* The keyword ordering must correspond with the appropriate #define's in */
386 /* "header.h" but is otherwise not significant. */
387 /* ------------------------------------------------------------------------- */
389 #define MAX_KEYWORDS 350
391 /* The values will be filled in at compile time, when we know
392 which opcode set to use. */
393 keyword_group opcode_names =
395 OPCODE_NAME_TT, FALSE, TRUE
398 static char *opcode_list_z[] = {
399 "je", "jl", "jg", "dec_chk", "inc_chk", "jin", "test", "or", "and",
400 "test_attr", "set_attr", "clear_attr", "store", "insert_obj", "loadw",
401 "loadb", "get_prop", "get_prop_addr", "get_next_prop", "add", "sub",
402 "mul", "div", "mod", "call", "storew", "storeb", "put_prop", "sread",
403 "print_char", "print_num", "random", "push", "pull", "split_window",
404 "set_window", "output_stream", "input_stream", "sound_effect", "jz",
405 "get_sibling", "get_child", "get_parent", "get_prop_len", "inc", "dec",
406 "print_addr", "remove_obj", "print_obj", "ret", "jump", "print_paddr",
407 "load", "not", "rtrue", "rfalse", "print", "print_ret", "nop", "save",
408 "restore", "restart", "ret_popped", "pop", "quit", "new_line",
409 "show_status", "verify", "call_2s", "call_vs", "aread", "call_vs2",
410 "erase_window", "erase_line", "set_cursor", "get_cursor",
411 "set_text_style", "buffer_mode", "read_char", "scan_table", "call_1s",
412 "call_2n", "set_colour", "throw", "call_vn", "call_vn2", "tokenise",
413 "encode_text", "copy_table", "print_table", "check_arg_count", "call_1n",
414 "catch", "piracy", "log_shift", "art_shift", "set_font", "save_undo",
415 "restore_undo", "draw_picture", "picture_data", "erase_picture",
416 "set_margins", "move_window", "window_size", "window_style",
417 "get_wind_prop", "scroll_window", "pop_stack", "read_mouse",
418 "mouse_window", "push_stack", "put_wind_prop", "print_form",
419 "make_menu", "picture_table", "print_unicode", "check_unicode",
423 static char *opcode_list_g[] = {
424 "nop", "add", "sub", "mul", "div", "mod", "neg", "bitand", "bitor",
425 "bitxor", "bitnot", "shiftl", "sshiftr", "ushiftr", "jump", "jz",
426 "jnz", "jeq", "jne", "jlt", "jge", "jgt", "jle",
427 "jltu", "jgeu", "jgtu", "jleu",
429 "catch", "throw", "tailcall",
430 "copy", "copys", "copyb", "sexs", "sexb", "aload",
431 "aloads", "aloadb", "aloadbit", "astore", "astores", "astoreb",
432 "astorebit", "stkcount", "stkpeek", "stkswap", "stkroll", "stkcopy",
433 "streamchar", "streamnum", "streamstr",
434 "gestalt", "debugtrap", "getmemsize", "setmemsize", "jumpabs",
435 "random", "setrandom", "quit", "verify",
436 "restart", "save", "restore", "saveundo", "restoreundo", "protect",
437 "glk", "getstringtbl", "setstringtbl", "getiosys", "setiosys",
438 "linearsearch", "binarysearch", "linkedsearch",
439 "callf", "callfi", "callfii", "callfiii",
441 "mzero", "mcopy", "malloc", "mfree",
442 "accelfunc", "accelparam",
443 "numtof", "ftonumz", "ftonumn", "ceil", "floor",
444 "fadd", "fsub", "fmul", "fdiv", "fmod",
445 "sqrt", "exp", "log", "pow",
446 "sin", "cos", "tan", "asin", "acos", "atan", "atan2",
447 "jfeq", "jfne", "jflt", "jfle", "jfgt", "jfge", "jisnan", "jisinf",
451 keyword_group opcode_macros =
453 OPCODE_MACRO_TT, FALSE, TRUE
456 static char *opmacro_list_z[] = { "" };
458 static char *opmacro_list_g[] = {
463 keyword_group directives =
464 { { "abbreviate", "array", "attribute", "class", "constant",
465 "default", "dictionary", "end", "endif", "extend", "fake_action",
466 "global", "ifdef", "ifndef", "ifnot", "ifv3", "ifv5", "iftrue",
467 "iffalse", "import", "include", "link", "lowstring", "message",
468 "nearby", "object", "origsource", "property", "release", "replace",
469 "serial", "switches", "statusline", "stub", "system_file", "trace",
470 "undef", "verb", "version", "zcharacter",
472 DIRECTIVE_TT, FALSE, FALSE
475 keyword_group trace_keywords =
476 { { "dictionary", "symbols", "objects", "verbs",
477 "assembly", "expressions", "lines", "tokens", "linker",
479 TRACE_KEYWORD_TT, FALSE, TRUE
482 keyword_group segment_markers =
483 { { "class", "has", "private", "with", "" },
484 SEGMENT_MARKER_TT, FALSE, TRUE
487 keyword_group directive_keywords =
488 { { "alias", "long", "additive",
490 "noun", "held", "multi", "multiheld", "multiexcept",
491 "multiinside", "creature", "special", "number", "scope", "topic",
492 "reverse", "meta", "only", "replace", "first", "last",
493 "string", "table", "buffer", "data", "initial", "initstr",
494 "with", "private", "has", "class",
495 "error", "fatalerror", "warning",
498 DIR_KEYWORD_TT, FALSE, TRUE
501 keyword_group misc_keywords =
502 { { "char", "name", "the", "a", "an", "The", "number",
503 "roman", "reverse", "bold", "underline", "fixed", "on", "off",
504 "to", "address", "string", "object", "near", "from", "property", "A", "" },
505 MISC_KEYWORD_TT, FALSE, TRUE
508 keyword_group statements =
509 { { "box", "break", "continue", "default", "do", "else", "font", "for",
510 "give", "if", "inversion", "jump", "move", "new_line", "objectloop",
511 "print", "print_ret", "quit", "read", "remove", "restore", "return",
512 "rfalse", "rtrue", "save", "spaces", "string", "style", "switch",
513 "until", "while", "" },
514 STATEMENT_TT, FALSE, TRUE
517 keyword_group conditions =
518 { { "has", "hasnt", "in", "notin", "ofclass", "or", "provides", "" },
522 keyword_group system_functions =
523 { { "child", "children", "elder", "eldest", "indirect", "parent", "random",
524 "sibling", "younger", "youngest", "metaclass", "glk", "" },
525 SYSFUN_TT, FALSE, TRUE
528 keyword_group system_constants =
529 { { "adjectives_table", "actions_table", "classes_table",
530 "identifiers_table", "preactions_table", "version_number",
531 "largest_object", "strings_offset", "code_offset",
532 "dict_par1", "dict_par2", "dict_par3", "actual_largest_object",
533 "static_memory_offset", "array_names_offset", "readable_memory_offset",
534 "cpv__start", "cpv__end", "ipv__start", "ipv__end",
535 "array__start", "array__end",
536 "lowest_attribute_number", "highest_attribute_number",
537 "attribute_names_array",
538 "lowest_property_number", "highest_property_number",
539 "property_names_array",
540 "lowest_action_number", "highest_action_number",
541 "action_names_array",
542 "lowest_fake_action_number", "highest_fake_action_number",
543 "fake_action_names_array",
544 "lowest_routine_number", "highest_routine_number", "routines_array",
545 "routine_names_array", "routine_flags_array",
546 "lowest_global_number", "highest_global_number", "globals_array",
547 "global_names_array", "global_flags_array",
548 "lowest_array_number", "highest_array_number", "arrays_array",
549 "array_names_array", "array_flags_array",
550 "lowest_constant_number", "highest_constant_number", "constants_array",
551 "constant_names_array",
552 "lowest_class_number", "highest_class_number", "class_objects_array",
553 "lowest_object_number", "highest_object_number",
555 "grammar_table", "dictionary_table", "dynam_string_table",
557 SYSTEM_CONSTANT_TT, FALSE, TRUE
560 keyword_group *keyword_groups[12]
561 = { NULL, &opcode_names, &directives, &trace_keywords, &segment_markers,
562 &directive_keywords, &misc_keywords, &statements, &conditions,
563 &system_functions, &system_constants, &opcode_macros};
565 keyword_group local_variables =
566 { { "" }, /* Filled in when routine declared */
567 LOCAL_VARIABLE_TT, FALSE, FALSE
/* Encode the lexer's current context as a bitmask: one bit per enabled
   keyword group, plus bit 2048 for return_sp_as_variable.  Tokens stored
   in the circle carry this value so they can be re-interpreted if the
   context has changed when they are read again.
   NOTE(review): the opening brace, the declaration/initialisation of c,
   and the return statement are not visible in this view of the file. */
570 static int lexical_context(void)
572 /* The lexical context is a number representing all of the context
573 information in the lexical analyser: the same input text will
574 always translate to the same output tokens whenever the context
577 In fact, for efficiency reasons this number omits the bit of
578 information held in the variable "dont_enter_into_symbol_table".
579 Inform never needs to backtrack through tokens parsed in that
580 way (thankfully, as it would be expensive indeed to check
584 if (opcode_names.enabled) c |= 1;
585 if (directives.enabled) c |= 2;
586 if (trace_keywords.enabled) c |= 4;
587 if (segment_markers.enabled) c |= 8;
588 if (directive_keywords.enabled) c |= 16;
589 if (misc_keywords.enabled) c |= 32;
590 if (statements.enabled) c |= 64;
591 if (conditions.enabled) c |= 128;
592 if (system_functions.enabled) c |= 256;
593 if (system_constants.enabled) c |= 512;
594 if (local_variables.enabled) c |= 1024;
595 /* Bit 2048: "sp" currently denotes the stack pointer (assembly mode). */
596 if (return_sp_as_variable) c |= 2048;
/* Debugging aid: print a short mnemonic for each context bit set in c
   (the bit assignments mirror those made in lexical_context() above).
   NOTE(review): the opening and closing braces are not visible in this
   view of the file. */
600 static void print_context(int c)
602 if ((c & 1) != 0) printf("OPC ");
603 if ((c & 2) != 0) printf("DIR ");
604 if ((c & 4) != 0) printf("TK ");
605 if ((c & 8) != 0) printf("SEG ");
606 if ((c & 16) != 0) printf("DK ");
607 if ((c & 32) != 0) printf("MK ");
608 if ((c & 64) != 0) printf("STA ");
609 if ((c & 128) != 0) printf("CND ");
610 if ((c & 256) != 0) printf("SFUN ");
611 if ((c & 512) != 0) printf("SCON ");
612 if ((c & 1024) != 0) printf("LV ");
613 if ((c & 2048) != 0) printf("sp ");
616 static int *keywords_hash_table;
617 static int *keywords_hash_ends_table;
618 static int *keywords_data_table;
620 static int *local_variable_hash_table;
621 static int *local_variable_hash_codes;
622 char **local_variable_texts;
623 static char *local_variable_text_table;
625 static char one_letter_locals[128];
/* Build the keyword hash tables.  First the opcode and opcode-macro
   keyword lists are selected (Z-machine vs Glulx spellings); then every
   keyword of groups 1..11 is inserted into a chained hash table:
   keywords_hash_table[h] holds the first entry index for hash h,
   keywords_hash_ends_table[h] the last, and keywords_data_table stores
   triples (group number, index within group, next-entry link or -1).
   NOTE(review): interior lines (the target-selection conditional, loop
   variable declarations, the increment of tp, and closing braces) are
   not visible in this view of the file. */
627 static void make_keywords_tables(void)
629 char **oplist, **maclist;
/* Z-machine target: use the Z-code opcode spellings. */
632 oplist = opcode_list_z;
633 maclist = opmacro_list_z;
/* Glulx target: use the Glulx opcode spellings. */
636 oplist = opcode_list_g;
637 maclist = opmacro_list_g;
/* Copy the chosen lists into the keyword groups, "" terminating each. */
640 for (j=0; *(oplist[j]); j++) {
641 opcode_names.keywords[j] = oplist[j];
643 opcode_names.keywords[j] = "";
645 for (j=0; *(maclist[j]); j++) {
646 opcode_macros.keywords[j] = maclist[j];
648 opcode_macros.keywords[j] = "";
/* Empty every hash chain (-1 marks "no entry"). */
650 for (i=0; i<HASH_TAB_SIZE; i++)
651 { keywords_hash_table[i] = -1;
652 keywords_hash_ends_table[i] = -1;
655 for (i=1; i<=11; i++)
656 { keyword_group *kg = keyword_groups[i];
657 for (j=0; *(kg->keywords[j]) != 0; j++)
658 { h = hash_code_from_string(kg->keywords[j]);
659 if (keywords_hash_table[h] == -1)
660 keywords_hash_table[h] = tp;
/* Otherwise link the previous chain tail to this new entry. */
662 *(keywords_data_table + 3*(keywords_hash_ends_table[h]) + 2) = tp;
663 keywords_hash_ends_table[h] = tp;
664 *(keywords_data_table + 3*tp) = i;
665 *(keywords_data_table + 3*tp+1) = j;
666 *(keywords_data_table + 3*tp+2) = -1;
/* Rebuild the lookup tables for the current routine's local variables:
   a hash table (first local with each hash code), per-local hash codes,
   a fast one_letter_locals[] shortcut for single-character names (both
   cases), and the local_variable_texts[] name array.
   NOTE(review): interior lines (a length test around original line 679,
   the advance of p, and closing braces) are not visible in this view of
   the file. */
672 extern void construct_local_variable_tables(void)
673 { int i, h; char *p = local_variable_text_table;
674 for (i=0; i<HASH_TAB_SIZE; i++) local_variable_hash_table[i] = -1;
/* MAX_LOCAL_VARIABLES in one_letter_locals[] means "no such local". */
675 for (i=0; i<128; i++) one_letter_locals[i] = MAX_LOCAL_VARIABLES;
677 for (i=0; i<no_locals; i++)
678 { char *q = local_variables.keywords[i];
/* Single-letter local: register it under both letter cases. */
680 { one_letter_locals[(uchar)q[0]] = i;
681 if (isupper(q[0])) one_letter_locals[tolower(q[0])] = i;
682 if (islower(q[0])) one_letter_locals[toupper(q[0])] = i;
684 h = hash_code_from_string(q);
685 if (local_variable_hash_table[h] == -1)
686 local_variable_hash_table[h] = i;
687 local_variable_hash_codes[i] = h;
688 local_variable_texts[i] = p;
/* Fill the unused slots with a recognisable placeholder string. */
692 for (;i<MAX_LOCAL_VARIABLES-1;i++)
693 local_variable_texts[i] = "<no such local variable>";
/* Classify the identifier text of circle[pos], setting its token type
   and value.  Search order: the "sp" pseudo-variable (assembly mode),
   then enabled local variables, then the keyword hash table, and finally
   the symbol table.  With dirs_only_flag set, only the directives
   keyword group is consulted and no symbol is created.
   NOTE(review): interior lines (several return statements, the
   KeywordSearch label, chain-following loop headers, and closing braces)
   are not visible in this view of the file. */
696 static void interpret_identifier(int pos, int dirs_only_flag)
697 { int index, hashcode; char *p = circle[pos].text;
699 /* An identifier is either a keyword or a "symbol", a name which the
700 lexical analyser leaves to higher levels of Inform to understand. */
702 hashcode = hash_code_from_string(p);
704 if (dirs_only_flag) goto KeywordSearch;
706 /* If this is assembly language, perhaps it is "sp"? */
708 if (return_sp_as_variable && (p[0]=='s') && (p[1]=='p') && (p[2]==0))
709 { circle[pos].value = 0; circle[pos].type = LOCAL_VARIABLE_TT;
713 /* Test for local variables first, quite quickly. */
715 if (local_variables.enabled)
/* One-character names go through the direct lookup array. */
717 { index = one_letter_locals[(uchar)p[0]];
718 if (index<MAX_LOCAL_VARIABLES)
719 { circle[pos].type = LOCAL_VARIABLE_TT;
720 circle[pos].value = index+1;
/* Longer names: scan the hash chain, comparing case-insensitively. */
724 index = local_variable_hash_table[hashcode];
726 { for (;index<no_locals;index++)
727 { if (hashcode == local_variable_hash_codes[index])
728 { if (strcmpcis(p, local_variable_texts[index])==0)
729 { circle[pos].type = LOCAL_VARIABLE_TT;
730 circle[pos].value = index+1;
738 /* Now the bulk of the keywords. Note that the lexer doesn't recognise
739 the name of a system function which has been Replaced. */
742 index = keywords_hash_table[hashcode];
744 { int *i = keywords_data_table + 3*index;
745 keyword_group *kg = keyword_groups[*i];
746 if (((!dirs_only_flag) && (kg->enabled))
747 || (dirs_only_flag && (kg == &directives)))
748 { char *q = kg->keywords[*(i+1)];
749 if (((kg->case_sensitive) && (strcmp(p, q)==0))
750 || ((!(kg->case_sensitive)) && (strcmpcis(p, q)==0)))
/* system_function_usage value 2 marks a Replaced system function,
   whose name must then be treated as an ordinary symbol. */
751 { if ((kg != &system_functions)
752 || (system_function_usage[*(i+1)]!=2))
753 { circle[pos].type = kg->change_token_type;
754 circle[pos].value = *(i+1);
762 if (dirs_only_flag) return;
764 /* Search for the name; create it if necessary. */
766 circle[pos].value = symbol_index(p, hashcode);
767 circle[pos].type = SYMBOL_TT;
771 /* ------------------------------------------------------------------------- */
772 /* The tokeniser grid aids a rapid decision about the consequences of a */
773 /* character reached in the buffer. In effect it is an efficiently stored */
774 /* transition table using an algorithm similar to that of S. C. Johnson's */
775 /* "yacc" lexical analyser (see Aho, Sethi and Ullman, section 3.9). */
776 /* My thanks to Dilip Sequeira for suggesting this. */
778 /* tokeniser_grid[c] is (16*n + m) if c is the first character of */
779 /* separator numbers n, n+1, ..., n+m-1 */
780 /* or certain special values (QUOTE_CODE, etc) */
783 /* Since 1000/16 = 62, the code numbers below will need increasing if the */
784 /* number of separators supported exceeds 61. */
785 /* ------------------------------------------------------------------------- */
787 static int tokeniser_grid[256];
789 #define QUOTE_CODE 1000
790 #define DQUOTE_CODE 1001
791 #define NULL_CODE 1002
792 #define SPACE_CODE 1003
793 #define NEGATIVE_CODE 1004
794 #define DIGIT_CODE 1005
795 #define RADIX_CODE 1006
796 #define KEYWORD_CODE 1007
797 #define EOF_CODE 1008
798 #define WHITESPACE_CODE 1009
799 #define COMMENT_CODE 1010
800 #define IDENTIFIER_CODE 1011
802 /* This list cannot safely be changed without also changing the header
803 separator #defines. The ordering is significant in that (i) all entries
804 beginning with the same character must be adjacent and (ii) that if
805 X is an initial substring of Y then X must come before Y.
807 E.g. --> must occur before -- to prevent "-->0" being tokenised
808 wrongly as "--", ">", "0" rather than "-->", "0". */
810 static const char separators[NUMBER_SEPARATORS][4] =
811 { "->", "-->", "--", "-", "++", "+", "*", "/", "%",
812 "||", "|", "&&", "&", "~~",
813 "~=", "~", "==", "=", ">=", ">",
814 "<=", "<", "(", ")", ",",
815 ".&", ".#", "..&", "..#", "..", ".",
816 "::", ":", "@", ";", "[", "]", "{", "}",
818 "#a$", "#g$", "#n$", "#r$", "#w$", "##", "#"
/* Populate tokeniser_grid[]: for each possible first character, either a
   packed separator code (16*n + m, per the comment block above), one of
   the special *_CODE values, or 0 for characters that cannot begin a
   token.
   NOTE(review): the opening brace, loop-variable declarations, and the
   closing brace are not visible in this view of the file. */
821 static void make_tokeniser_grid(void)
823 /* Construct the grid to the specification above. */
827 for (i=0; i<256; i++) tokeniser_grid[i]=0;
/* For each separator: first occurrence of its initial character stores
   i*16+1; later separators sharing that character bump the count m. */
829 for (i=0; i<NUMBER_SEPARATORS; i++)
830 { j=separators[i][0];
831 if (tokeniser_grid[j]==0)
832 tokeniser_grid[j]=i*16+1; else tokeniser_grid[j]++;
834 tokeniser_grid['\''] = QUOTE_CODE;
835 tokeniser_grid['\"'] = DQUOTE_CODE;
836 tokeniser_grid[0] = EOF_CODE;
837 tokeniser_grid[' '] = WHITESPACE_CODE;
838 tokeniser_grid['\n'] = WHITESPACE_CODE;
839 tokeniser_grid['$'] = RADIX_CODE;
840 tokeniser_grid['!'] = COMMENT_CODE;
842 tokeniser_grid['0'] = DIGIT_CODE;
843 tokeniser_grid['1'] = DIGIT_CODE;
844 tokeniser_grid['2'] = DIGIT_CODE;
845 tokeniser_grid['3'] = DIGIT_CODE;
846 tokeniser_grid['4'] = DIGIT_CODE;
847 tokeniser_grid['5'] = DIGIT_CODE;
848 tokeniser_grid['6'] = DIGIT_CODE;
849 tokeniser_grid['7'] = DIGIT_CODE;
850 tokeniser_grid['8'] = DIGIT_CODE;
851 tokeniser_grid['9'] = DIGIT_CODE;
/* Letters and underscore may begin identifiers (keywords or symbols). */
853 tokeniser_grid['a'] = IDENTIFIER_CODE;
854 tokeniser_grid['b'] = IDENTIFIER_CODE;
855 tokeniser_grid['c'] = IDENTIFIER_CODE;
856 tokeniser_grid['d'] = IDENTIFIER_CODE;
857 tokeniser_grid['e'] = IDENTIFIER_CODE;
858 tokeniser_grid['f'] = IDENTIFIER_CODE;
859 tokeniser_grid['g'] = IDENTIFIER_CODE;
860 tokeniser_grid['h'] = IDENTIFIER_CODE;
861 tokeniser_grid['i'] = IDENTIFIER_CODE;
862 tokeniser_grid['j'] = IDENTIFIER_CODE;
863 tokeniser_grid['k'] = IDENTIFIER_CODE;
864 tokeniser_grid['l'] = IDENTIFIER_CODE;
865 tokeniser_grid['m'] = IDENTIFIER_CODE;
866 tokeniser_grid['n'] = IDENTIFIER_CODE;
867 tokeniser_grid['o'] = IDENTIFIER_CODE;
868 tokeniser_grid['p'] = IDENTIFIER_CODE;
869 tokeniser_grid['q'] = IDENTIFIER_CODE;
870 tokeniser_grid['r'] = IDENTIFIER_CODE;
871 tokeniser_grid['s'] = IDENTIFIER_CODE;
872 tokeniser_grid['t'] = IDENTIFIER_CODE;
873 tokeniser_grid['u'] = IDENTIFIER_CODE;
874 tokeniser_grid['v'] = IDENTIFIER_CODE;
875 tokeniser_grid['w'] = IDENTIFIER_CODE;
876 tokeniser_grid['x'] = IDENTIFIER_CODE;
877 tokeniser_grid['y'] = IDENTIFIER_CODE;
878 tokeniser_grid['z'] = IDENTIFIER_CODE;
880 tokeniser_grid['A'] = IDENTIFIER_CODE;
881 tokeniser_grid['B'] = IDENTIFIER_CODE;
882 tokeniser_grid['C'] = IDENTIFIER_CODE;
883 tokeniser_grid['D'] = IDENTIFIER_CODE;
884 tokeniser_grid['E'] = IDENTIFIER_CODE;
885 tokeniser_grid['F'] = IDENTIFIER_CODE;
886 tokeniser_grid['G'] = IDENTIFIER_CODE;
887 tokeniser_grid['H'] = IDENTIFIER_CODE;
888 tokeniser_grid['I'] = IDENTIFIER_CODE;
889 tokeniser_grid['J'] = IDENTIFIER_CODE;
890 tokeniser_grid['K'] = IDENTIFIER_CODE;
891 tokeniser_grid['L'] = IDENTIFIER_CODE;
892 tokeniser_grid['M'] = IDENTIFIER_CODE;
893 tokeniser_grid['N'] = IDENTIFIER_CODE;
894 tokeniser_grid['O'] = IDENTIFIER_CODE;
895 tokeniser_grid['P'] = IDENTIFIER_CODE;
896 tokeniser_grid['Q'] = IDENTIFIER_CODE;
897 tokeniser_grid['R'] = IDENTIFIER_CODE;
898 tokeniser_grid['S'] = IDENTIFIER_CODE;
899 tokeniser_grid['T'] = IDENTIFIER_CODE;
900 tokeniser_grid['U'] = IDENTIFIER_CODE;
901 tokeniser_grid['V'] = IDENTIFIER_CODE;
902 tokeniser_grid['W'] = IDENTIFIER_CODE;
903 tokeniser_grid['X'] = IDENTIFIER_CODE;
904 tokeniser_grid['Y'] = IDENTIFIER_CODE;
905 tokeniser_grid['Z'] = IDENTIFIER_CODE;
907 tokeniser_grid['_'] = IDENTIFIER_CODE;
910 /* ------------------------------------------------------------------------- */
911 /* Definition of a lexical block: a source file or a string containing */
912 /* text for lexical analysis; an independent source from the point of */
913 /* view of issuing error reports. */
914 /* ------------------------------------------------------------------------- */
/* A LexicalBlock records where a stream of lexed text came from (file or
   internal string), so errors can be reported against the right place.
   NOTE(review): some field lines are absent from this excerpt of the file. */
916 typedef struct LexicalBlock_s
917 { char *filename; /* Full translated name */
918 int main_flag; /* TRUE if the main file
919 (the first one opened) */
920 int sys_flag; /* TRUE if a System_File */
921 int source_line; /* Line number count */
922 int line_start; /* Char number within file
923 where the current line
                                                          begins */
925 int chars_read; /* Char number of read pos */
926 int file_no; /* Or 255 if not from a
                                                          file at all */
929 char *orig_source; /* From #origsource direct */
/* Placeholder block used before any source file has been opened. */
935 static LexicalBlock NoFileOpen =
936 { "<before compilation>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
/* Placeholder block used after reading, while output is being generated. */
938 static LexicalBlock MakingOutput =
939 { "<constructing output>", FALSE, FALSE, 0, 0, 0, 255, NULL, 0, 0, 0 };
/* Block used when lexing from an internal string (veneer routines);
   note sys_flag is TRUE so veneer code is treated like a System_File. */
941 static LexicalBlock StringLB =
942 { "<veneer routine>", FALSE, TRUE, 0, 0, 0, 255, NULL, 0, 0, 0 };
944 static LexicalBlock *CurrentLB; /* The current lexical
945 block of input text */
/* Mark the lexical block currently being read as a System_File. */
947 extern void declare_systemfile(void)
948 { CurrentLB->sys_flag = TRUE;
/* Return 1 if the current lexical block is a System_File, else 0. */
951 extern int is_systemfile(void)
952 { return ((CurrentLB->sys_flag)?1:0);
/* Record an "#origsource" override on the current lexical block: errors will
   then be attributed to (source, line, charnum) rather than the literal input.
   A null/empty source clears the override.
   NOTE(review): the branching lines between the clear and set parts are
   missing from this excerpt — presumably "if (!source) { ... return; }". */
955 extern void set_origsource_location(char *source, int32 line, int32 charnum)
958 /* Clear the Origsource declaration. */
959 CurrentLB->orig_file = 0;
960 CurrentLB->orig_source = NULL;
961 CurrentLB->orig_line = 0;
962 CurrentLB->orig_char = 0;
966 /* Get the file number for a new or existing InputFiles entry. */
967 int file_no = register_orig_sourcefile(source);
969 CurrentLB->orig_file = file_no;
/* InputFiles is 1-based via file_no, hence the -1 here. */
970 CurrentLB->orig_source = InputFiles[file_no-1].filename;
971 CurrentLB->orig_line = line;
972 CurrentLB->orig_char = charnum;
975 /* Error locations. */
/* Build a debug_location describing the current read position, for the
   debugging-information file. Begin and end fields start out equal; the
   byte index is backed off by LOOKAHEAD_SIZE to account for the pipeline. */
977 extern debug_location get_current_debug_location(void)
978 { debug_location result;
979 /* Assume that all input characters are one byte. */
980 result.file_index = CurrentLB->file_no;
981 result.beginning_byte_index = CurrentLB->chars_read - LOOKAHEAD_SIZE;
982 result.end_byte_index = result.beginning_byte_index;
983 result.beginning_line_number = CurrentLB->source_line;
984 result.end_line_number = result.beginning_line_number;
985 result.beginning_character_number =
986 CurrentLB->chars_read - CurrentLB->line_start;
987 result.end_character_number = result.beginning_character_number;
988 result.orig_file_index = CurrentLB->orig_file;
989 result.orig_beg_line_number = CurrentLB->orig_line;
990 result.orig_beg_char_number = CurrentLB->orig_char;
/* Snapshot of the debug location at the time errors were last anchored. */
994 static debug_location ErrorReport_debug_location;
/* Copy the current lexical position into the global ErrorReport, so that
   any errors issued now are attributed to the line being lexed.
   file_no 255 means "not from a file" and is reported as -1. */
996 extern void report_errors_at_current_line(void)
997 { ErrorReport.line_number = CurrentLB->source_line;
998 ErrorReport.file_number = CurrentLB->file_no;
999 if (ErrorReport.file_number == 255)
1000 ErrorReport.file_number = -1;
1001 ErrorReport.source = CurrentLB->filename;
1002 ErrorReport.main_flag = CurrentLB->main_flag;
1003 if (debugfile_switch)
1004 ErrorReport_debug_location = get_current_debug_location();
1005 ErrorReport.orig_file = CurrentLB->orig_file;
1006 ErrorReport.orig_source = CurrentLB->orig_source;
1007 ErrorReport.orig_line = CurrentLB->orig_line;
1008 ErrorReport.orig_char = CurrentLB->orig_char;
/* Return the debug location snapshotted by report_errors_at_current_line(). */
1011 extern debug_location get_error_report_debug_location(void)
1012 { return ErrorReport_debug_location;
/* Return the character offset at which the current source line began. */
1015 extern int32 get_current_line_start(void)
1016 { return CurrentLB->line_start;
/* A brief_location meaning "no location"; initialised in init_lexer_vars(). */
1019 brief_location blank_brief_location;
/* Pack an ErrorPosition into the compact brief_location form.
   NOTE(review): the declaration of `loc` and the return are missing from
   this excerpt. */
1021 extern brief_location get_brief_location(ErrorPosition *errpos)
1024 loc.file_index = errpos->file_number;
1025 loc.line_number = errpos->line_number;
1026 loc.orig_file_index = errpos->orig_file;
1027 loc.orig_line_number = errpos->orig_line;
/* Unpack a brief_location back into an ErrorPosition. A file_index of -1
   (the blank location) leaves errpos untouched. main_flag is recovered by
   convention: file number 1 is the main source file. */
1031 extern void export_brief_location(brief_location loc, ErrorPosition *errpos)
1033 if (loc.file_index != -1)
1034 { errpos->file_number = loc.file_index;
1035 errpos->line_number = loc.line_number;
1036 errpos->main_flag = (errpos->file_number == 1);
1037 errpos->orig_source = NULL;
1038 errpos->orig_file = loc.orig_file_index;
1039 errpos->orig_line = loc.orig_line_number;
1040 errpos->orig_char = 0;
1044 /* ------------------------------------------------------------------------- */
1045 /* Hash printing and line counting */
1046 /* ------------------------------------------------------------------------- */
/* Emit one progress hash; prefixed with "::" the first time in a pass. */
1048 static void print_hash(void)
1050 /* Hash-printing is the practice of printing a # character every 100
1051 lines of source code (the -x switch), reassuring the user that
1052 progress is being made */
1054 if (no_hash_printed_yet)
1055 { printf("::"); no_hash_printed_yet = FALSE;
1057 printf("#"); hash_printed_since_newline = TRUE;
1060 /* On some systems, text output is buffered to a line at a time, and
1061 this would frustrate the point of hash-printing, so: */
/* Advance line-count bookkeeping when a '\n' is consumed: bumps the current
   block's line counters and the global total, and does periodic housekeeping
   every 100 lines (progress hash, cooperative multitasking yield).
   NOTE(review): lines L747-L751 belong to a fatal-bailout path (cleanup of
   temp files and transcript, then longjmp to g_fallback) whose guarding
   condition is missing from this excerpt — presumably a platform-specific
   #ifdef section. */
1067 static void reached_new_line(void)
1069 /* Called to signal that a new line has been reached in the source code */
1071 forerrors_pointer = 0;
1073 CurrentLB->source_line++;
1074 CurrentLB->line_start = CurrentLB->chars_read;
1076 total_source_line_count++;
1078 if (total_source_line_count%100==0)
1079 { if (hash_switch) print_hash();
1081 SpinCursor(32); /* I.e., allow other tasks to run */
1086 if (total_source_line_count%((**g_pm_hndl).linespercheck) == 0)
1087 { ProcessEvents (&g_proc);
1091 if (temporary_files_switch)
1092 remove_temp_files();
1094 my_free(&all_text,"transcription text");
1095 abort_transcript_file();
1096 longjmp (g_fallback, 1);
/* Begin a fresh syntax line for error reporting: reset the for-errors text
   buffer (when lexing from a string) and re-anchor ErrorReport. */
1102 static void new_syntax_line(void)
1103 { if (source_to_analyse != NULL) forerrors_pointer = 0;
1104 report_errors_at_current_line();
1107 /* Return 10 raised to the expo power.
1109 * I'm avoiding the standard pow() function for a rather lame reason:
1110 * it's in the libmath (-lm) library, and I don't want to change the
1111 * build model for the compiler. So, this is implemented with a stupid
1112 * lookup table. It's faster than pow() for small values of expo.
1113 * Probably not as fast if expo is 200, but "$+1e200" is an overflow
1114 * anyway, so I don't expect that to be a problem.
1116 * (For some reason, frexp() and ldexp(), which are used later on, do
1117 * not require libmath to be linked in.) */
/* Compute 10^expo via a small lookup table covering 10^-8..10^8, stepping
   by POW10_RANGE for exponents outside the table (see the comment above).
   NOTE(review): the declaration of `res` and the loop bodies are missing
   from this excerpt. */
1119 static double pow10_cheap(int expo)
1121 #define POW10_RANGE (8)
1122 static double powers[POW10_RANGE*2+1] = {
1123 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
1125 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0
/* Step expo into the table's range, scaling res by 10^(+-POW10_RANGE)
   each iteration. */
1131 for (; expo < -POW10_RANGE; expo += POW10_RANGE) {
1134 return res * powers[POW10_RANGE+expo];
1137 for (; expo > POW10_RANGE; expo -= POW10_RANGE) {
1138 res *= powers[POW10_RANGE*2];
1140 return res * powers[POW10_RANGE+expo];
1144 /* Return the IEEE-754 single-precision encoding of a floating-point
1145 * number. See http://www.psc.edu/general/software/packages/ieee/ieee.php
1146 * for an explanation.
1148 * The number is provided in the pieces it was parsed in:
1149 * [+|-] intv "." fracv "e" [+|-]expo
1151 * If the magnitude is too large (beyond about 3.4e+38), this returns
1152 * an infinite value (0x7f800000 or 0xff800000). If the magnitude is too
1153 * small (below about 1e-45), this returns a zero value (0x00000000 or
1154 * 0x80000000). If any of the inputs are NaN, this returns NaN (but the
1155 * lexer should never do that).
1157 * Note that using a float constant does *not* set the uses_float_features
1158 * flag (which would cause the game file to be labelled 3.1.2). There's
1159 * no VM feature here, just an integer. Of course, any use of the float
1160 * *opcodes* will set the flag.
1162 * The math functions in this routine require #including <math.h>, but
1163 * they should not require linking the math library (-lm). At least,
1164 they do not on OSX and Linux. */
/* Encode (-1)^signbit * (intv + fracv) * 10^expo as IEEE-754 single-precision
   bits: sign in bit 31, 8-bit biased exponent, 23-bit mantissa. Overflow
   yields +/-infinity; underflow yields a (possibly denormalized or zero)
   small value. See the long comment above for details.
   NOTE(review): several lines (declarations of mant/fbits, some exponent
   range checks and the bias adjustment) are missing from this excerpt. */
1166 static int32 construct_float(int signbit, double intv, double fracv, int expo)
1168 double absval = (intv + fracv) * pow10_cheap(expo);
1169 int32 sign = (signbit ? 0x80000000 : 0x0);
1173 if (isinf(absval)) {
1174 return sign | 0x7f800000; /* infinity */
1176 if (isnan(absval)) {
1177 return sign | 0x7fc00000;
/* frexp() yields mant in [0.5, 1.0) and the binary exponent. */
1180 mant = frexp(absval, &expo);
1182 /* Normalize mantissa to be in the range [1.0, 2.0) */
1183 if (0.5 <= mant && mant < 1.0) {
1187 else if (mant == 0.0) {
1191 return sign | 0x7f800000; /* infinity */
1195 return sign | 0x7f800000; /* infinity */
1197 else if (expo < -126) {
1198 /* Denormalized (very small) number */
1199 mant = ldexp(mant, 126 + expo);
1202 else if (!(expo == 0 && mant == 0.0)) {
1204 mant -= 1.0; /* Get rid of leading 1 */
1207 mant *= 8388608.0; /* 2^23 */
1208 fbits = (int32)(mant + 0.5); /* round mant to nearest int */
1210 /* The carry propagated out of a string of 23 1 bits. */
1214 return sign | 0x7f800000; /* infinity */
1218 return (sign) | ((int32)(expo << 23)) | (fbits);
1221 /* ------------------------------------------------------------------------- */
1222 /* Characters are read via a "pipeline" of variables, allowing us to look */
1223 /* up to three characters ahead of the current position. */
1225 /* There are two possible sources: from the source files being loaded in, */
1226 /* and from a string inside Inform (which is where the code for veneer */
1227 /* routines comes from). Each source has its own get-next-character */
1229 /* ------------------------------------------------------------------------- */
1230 /* Source 1: from files */
1232 /* Note that file_load_chars(p, size) loads "size" bytes into buffer "p" */
1233 /* from the current input file. If the file runs out, then if it was */
1234 /* the last source file 4 EOF characters are placed in the buffer: if it */
1235 /* was only an Include file ending, then a '\n' character is placed there */
1236 /* (essentially to force termination of any comment line) followed by */
1237 /* three harmless spaces. */
1239 /* The routine returns the number of characters it has written, and note */
1240 /* that this conveniently ensures that all characters in the buffer come */
1241 /* from the same file. */
1242 /* ------------------------------------------------------------------------- */
1244 #define SOURCE_BUFFER_SIZE 4096 /* Typical disc block size */
/* One open source file on the include stack: its input buffer plus the
   saved three-character lookahead pipeline.
   NOTE(review): the typedef's closing line is absent from this excerpt. */
1246 typedef struct Sourcefile_s
1247 { char *buffer; /* Input buffer */
1248 int read_pos; /* Read position in buffer */
1249 int size; /* Number of meaningful
1250 characters in buffer */
1251 int la, la2, la3; /* Three characters of
1252 lookahead pipeline */
1253 int file_no; /* Internal file number
                                                          (in InputFiles) */
1258 static Sourcefile *FileStack;
1259 static int File_sp; /* Stack pointer */
1261 static Sourcefile *CF; /* Top entry on stack */
/* Index of the most recent InputFiles entry pushed onto the stack. */
1263 static int last_input_file;
/* Push source file `file_no` onto the include stack at depth i: save the
   outer file's lookahead pipeline, load the first buffer-full, prime the
   three-character lookahead, initialise the file's LexicalBlock, and make
   it current. Warns when the same filename is already on the stack. */
1265 static void begin_buffering_file(int i, int file_no)
1266 { int j, cnt; uchar *p;
1268 if (i >= MAX_INCLUSION_DEPTH)
1269 memoryerror("MAX_INCLUSION_DEPTH",MAX_INCLUSION_DEPTH);
1271 p = (uchar *) FileStack[i].buffer;
/* Save the enclosing file's lookahead so it can resume later. */
1274 { FileStack[i-1].la = lookahead;
1275 FileStack[i-1].la2 = lookahead2;
1276 FileStack[i-1].la3 = lookahead3;
1279 FileStack[i].file_no = file_no;
1280 FileStack[i].size = file_load_chars(file_no,
1281 (char *) p, SOURCE_BUFFER_SIZE);
/* Prime the pipeline from the first three buffered characters. */
1282 lookahead = source_to_iso_grid[p[0]];
1283 lookahead2 = source_to_iso_grid[p[1]];
1284 lookahead3 = source_to_iso_grid[p[2]];
1285 if (LOOKAHEAD_SIZE != 3)
1287 ("Lexer lookahead size does not match hard-coded lookahead code");
1288 FileStack[i].read_pos = LOOKAHEAD_SIZE;
/* File number 1 is by convention the main source file. */
1290 if (file_no==1) FileStack[i].LB.main_flag = TRUE;
1291 else FileStack[i].LB.main_flag = FALSE;
1292 FileStack[i].LB.sys_flag = FALSE;
1293 FileStack[i].LB.source_line = 1;
1294 FileStack[i].LB.line_start = LOOKAHEAD_SIZE;
1295 FileStack[i].LB.chars_read = LOOKAHEAD_SIZE;
1296 FileStack[i].LB.filename = InputFiles[file_no-1].filename;
1297 FileStack[i].LB.file_no = file_no;
1298 FileStack[i].LB.orig_source = NULL; FileStack[i].LB.orig_file = 0;
1299 FileStack[i].LB.orig_line = 0; FileStack[i].LB.orig_char = 0;
1301 CurrentLB = &(FileStack[i].LB);
1302 CF = &(FileStack[i]);
1304 /* Check for recursive inclusion */
1307 { if (!strcmp(FileStack[i].LB.filename, FileStack[j].LB.filename))
1311 warning_named("File included more than once",
1312 FileStack[j].LB.filename);
/* Start the character pipeline on the main source file (file number 1). */
1315 static void create_char_pipeline(void)
1318 begin_buffering_file(File_sp++, 1);
1319 pipeline_made = TRUE;
1320 last_input_file = current_input_file;
/* Return the next source character (ISO-translated) from the file pipeline,
   shifting the three-character lookahead along by one. Handles newly-opened
   Include files, buffer refills, and popping the stack at end of file.
   NOTE(review): several structural lines (loop headers, pops, the final
   return) are absent from this excerpt. */
1323 static int get_next_char_from_pipeline(void)
1326 while (last_input_file < current_input_file)
1328 /* An "Include" file must have opened since the last character
1329 was read. Perhaps more than one. We run forward through the
1330 list and add them to the include stack. But we ignore
1331 "Origsource" files (which were never actually opened for
           reading). */
1335 if (!InputFiles[last_input_file-1].is_input)
1338 begin_buffering_file(File_sp++, last_input_file);
1340 if (last_input_file != current_input_file)
1341 compiler_error("last_input_file did not match after Include");
/* An empty pipeline yields NUL (end of input). */
1344 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1347 if (CF->read_pos == CF->size)
1349 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
/* A negated size appears to flag end-of-file on the top entry:
   pop back to the enclosing file and restore its lookahead. */
1353 if (CF->read_pos == -(CF->size))
1354 { set_token_location(get_current_debug_location());
1357 { lookahead = 0; lookahead2 = 0; lookahead3 = 0; return 0;
1359 CF = &(FileStack[File_sp-1]);
1360 CurrentLB = &(FileStack[File_sp-1].LB);
1361 lookahead = CF->la; lookahead2 = CF->la2; lookahead3 = CF->la3;
1362 if (CF->read_pos == CF->size)
1364 file_load_chars(CF->file_no, CF->buffer, SOURCE_BUFFER_SIZE);
1367 set_token_location(get_current_debug_location());
1370 p = (uchar *) (CF->buffer);
/* Shift the pipeline: current <- la <- la2 <- la3 <- next byte. */
1372 current = lookahead;
1373 lookahead = lookahead2;
1374 lookahead2 = lookahead3;
1375 lookahead3 = source_to_iso_grid[p[CF->read_pos++]];
1377 CurrentLB->chars_read++;
1378 if (forerrors_pointer < 511)
1379 forerrors_buff[forerrors_pointer++] = current;
1380 if (current == '\n') reached_new_line();
1384 /* ------------------------------------------------------------------------- */
1385 /* Source 2: from a string */
1386 /* ------------------------------------------------------------------------- */
1388 static int source_to_analyse_pointer; /* Current read position */
/* Return the next character when lexing from an internal string
   (source_to_analyse), refreshing the three-character lookahead. Lookahead
   slots past the terminating NUL are forced to 0 so the pipeline never reads
   beyond the end of the string. */
1390 static int get_next_char_from_string(void)
1391 { uchar *p = (uchar *) source_to_analyse + source_to_analyse_pointer++;
1392 current = source_to_iso_grid[p[0]];
1394 if (current == 0) lookahead = 0;
1395 else lookahead = source_to_iso_grid[p[1]];
1396 if (lookahead == 0) lookahead2 = 0;
1397 else lookahead2 = source_to_iso_grid[p[2]];
1398 if (lookahead2 == 0) lookahead3 = 0;
1399 else lookahead3 = source_to_iso_grid[p[3]];
1401 CurrentLB->chars_read++;
1402 if (forerrors_pointer < 511)
1403 forerrors_buff[forerrors_pointer++] = current;
1404 if (current == '\n') reached_new_line();
1408 /* ========================================================================= */
1409 /* The interface between the lexer and Inform's higher levels: */
1411 /* put_token_back() (effectively) move the read position */
1412 /* back by one token */
1414 /* get_next_token() copy the token at the current read */
1415 /* position into the triple */
1416 /* (token_type, token_value, token_text) */
1417 /* and move the read position forward */
1420 /* restart_lexer(source, name) if source is NULL, initialise the lexer */
1421 /* to read from source files; */
1422 /* otherwise, to read from this string. */
1423 /* ------------------------------------------------------------------------- */
/* Push the most recently read token back into the token circle, so the next
   get_next_token() returns it again. Putting back CIRCLE_SIZE tokens would
   overrun the circle, hence the internal error below. */
1425 extern void put_token_back(void)
1426 { tokens_put_back++;
1428 if (tokens_trace_level > 0)
1429 { if (tokens_trace_level == 1) printf("<- ");
1430 else printf("<-\n");
1433 /* The following error, of course, should never happen! */
1435 if (tokens_put_back == CIRCLE_SIZE)
1436 { compiler_error("The lexical analyser has collapsed because of a wrong \
1437 assumption inside Inform");
/* The main tokeniser: read characters and produce the next token into the
   circle, setting the globals (token_type, token_value, token_text).
   Serves put-back tokens first (re-interpreting identifiers when the
   lexical context changed); otherwise dispatches on the tokeniser grid to
   scan whitespace, comments, numbers (decimal, $hex, $$binary, and $+/-
   floats in Glulx), quoted strings, identifiers and separators.
   NOTE(review): many structural lines (switch header, loop bodies, labels
   such as StartTokenAgain/FloatNumber/SeparatorMatched, closing braces)
   are absent from this excerpt of the file. */
1443 extern void get_next_token(void)
1444 { int d, i, j, k, quoted_size, e, radix, context; int32 n; char *r;
1445 int returning_a_put_back_token = TRUE;
1447 context = lexical_context();
/* If tokens have been put back, return the oldest of them, re-lexing
   identifiers whose meaning may depend on the (changed) context. */
1449 if (tokens_put_back > 0)
1450 { i = circle_position - tokens_put_back + 1;
1451 if (i<0) i += CIRCLE_SIZE;
1453 if (context != token_contexts[i])
1454 { j = circle[i].type;
1455 if ((j==0) || ((j>=100) && (j<200)))
1456 interpret_identifier(i, FALSE);
1460 returning_a_put_back_token = FALSE;
/* Advance the circle and recycle lexeme memory when near its end. */
1462 if (circle_position == CIRCLE_SIZE-1) circle_position = 0;
1463 else circle_position++;
1465 if (lex_p > lexeme_memory + 4*MAX_QTEXT_SIZE)
1466 lex_p = lexeme_memory;
1468 circle[circle_position].text = lex_p;
1469 circle[circle_position].value = 0;
1473 d = (*get_next_char)();
1474 e = tokeniser_grid[d];
1476 if (next_token_begins_syntax_line)
1477 { if ((e != WHITESPACE_CODE) && (e != COMMENT_CODE))
1478 { new_syntax_line();
1479 next_token_begins_syntax_line = FALSE;
1483 circle[circle_position].location = get_current_debug_location();
1486 { case 0: char_error("Illegal character found in source:", d);
1487 goto StartTokenAgain;
1489 case WHITESPACE_CODE:
1490 while (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1492 goto StartTokenAgain;
/* Comment: skip to end of line. */
1495 while ((lookahead != '\n') && (lookahead != 0))
1497 goto StartTokenAgain;
1500 circle[circle_position].type = EOF_TT;
1501 strcpy(lex_p, "<end of file>");
1502 lex_p += strlen(lex_p) + 1;
/* Accumulate digits of a number in the current radix. */
1510 { n = n*radix + character_digit_value[d];
1512 } while ((character_digit_value[lookahead] < radix)
1513 && (d = (*get_next_char)(), TRUE));
1516 circle[circle_position].type = NUMBER_TT;
1517 circle[circle_position].value = n;
/* Floating-point literal: [+|-] digits [. digits] [e [+|-] digits]. */
1521 { int expo=0; double intv=0, fracv=0;
1522 int expocount=0, intcount=0, fraccount=0;
1523 int signbit = (d == '-');
1525 while (character_digit_value[lookahead] < 10) {
1526 intv = 10.0*intv + character_digit_value[lookahead];
1528 *lex_p++ = lookahead;
1531 if (lookahead == '.') {
1532 double fracpow = 1.0;
1533 *lex_p++ = lookahead;
1535 while (character_digit_value[lookahead] < 10) {
1537 fracv = fracv + fracpow*character_digit_value[lookahead];
1539 *lex_p++ = lookahead;
1543 if (lookahead == 'e' || lookahead == 'E') {
1545 *lex_p++ = lookahead;
1547 if (lookahead == '+' || lookahead == '-') {
1548 exposign = (lookahead == '-');
1549 *lex_p++ = lookahead;
1552 while (character_digit_value[lookahead] < 10) {
1553 expo = 10*expo + character_digit_value[lookahead];
1555 *lex_p++ = lookahead;
1559 error("Floating-point literal must have digits after the 'e'");
1560 if (exposign) { expo = -expo; }
1562 if (intcount + fraccount == 0)
1563 error("Floating-point literal must have digits");
1564 n = construct_float(signbit, intv, fracv, expo);
1567 circle[circle_position].type = NUMBER_TT;
1568 circle[circle_position].value = n;
1569 if (!glulx_mode && dont_enter_into_symbol_table != -2) error("Floating-point literals are not available in Z-code");
/* '$' radix prefix: $hex, $$binary, or $+/- introducing a float. */
1573 radix = 16; d = (*get_next_char)();
1574 if (d == '-' || d == '+') { goto FloatNumber; }
1575 if (d == '$') { d = (*get_next_char)(); radix = 2; }
1576 if (character_digit_value[d] >= radix)
1578 error("Binary number expected after '$$'");
1580 error("Hexadecimal number expected after '$'");
1584 case QUOTE_CODE: /* Single-quotes: scan a literal string */
1587 { e = d; d = (*get_next_char)(); *lex_p++ = d;
1588 if (quoted_size++==64)
1590 "Too much text for one pair of quotations '...' to hold");
/* A quote ends the literal unless escaped by a preceding '@'. */
1593 if ((d == '\'') && (e != '@'))
1594 { if (quoted_size == 1)
1595 { d = (*get_next_char)(); *lex_p++ = d;
1597 error("No text between quotation marks ''");
1602 if (d==EOF) ebf_error("'\''", "end of file");
1604 circle[circle_position].type = SQ_TT;
1607 case DQUOTE_CODE: /* Double-quotes: scan a literal string */
1610 { d = (*get_next_char)(); *lex_p++ = d;
1611 if (quoted_size++==MAX_QTEXT_SIZE)
1612 { memoryerror("MAX_QTEXT_SIZE", MAX_QTEXT_SIZE);
/* Newline in a string: collapse trailing spaces to a single space
   (unless the text ends with '^') and skip following whitespace. */
1617 while (*(lex_p-1) == ' ') lex_p--;
1618 if (*(lex_p-1) != '^') *lex_p++ = ' ';
1619 while ((lookahead != EOF) &&
1620 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
/* '\' in a string: require the rest of the line to be blank. */
1624 { int newline_passed = FALSE;
1626 while ((lookahead != EOF) &&
1627 (tokeniser_grid[lookahead] == WHITESPACE_CODE))
1628 if ((d = (*get_next_char)()) == '\n')
1629 newline_passed = TRUE;
1630 if (!newline_passed)
1632 chb[0] = '\"'; chb[1] = lookahead;
1633 chb[2] = '\"'; chb[3] = 0;
1634 ebf_error("empty rest of line after '\\' in string",
1638 } while ((d != EOF) && (d!='\"'));
1639 if (d==EOF) ebf_error("'\"'", "end of file");
1641 circle[circle_position].type = DQ_TT;
1644 case IDENTIFIER_CODE: /* Letter or underscore: an identifier */
1647 while ((n<=MAX_IDENTIFIER_LENGTH)
1648 && ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1649 || (tokeniser_grid[lookahead] == DIGIT_CODE)))
1650 n++, *lex_p++ = (*get_next_char)();
1654 if (n > MAX_IDENTIFIER_LENGTH)
1655 { char bad_length[100];
1657 "Name exceeds the maximum length of %d characters:",
1658 MAX_IDENTIFIER_LENGTH);
1659 error_named(bad_length, circle[circle_position].text);
1660 /* Trim token so that it doesn't violate
1661 MAX_IDENTIFIER_LENGTH during error recovery */
1662 circle[circle_position].text[MAX_IDENTIFIER_LENGTH] = 0;
/* dont_enter_into_symbol_table: return the name as text (DQ_TT);
   -2 additionally looks it up in the keyword table only. */
1665 if (dont_enter_into_symbol_table)
1666 { circle[circle_position].type = DQ_TT;
1667 circle[circle_position].value = 0;
1668 if (dont_enter_into_symbol_table == -2)
1669 interpret_identifier(circle_position, TRUE);
1673 interpret_identifier(circle_position, FALSE);
1678 /* The character is initial to at least one of the separators */
/* Grid entry e packs start index (high nibble) and count (low nibble)
   of candidate separators; try 1-, 2- and 3-character matches. */
1680 for (j=e>>4, k=j+(e&0x0f); j<k; j++)
1681 { r = (char *) separators[j];
1683 { *lex_p++=d; *lex_p++=0;
1684 goto SeparatorMatched;
1688 { if (*(r+1) == lookahead)
1690 *lex_p++=(*get_next_char)();
1692 goto SeparatorMatched;
1696 { if ((*(r+1) == lookahead) && (*(r+2) == lookahead2))
1698 *lex_p++=(*get_next_char)();
1699 *lex_p++=(*get_next_char)();
1701 goto SeparatorMatched;
1706 /* The following contingency never in fact arises with the
1707 current set of separators, but might in future */
1709 *lex_p++ = d; *lex_p++ = lookahead; *lex_p++ = lookahead2;
1711 error_named("Unrecognised combination in source:", lex_p);
1712 goto StartTokenAgain;
1716 circle[circle_position].type = SEP_TT;
1717 circle[circle_position].value = j;
/* The #n$, #w$, #a$, #g$, #r$ separators absorb a following name. */
1719 { case SEMICOLON_SEP: break;
1720 case HASHNDOLLAR_SEP:
1721 case HASHWDOLLAR_SEP:
1722 if (tokeniser_grid[lookahead] == WHITESPACE_CODE)
1723 { error_named("Character expected after",
1724 circle[circle_position].text);
1728 *lex_p++ = (*get_next_char)();
1729 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1730 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1731 *lex_p++ = (*get_next_char)();
1734 case HASHADOLLAR_SEP:
1735 case HASHGDOLLAR_SEP:
1736 case HASHRDOLLAR_SEP:
1738 if (tokeniser_grid[lookahead] != IDENTIFIER_CODE)
1739 { error_named("Alphabetic character expected after",
1740 circle[circle_position].text);
1744 while ((tokeniser_grid[lookahead] == IDENTIFIER_CODE)
1745 || (tokeniser_grid[lookahead] == DIGIT_CODE))
1746 *lex_p++ = (*get_next_char)();
/* Publish the finished token through the three globals. */
1753 i = circle_position;
1756 token_value = circle[i].value;
1757 token_type = circle[i].type;
1758 token_text = circle[i].text;
1759 if (!returning_a_put_back_token)
1760 { set_token_location(circle[i].location);
1762 token_contexts[i] = context;
1764 if (tokens_trace_level > 0)
1765 { if (tokens_trace_level == 1)
1766 printf("'%s' ", circle[i].text);
1768 { printf("-> "); describe_token(circle[i]);
1770 if (tokens_trace_level > 2) print_context(token_contexts[i]);
/* Buffer for the synthetic filename shown in errors inside veneer code. */
1776 static char veneer_error_title[64];
/* Reset the lexer. With lexical_source == NULL, read from the source-file
   pipeline; otherwise read from the given string (a veneer routine named
   `name`). Clears the token circle and all per-pass lexer state. */
1778 extern void restart_lexer(char *lexical_source, char *name)
1780 circle_position = 0;
1781 for (i=0; i<CIRCLE_SIZE; i++)
1782 { circle[i].type = 0;
1783 circle[i].value = 0;
1784 circle[i].text = "(if this is ever visible, there is a bug)";
1785 token_contexts[i] = 0;
1788 lex_p = lexeme_memory;
1789 tokens_put_back = 0;
1790 forerrors_pointer = 0;
1791 dont_enter_into_symbol_table = FALSE;
1792 return_sp_as_variable = FALSE;
1793 next_token_begins_syntax_line = TRUE;
1795 source_to_analyse = lexical_source;
1797 if (source_to_analyse == NULL)
1798 { get_next_char = get_next_char_from_pipeline;
1799 if (!pipeline_made) create_char_pipeline();
1800 forerrors_buff[0] = 0; forerrors_pointer = 0;
1803 { get_next_char = get_next_char_from_string;
1804 source_to_analyse_pointer = 0;
1805 CurrentLB = &StringLB;
1806 sprintf(veneer_error_title, "<veneer routine '%s'>", name);
1807 StringLB.filename = veneer_error_title;
1809 CurrentLB->source_line = 1;
1810 CurrentLB->line_start = 0;
1811 CurrentLB->chars_read = 0;
1815 /* ========================================================================= */
1816 /* Data structure management routines */
1817 /* ------------------------------------------------------------------------- */
/* Initialise lexer globals; in particular the "blank" brief location,
   whose file_index of -1 means "no location" (see export_brief_location). */
1819 extern void init_lexer_vars(void)
1821 blank_brief_location.file_index = -1;
1822 blank_brief_location.line_number = 0;
1823 blank_brief_location.orig_file_index = 0;
1824 blank_brief_location.orig_line_number = 0;
/* Reset line counting before the prepass; errors report "<before
   compilation>" until a real file is opened. */
1827 extern void lexer_begin_prepass(void)
1828 { total_source_line_count = 0;
1829 CurrentLB = &NoFileOpen;
1830 report_errors_at_current_line();
/* Reset per-pass state (hash-printing flags, pipeline) and restart the
   lexer on the source files. */
1833 extern void lexer_begin_pass(void)
1834 { no_hash_printed_yet = TRUE;
1835 hash_printed_since_newline = FALSE;
1837 pipeline_made = FALSE;
1839 restart_lexer(NULL, NULL);
/* After reading finishes, attribute further errors to "<constructing
   output>" rather than to any source line. */
1842 extern void lexer_endpass(void)
1843 { CurrentLB = &MakingOutput;
1844 report_errors_at_current_line();
/* Allocate all lexer working storage: the include stack and its buffers
   (+4 bytes for the lookahead overrun), lexeme memory, keyword and
   local-variable hash tables, the tokeniser grid and keyword tables, and
   the first debug-location record (zero-initialised). Freed again by
   lexer_free_arrays(). */
1847 extern void lexer_allocate_arrays(void)
1850 FileStack = my_malloc(MAX_INCLUSION_DEPTH*sizeof(Sourcefile),
1851 "filestack buffer");
1853 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1854 FileStack[i].buffer = my_malloc(SOURCE_BUFFER_SIZE+4, "source file buffer");
1856 lexeme_memory = my_malloc(5*MAX_QTEXT_SIZE, "lexeme memory");
1858 keywords_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1859 "keyword hash table");
1860 keywords_hash_ends_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1861 "keyword hash end table");
1862 keywords_data_table = my_calloc(sizeof(int), 3*MAX_KEYWORDS,
1863 "keyword hashing linked list");
1864 local_variable_hash_table = my_calloc(sizeof(int), HASH_TAB_SIZE,
1865 "local variable hash table");
1866 local_variable_text_table = my_malloc(
1867 (MAX_LOCAL_VARIABLES-1)*(MAX_IDENTIFIER_LENGTH+1),
1868 "text of local variable names");
1870 local_variable_hash_codes = my_calloc(sizeof(int), MAX_LOCAL_VARIABLES,
1871 "local variable hash codes");
1872 local_variable_texts = my_calloc(sizeof(char *), MAX_LOCAL_VARIABLES,
1873 "local variable text pointers");
1875 make_tokeniser_grid();
1876 make_keywords_tables();
1878 first_token_locations =
1879 my_malloc(sizeof(debug_locations), "debug locations of recent tokens");
1880 first_token_locations->location.file_index = 0;
1881 first_token_locations->location.beginning_byte_index = 0;
1882 first_token_locations->location.end_byte_index = 0;
1883 first_token_locations->location.beginning_line_number = 0;
1884 first_token_locations->location.end_line_number = 0;
1885 first_token_locations->location.beginning_character_number = 0;
1886 first_token_locations->location.end_character_number = 0;
1887 first_token_locations->location.orig_file_index = 0;
1888 first_token_locations->location.orig_beg_line_number = 0;
1889 first_token_locations->location.orig_beg_char_number = 0;
1890 first_token_locations->next = NULL;
1891 first_token_locations->reference_count = 0;
1892 last_token_location = first_token_locations;
/* Release everything allocated by lexer_allocate_arrays(), in matching
   order, and drop all retained token locations. */
1895 extern void lexer_free_arrays(void)
1898 for (i=0; i<MAX_INCLUSION_DEPTH; i++)
1899 { p = FileStack[i].buffer;
1900 my_free(&p, "source file buffer");
1902 my_free(&FileStack, "filestack buffer");
1903 my_free(&lexeme_memory, "lexeme memory");
1905 my_free(&keywords_hash_table, "keyword hash table");
1906 my_free(&keywords_hash_ends_table, "keyword hash end table");
1907 my_free(&keywords_data_table, "keyword hashing linked list");
1908 my_free(&local_variable_hash_table, "local variable hash table");
1909 my_free(&local_variable_text_table, "text of local variable names");
1911 my_free(&local_variable_hash_codes, "local variable hash codes");
1912 my_free(&local_variable_texts, "local variable text pointers");
1914 cleanup_token_locations(NULL);
1917 /* ========================================================================= */