1 /* ------------------------------------------------------------------------- */
2 /* "chars" : Character set mappings and the Z-machine alphabet table */
4 /* Part of Inform 6.35 */
5 /* copyright (c) Graham Nelson 1993 - 2020 */
7 /* Inform is free software: you can redistribute it and/or modify */
8 /* it under the terms of the GNU General Public License as published by */
9 /* the Free Software Foundation, either version 3 of the License, or */
10 /* (at your option) any later version. */
12 /* Inform is distributed in the hope that it will be useful, */
13 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
14 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
15 /* GNU General Public License for more details. */
17 /* You should have received a copy of the GNU General Public License */
18 /* along with Inform. If not, see https://gnu.org/licenses/ */
20 /* ------------------------------------------------------------------------- */
21 /* Inform uses six different character representations: */
23 /* ASCII plain ASCII characters in range $20 to $7e */
24 /* (unsigned 7-bit number) */
25 /* Source raw bytes from source code */
26 /* (unsigned 8-bit number) */
27 /* ISO plain ASCII or ISO 8859-1 to -9, according to value */
28 /* character_set_setting == 0 or 1 to 9 */
/* in Unicode mode (character_set_unicode), individual */
/* UTF-8 bytes */
31 /* (unsigned 8-bit number) */
32 /* ZSCII the Z-machine's character set */
33 /* (unsigned 10-bit number) */
34 /* textual such as the text @'e to mean e-acute */
35 /* or @$03a3 to mean capital Greek sigma */
36 /* in Unicode mode, the operations manipulating multibyte */
37 /* UCS representations are included in text routines */
38 /* (sequence of ASCII characters) */
39 /* Unicode a unifying character set holding all possible characters */
40 /* Inform can ever deal with */
41 /* (unsigned 16-bit number) */
43 /* Conversion can always be made down this list, but generally not up. */
/* Note that all ASCII values are the same in any version of ISO */
/* and in Unicode. */
47 /* There is a seventh form: sequences of 5-bit "Z-chars" which encode */
48 /* ZSCII into the story file in compressed form. Conversion of ZSCII to */
/* and from Z-char sequences, although it uses the alphabet table, is done */
/* in the text routines. */
51 /* ------------------------------------------------------------------------- */
52 /* The main data structures need to be modified in mid-compilation, but */
53 /* several of them depend on each other, and must remain consistent; */
54 /* and rebuilding one sometimes uses conversion routines depending on */
55 /* information held in the others: */
57 /* Structure If changed, need to rebuild: */
58 /* character_set_setting source_to_iso_grid[] */
59 /* zscii_to_unicode_grid[] */
60 /* zscii_to_iso_grid[] */
61 /* iso_to_unicode_grid[] */
62 /* alphabet[][] iso_to_alphabet_grid[] */
63 /* zscii_to_alphabet_grid[] */
64 /* zscii_to_unicode_grid[] iso_to_alphabet_grid[] */
65 /* source_to_iso_grid[] <nothing> */
66 /* iso_to_alphabet_grid[] <nothing> */
67 /* zscii_to_alphabet_grid[] <nothing> */
68 /* zscii_to_iso_grid[] <nothing> */
70 /* (zscii_to_iso_grid[] is made whenever iso_to_alphabet_grid[] is */
71 /* made but does not depend on alphabet[].) */
73 /* Conversion routine Makes use of: */
74 /* iso_to_unicode character_set_setting */
75 /* unicode_to_zscii character_set_setting */
76 /* zscii_to_unicode_grid[] */
77 /* zscii_to_unicode character_set_setting */
78 /* zscii_to_unicode_grid[] */
79 /* text_to_unicode <nothing> */
80 /* zscii_to_text character_set_setting */
81 /* zscii_to_unicode_grid[] */
82 /* zscii_to_iso_grid[] */
84 /* For example, if we want to change alphabet[][] then we can safely */
85 /* use any of the conversion routines while working on the change, but */
86 /* must rebuild the iso_to_alphabet_grid[] before allowing Inform to */
87 /* continue compiling. */
88 /* ------------------------------------------------------------------------- */
92 uchar source_to_iso_grid[0x100]; /* Filters source code into legal ISO */
94 int32 iso_to_unicode_grid[0x100]; /* Filters ISO into Unicode */
96 int character_digit_value[128]; /* Parsing of binary, decimal and hex */
98 static char *accents = /* Standard 0.2 stock of accented... */
100 ":a:o:u:A:O:Uss>><<:e:i:y:E:I'a'e'i'o'u'y'A'E'I'O'U'Y`a`e`i`o`u\
101 `A`E`I`O`U^a^e^i^o^u^A^E^I^O^UoaoA/o/O~a~n~o~A~N~OaeAEcccCthetThEtLLoeOE!!??";
103 /* ...characters, numbered upwards */
106 /* ------------------------------------------------------------------------- */
108 uchar alphabet[3][27]; /* The alphabet table. */
110 int alphabet_modified; /* Has the default been changed? */
112 char alphabet_used[78]; /* Flags (holding 'N' or 'Y') for
113 which of the Z-alphabet letters
114 have actually been encrypted */
116 /* ------------------------------------------------------------------------- */
int iso_to_alphabet_grid[0x100];

/* This array combines two conversion processes which have to run quickly:
   an ISO character n is being converted for text purposes into a stream
   of Z-chars (anything from 1 up to 8 of these).  Unicode but non-ISO
   characters are also converted from text, but far less often, and
   different (and slower) methods are used to carry this out.

   iso_to_alphabet_grid[n]
       =  i   if the character exists in ZSCII and is located at
              position i in the Z-machine alphabet (where 0 to 25
              give positions in A0, 26 to 51 in A1 and 52 to 77 in A2);

          -z  if the character exists in ZSCII as value z, but is not
              located anywhere in the Z-machine alphabet;

          -5  if the character does not exist in ZSCII.  (It will still
              be printable using an 8-Z-char sequence to encode it in
              Unicode form, but there's no ZSCII form.)

   Note that ISO tilde ~ is interpreted as ZSCII double-quote ",
   and ISO circumflex ^ is interpreted as ZSCII new-line, in accordance
   with the Inform syntax for strings.  This is automatic from the
   structure of alphabet[][]:

       alphabet[i][j] = the ZSCII code of letter j (0 to 25)
                            in alphabet i (0 to 2)

                        except that

       alphabet[2][0] is ignored by the Z-machine and Inform
                            (char 0 in A2 is an escape)
       alphabet[2][1] is ignored by the Z-machine
                            (char 1 in A2 means new-line)
                            but used by Inform to hold ISO circumflex
                            so that ^ is translated as new-line
       alphabet[2][19] is used by Inform to hold ISO tilde
                            so that ~ is translated as ": after
                            compilation, when the alphabet table is
                            written into the Z-machine, this entry
                            is changed back to ".

   Note that the alphabet can only hold ZSCII values between 0 and 255.

   The array is dimensioned as [3][27], not [3][26], to make it easier to
   initialise using strcpy (see below), but the zero entries [x][26] are
   not otherwise used.                                                       */

int zscii_to_alphabet_grid[0x100];

/* The same, except that the index is a ZSCII character, not an ISO one.     */

int zscii_to_iso_grid[0x100];         /* Converts ZSCII between 0 and 255 to
                                         codes in current ISO set: or to 0 if
                                         code isn't in the current ISO set.  */
174 static void make_iso_to_alphabet_grid(void)
175 { int i, j, k; int z;
177 for (j=0; j<0x100; j++)
178 { zscii_to_iso_grid[j] = 0;
179 zscii_to_alphabet_grid[j] = -j;
182 for (j=0; j<0x100; j++)
183 { iso_to_alphabet_grid[j]=-5;
184 if ((j >= 0x20) && (j <= 0x7e))
185 { iso_to_alphabet_grid[j] = -j;
186 zscii_to_iso_grid[j] = j;
188 if ((j >= 0xa1) && (j <= 0xff))
189 { z = unicode_to_zscii(iso_to_unicode(j));
190 if (character_set_setting != 0)
191 zscii_to_iso_grid[z] = j;
192 iso_to_alphabet_grid[j] = -z;
194 iso_to_unicode_grid[j] = iso_to_unicode(j);
197 for (k=(j<2?0:1); k<26; k++)
198 { i=(int) ((alphabet[j])[k]);
199 zscii_to_alphabet_grid[i] = k + j*26;
200 iso_to_alphabet_grid[zscii_to_iso_grid[i]] = k + j*26;
204 extern void map_new_zchar(int32 unicode)
205 { /* Attempts to enter the given Unicode character into the "alphabet[]"
206 array, in place of one which has not so far been used in the
207 compilation of the current file. This may of course fail. */
211 zscii = unicode_to_zscii(unicode);
213 /* Out of ZSCII range? */
214 if ((zscii == 5) || (zscii >= 0x100))
215 { unicode_char_error(
216 "Character must first be entered into Zcharacter table:", unicode);
221 for (i=0;i<3;i++) for (j=0;j<26;j++)
222 if (alphabet[i][j] == zscii) return;
224 /* A0 and A1 are never changed. Try to find a place in alphabet A2:
226 xx0123456789.,!?_#'~/\-:()
227 ^^^^^^^^^^ ^^^^^ ^^^^^^
229 The letters marked ^ are considered to be replaceable, as long as
230 they haven't yet been used in any text already encoded, and haven't
231 already been replaced. The routine works along from the left, since
232 numerals are more of a luxury than punctuation. */
235 { if ((i == 12) || (i == 13) || (i == 19)) continue;
236 if (alphabet_used[52+i] == 'N')
237 { alphabet_used[52+i] = 'Y';
238 alphabet[2][i] = zscii;
239 alphabet_modified = TRUE;
240 make_iso_to_alphabet_grid();
246 extern void new_alphabet(char *text, int which_alph)
248 /* Called three times in succession, with which_alph = 0, 1, 2 */
250 int i, j, zscii; int32 unicode;
252 alphabet_modified = TRUE;
255 { i=3; alphabet[2][2] = '~';
260 { if (text[j] == 0) goto WrongSizeError;
262 unicode = text_to_unicode(text+j);
263 j += textual_form_length;
265 zscii = unicode_to_zscii(unicode);
266 if ((zscii == 5) || (zscii >= 0x100))
267 unicode_char_error("Character can't be used in alphabets unless \
268 entered into Zcharacter table", unicode);
269 else alphabet[which_alph][i] = zscii;
275 error("Alphabet string must give exactly 23 characters");
277 error("Alphabet string must give exactly 26 characters");
281 { int test_dups[0x100];
282 for (i=0; i<0x100; i++) test_dups[i] = 0;
283 for (i=0; i<3; i++) for (j=0; j<26; j++)
284 { if (test_dups[alphabet[i][j]]++ == 1)
285 unicode_char_error("Character duplicated in alphabet:",
286 zscii_to_unicode(alphabet[i][j]));
289 make_iso_to_alphabet_grid();
293 static void read_source_to_iso_file(uchar *uccg)
294 { FILE *charset_file;
299 charset_file=fopen(Charset_Map, "r");
300 if (charset_file==NULL)
301 fatalerror_named("Couldn't open character set mapping", Charset_Map);
303 while (feof(charset_file)==0)
304 { if (fgets(cs_buff,256,charset_file)==0) break;
307 { case '!': /* Ignore comments in file */
309 case 'C': /* Set character set */
310 character_set_setting = cs_buff[1]-'0';
311 if ((character_set_setting < 0) || (character_set_setting > 9))
312 { fatalerror_named("Character set in mapping must be 0 to 9",
318 while ((i<256) && (p!=NULL))
320 uccg[i++] = (uchar)atoi(p);
328 fclose(charset_file);
331 /* ========================================================================= */
332 /* Conversion functions (without side effects) */
333 /* ------------------------------------------------------------------------- */
334 /* (1) Source -> ISO */
336 /* 00 remains 0 (meaning "end of file") */
337 /* TAB becomes SPACE */
338 /* 0c ("form feed") becomes '\n' */
339 /* 0d becomes '\n' */
/* other control characters become '?' */
/* 7f becomes '?' */
342 /* 80 to 9f become '?' */
343 /* a0 (ISO "non-breaking space") becomes SPACE */
344 /* ad (ISO "soft hyphen") becomes '-' */
345 /* any character undefined in ISO is mapped to '?' */
346 /* In Unicode mode, characters 80 and upwards are preserved. */
348 /* ------------------------------------------------------------------------- */
350 static void make_source_to_iso_grid(void)
351 { int n; uchar *uccg = (uchar *) source_to_iso_grid;
353 for (n=0; n<0x100; n++) uccg[n] = (char) n;
355 if (Charset_Map[0] != '\0')
356 read_source_to_iso_file(uccg);
358 { source_to_iso_grid[0] = (char) 0;
359 for (n=1; n<32; n++) source_to_iso_grid[n] = '?';
360 source_to_iso_grid[10] = '\n';
361 source_to_iso_grid[12] = '\n';
362 source_to_iso_grid[13] = '\n';
363 source_to_iso_grid[127] = '?';
364 source_to_iso_grid[TAB_CHARACTER] = ' ';
366 if (character_set_unicode) /* No need to meddle with 8-bit for UTF-8 */
369 for (n=0x80; n<0xa0; n++) source_to_iso_grid[n] = '?';
370 source_to_iso_grid[0xa0] = ' ';
371 source_to_iso_grid[0xad] = '-';
373 switch(character_set_setting)
375 for (n=0xa0; n<0x100; n++)
376 source_to_iso_grid[n] = '?';
379 for (n=0xa0; n<0xc1; n++)
381 { case 0xa0: case 0xa4: case 0xac: case 0xad:
382 case 0xbb: case 0xbf: break;
383 default: source_to_iso_grid[n] = '?';
385 for (n=0xdb; n<0xe0; n++)
386 source_to_iso_grid[n] = '?';
387 for (n=0xf3; n<0x100; n++)
388 source_to_iso_grid[n] = '?';
391 source_to_iso_grid[0xa4] = '?';
392 source_to_iso_grid[0xa5] = '?';
393 source_to_iso_grid[0xaa] = '?';
394 source_to_iso_grid[0xae] = '?';
395 source_to_iso_grid[0xd2] = '?';
396 source_to_iso_grid[0xff] = '?';
399 source_to_iso_grid[0xa1] = '?';
400 for (n=0xbf; n<0xdf; n++)
401 source_to_iso_grid[n] = '?';
402 for (n=0xfb; n<0x100; n++)
403 source_to_iso_grid[n] = '?';
409 /* ------------------------------------------------------------------------- */
410 /* (2) ISO -> Unicode */
412 /* Need not be rapid, as the results are mostly cached. */
413 /* Always succeeds. */
414 /* ------------------------------------------------------------------------- */
416 extern int iso_to_unicode(int iso)
418 switch(character_set_setting)
421 case 0: /* Plain ASCII only */
424 case 1: /* ISO 8859-1: Latin1: west European */
427 case 2: /* ISO 8859-2: Latin2: central European */
430 { case 0xA1: u=0x0104; break; /* LATIN CAP A WITH OGONEK */
431 case 0xA2: u=0x02D8; break; /* BREVE */
432 case 0xA3: u=0x0141; break; /* LATIN CAP L WITH STROKE */
433 case 0xA5: u=0x013D; break; /* LATIN CAP L WITH CARON */
434 case 0xA6: u=0x015A; break; /* LATIN CAP S WITH ACUTE */
435 case 0xA9: u=0x0160; break; /* LATIN CAP S WITH CARON */
436 case 0xAA: u=0x015E; break; /* LATIN CAP S WITH CEDILLA */
437 case 0xAB: u=0x0164; break; /* LATIN CAP T WITH CARON */
438 case 0xAC: u=0x0179; break; /* LATIN CAP Z WITH ACUTE */
439 case 0xAE: u=0x017D; break; /* LATIN CAP Z WITH CARON */
440 case 0xAF: u=0x017B; break; /* LATIN CAP Z WITH DOT ABOVE */
441 case 0xB1: u=0x0105; break; /* LATIN SMALL A WITH OGONEK */
442 case 0xB2: u=0x02DB; break; /* OGONEK */
443 case 0xB3: u=0x0142; break; /* LATIN SMALL L WITH STROKE */
444 case 0xB5: u=0x013E; break; /* LATIN SMALL L WITH CARON */
445 case 0xB6: u=0x015B; break; /* LATIN SMALL S WITH ACUTE */
446 case 0xB7: u=0x02C7; break; /* CARON */
447 case 0xB9: u=0x0161; break; /* LATIN SMALL S WITH CARON */
448 case 0xBA: u=0x015F; break; /* LATIN SMALL S WITH CEDILLA */
449 case 0xBB: u=0x0165; break; /* LATIN SMALL T WITH CARON */
450 case 0xBC: u=0x017A; break; /* LATIN SMALL Z WITH ACUTE */
451 case 0xBD: u=0x02DD; break; /* DOUBLE ACUTE ACCENT */
452 case 0xBE: u=0x017E; break; /* LATIN SMALL Z WITH CARON */
453 case 0xBF: u=0x017C; break; /* LATIN SMALL Z WITH DOT ABOVE */
454 case 0xC0: u=0x0154; break; /* LATIN CAP R WITH ACUTE */
455 case 0xC3: u=0x0102; break; /* LATIN CAP A WITH BREVE */
456 case 0xC5: u=0x0139; break; /* LATIN CAP L WITH ACUTE */
457 case 0xC6: u=0x0106; break; /* LATIN CAP C WITH ACUTE */
458 case 0xC8: u=0x010C; break; /* LATIN CAP C WITH CARON */
459 case 0xCA: u=0x0118; break; /* LATIN CAP E WITH OGONEK */
460 case 0xCC: u=0x011A; break; /* LATIN CAP E WITH CARON */
461 case 0xCF: u=0x010E; break; /* LATIN CAP D WITH CARON */
462 case 0xD0: u=0x0110; break; /* LATIN CAP D WITH STROKE */
463 case 0xD1: u=0x0143; break; /* LATIN CAP N WITH ACUTE */
464 case 0xD2: u=0x0147; break; /* LATIN CAP N WITH CARON */
465 case 0xD5: u=0x0150; break; /* LATIN CAP O WITH DOUBLE ACUTE */
466 case 0xD8: u=0x0158; break; /* LATIN CAP R WITH CARON */
467 case 0xD9: u=0x016E; break; /* LATIN CAP U WITH RING ABOVE */
468 case 0xDB: u=0x0170; break; /* LATIN CAP U WITH DOUBLE ACUTE */
469 case 0xDE: u=0x0162; break; /* LATIN CAP T WITH CEDILLA */
470 case 0xE0: u=0x0155; break; /* LATIN SMALL R WITH ACUTE */
471 case 0xE3: u=0x0103; break; /* LATIN SMALL A WITH BREVE */
472 case 0xE5: u=0x013A; break; /* LATIN SMALL L WITH ACUTE */
473 case 0xE6: u=0x0107; break; /* LATIN SMALL C WITH ACUTE */
474 case 0xE8: u=0x010D; break; /* LATIN SMALL C WITH CARON */
475 case 0xEA: u=0x0119; break; /* LATIN SMALL E WITH OGONEK */
476 case 0xEC: u=0x011B; break; /* LATIN SMALL E WITH CARON */
477 case 0xEF: u=0x010F; break; /* LATIN SMALL D WITH CARON */
478 case 0xF0: u=0x0111; break; /* LATIN SMALL D WITH STROKE */
479 case 0xF1: u=0x0144; break; /* LATIN SMALL N WITH ACUTE */
480 case 0xF2: u=0x0148; break; /* LATIN SMALL N WITH CARON */
481 case 0xF5: u=0x0151; break; /* LATIN SMALL O WITH DOUBLE ACUTE */
482 case 0xF8: u=0x0159; break; /* LATIN SMALL R WITH CARON */
483 case 0xF9: u=0x016F; break; /* LATIN SMALL U WITH RING ABOVE */
484 case 0xFB: u=0x0171; break; /* LATIN SMALL U WITH DOUBLE ACUTE */
485 case 0xFE: u=0x0163; break; /* LATIN SMALL T WITH CEDILLA */
486 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
489 case 3: /* ISO 8859-3: Latin3: central European */
492 { case 0xA1: u=0x0126; break; /* LATIN CAP H WITH STROKE */
493 case 0xA2: u=0x02D8; break; /* BREVE */
494 case 0xA6: u=0x0124; break; /* LATIN CAP H WITH CIRCUMFLEX */
495 case 0xA9: u=0x0130; break; /* LATIN CAP I WITH DOT ABOVE */
496 case 0xAA: u=0x015E; break; /* LATIN CAP S WITH CEDILLA */
497 case 0xAB: u=0x011E; break; /* LATIN CAP G WITH BREVE */
498 case 0xAC: u=0x0134; break; /* LATIN CAP J WITH CIRCUMFLEX */
499 case 0xAF: u=0x017B; break; /* LATIN CAP Z WITH DOT ABOVE */
500 case 0xB1: u=0x0127; break; /* LATIN SMALL H WITH STROKE */
501 case 0xB6: u=0x0125; break; /* LATIN SMALL H WITH CIRCUMFLEX */
502 case 0xB9: u=0x0131; break; /* LATIN SMALL DOTLESS I */
503 case 0xBA: u=0x015F; break; /* LATIN SMALL S WITH CEDILLA */
504 case 0xBB: u=0x011F; break; /* LATIN SMALL G WITH BREVE */
505 case 0xBC: u=0x0135; break; /* LATIN SMALL J WITH CIRCUMFLEX */
506 case 0xBF: u=0x017C; break; /* LATIN SMALL Z WITH DOT ABOVE */
507 case 0xC5: u=0x010A; break; /* LATIN CAP C WITH DOT ABOVE */
508 case 0xC6: u=0x0108; break; /* LATIN CAP C WITH CIRCUMFLEX */
509 case 0xD5: u=0x0120; break; /* LATIN CAP G WITH DOT ABOVE */
510 case 0xD8: u=0x011C; break; /* LATIN CAP G WITH CIRCUMFLEX */
511 case 0xDD: u=0x016C; break; /* LATIN CAP U WITH BREVE */
512 case 0xDE: u=0x015C; break; /* LATIN CAP S WITH CIRCUMFLEX */
513 case 0xE5: u=0x010B; break; /* LATIN SMALL C WITH DOT ABOVE */
514 case 0xE6: u=0x0109; break; /* LATIN SMALL C WITH CIRCUMFLEX */
515 case 0xF5: u=0x0121; break; /* LATIN SMALL G WITH DOT ABOVE */
516 case 0xF8: u=0x011D; break; /* LATIN SMALL G WITH CIRCUMFLEX */
517 case 0xFD: u=0x016D; break; /* LATIN SMALL U WITH BREVE */
518 case 0xFE: u=0x015D; break; /* LATIN SMALL S WITH CIRCUMFLEX */
519 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
522 case 4: /* ISO 8859-4: Latin4: central European */
525 { case 0xA1: u=0x0104; break; /* LATIN CAP A WITH OGONEK */
526 case 0xA2: u=0x0138; break; /* LATIN SMALL KRA */
527 case 0xA3: u=0x0156; break; /* LATIN CAP R WITH CEDILLA */
528 case 0xA5: u=0x0128; break; /* LATIN CAP I WITH TILDE */
529 case 0xA6: u=0x013B; break; /* LATIN CAP L WITH CEDILLA */
530 case 0xA9: u=0x0160; break; /* LATIN CAP S WITH CARON */
531 case 0xAA: u=0x0112; break; /* LATIN CAP E WITH MACRON */
532 case 0xAB: u=0x0122; break; /* LATIN CAP G WITH CEDILLA */
533 case 0xAC: u=0x0166; break; /* LATIN CAP T WITH STROKE */
534 case 0xAE: u=0x017D; break; /* LATIN CAP Z WITH CARON */
535 case 0xB1: u=0x0105; break; /* LATIN SMALL A WITH OGONEK */
536 case 0xB2: u=0x02DB; break; /* OGONEK */
537 case 0xB3: u=0x0157; break; /* LATIN SMALL R WITH CEDILLA */
538 case 0xB5: u=0x0129; break; /* LATIN SMALL I WITH TILDE */
539 case 0xB6: u=0x013C; break; /* LATIN SMALL L WITH CEDILLA */
540 case 0xB7: u=0x02C7; break; /* CARON */
541 case 0xB9: u=0x0161; break; /* LATIN SMALL S WITH CARON */
542 case 0xBA: u=0x0113; break; /* LATIN SMALL E WITH MACRON */
543 case 0xBB: u=0x0123; break; /* LATIN SMALL G WITH CEDILLA */
544 case 0xBC: u=0x0167; break; /* LATIN SMALL T WITH STROKE */
545 case 0xBD: u=0x014A; break; /* LATIN CAP ENG */
546 case 0xBE: u=0x017E; break; /* LATIN SMALL Z WITH CARON */
547 case 0xBF: u=0x014B; break; /* LATIN SMALL ENG */
548 case 0xC0: u=0x0100; break; /* LATIN CAP A WITH MACRON */
549 case 0xC7: u=0x012E; break; /* LATIN CAP I WITH OGONEK */
550 case 0xC8: u=0x010C; break; /* LATIN CAP C WITH CARON */
551 case 0xCA: u=0x0118; break; /* LATIN CAP E WITH OGONEK */
552 case 0xCC: u=0x0116; break; /* LATIN CAP E WITH DOT ABOVE */
553 case 0xCF: u=0x012A; break; /* LATIN CAP I WITH MACRON */
554 case 0xD0: u=0x0110; break; /* LATIN CAP D WITH STROKE */
555 case 0xD1: u=0x0145; break; /* LATIN CAP N WITH CEDILLA */
556 case 0xD2: u=0x014C; break; /* LATIN CAP O WITH MACRON */
557 case 0xD3: u=0x0136; break; /* LATIN CAP K WITH CEDILLA */
558 case 0xD9: u=0x0172; break; /* LATIN CAP U WITH OGONEK */
559 case 0xDD: u=0x0168; break; /* LATIN CAP U WITH TILDE */
560 case 0xDE: u=0x016A; break; /* LATIN CAP U WITH MACRON */
561 case 0xE0: u=0x0101; break; /* LATIN SMALL A WITH MACRON */
562 case 0xE7: u=0x012F; break; /* LATIN SMALL I WITH OGONEK */
563 case 0xE8: u=0x010D; break; /* LATIN SMALL C WITH CARON */
564 case 0xEA: u=0x0119; break; /* LATIN SMALL E WITH OGONEK */
565 case 0xEC: u=0x0117; break; /* LATIN SMALL E WITH DOT ABOVE */
566 case 0xEF: u=0x012B; break; /* LATIN SMALL I WITH MACRON */
567 case 0xF0: u=0x0111; break; /* LATIN SMALL D WITH STROKE */
568 case 0xF1: u=0x0146; break; /* LATIN SMALL N WITH CEDILLA */
569 case 0xF2: u=0x014D; break; /* LATIN SMALL O WITH MACRON */
570 case 0xF3: u=0x0137; break; /* LATIN SMALL K WITH CEDILLA */
571 case 0xF9: u=0x0173; break; /* LATIN SMALL U WITH OGONEK */
572 case 0xFD: u=0x0169; break; /* LATIN SMALL U WITH TILDE */
573 case 0xFE: u=0x016B; break; /* LATIN SMALL U WITH MACRON */
574 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
577 case 5: /* ISO 8859-5: Cyrillic */
580 { case 0xA1: u=0x0401; break; /* CYRILLIC CAP IO */
581 case 0xA2: u=0x0402; break; /* CYRILLIC CAP DJE */
582 case 0xA3: u=0x0403; break; /* CYRILLIC CAP GJE */
583 case 0xA4: u=0x0404; break; /* CYRILLIC CAP UKRAINIAN IE */
584 case 0xA5: u=0x0405; break; /* CYRILLIC CAP DZE */
585 case 0xA6: u=0x0406; break; /* CYRILLIC CAP BYELORUSSIAN-UKRAINIAN I */
586 case 0xA7: u=0x0407; break; /* CYRILLIC CAP YI */
587 case 0xA8: u=0x0408; break; /* CYRILLIC CAP JE */
588 case 0xA9: u=0x0409; break; /* CYRILLIC CAP LJE */
589 case 0xAA: u=0x040A; break; /* CYRILLIC CAP NJE */
590 case 0xAB: u=0x040B; break; /* CYRILLIC CAP TSHE */
591 case 0xAC: u=0x040C; break; /* CYRILLIC CAP KJE */
592 case 0xAE: u=0x040E; break; /* CYRILLIC CAP SHORT U */
593 case 0xAF: u=0x040F; break; /* CYRILLIC CAP DZHE */
594 case 0xB0: u=0x0410; break; /* CYRILLIC CAP A */
595 case 0xB1: u=0x0411; break; /* CYRILLIC CAP BE */
596 case 0xB2: u=0x0412; break; /* CYRILLIC CAP VE */
597 case 0xB3: u=0x0413; break; /* CYRILLIC CAP GHE */
598 case 0xB4: u=0x0414; break; /* CYRILLIC CAP DE */
599 case 0xB5: u=0x0415; break; /* CYRILLIC CAP IE */
600 case 0xB6: u=0x0416; break; /* CYRILLIC CAP ZHE */
601 case 0xB7: u=0x0417; break; /* CYRILLIC CAP ZE */
602 case 0xB8: u=0x0418; break; /* CYRILLIC CAP I */
603 case 0xB9: u=0x0419; break; /* CYRILLIC CAP SHORT I */
604 case 0xBA: u=0x041A; break; /* CYRILLIC CAP KA */
605 case 0xBB: u=0x041B; break; /* CYRILLIC CAP EL */
606 case 0xBC: u=0x041C; break; /* CYRILLIC CAP EM */
607 case 0xBD: u=0x041D; break; /* CYRILLIC CAP EN */
608 case 0xBE: u=0x041E; break; /* CYRILLIC CAP O */
609 case 0xBF: u=0x041F; break; /* CYRILLIC CAP PE */
610 case 0xC0: u=0x0420; break; /* CYRILLIC CAP ER */
611 case 0xC1: u=0x0421; break; /* CYRILLIC CAP ES */
612 case 0xC2: u=0x0422; break; /* CYRILLIC CAP TE */
613 case 0xC3: u=0x0423; break; /* CYRILLIC CAP U */
614 case 0xC4: u=0x0424; break; /* CYRILLIC CAP EF */
615 case 0xC5: u=0x0425; break; /* CYRILLIC CAP HA */
616 case 0xC6: u=0x0426; break; /* CYRILLIC CAP TSE */
617 case 0xC7: u=0x0427; break; /* CYRILLIC CAP CHE */
618 case 0xC8: u=0x0428; break; /* CYRILLIC CAP SHA */
619 case 0xC9: u=0x0429; break; /* CYRILLIC CAP SHCHA */
620 case 0xCA: u=0x042A; break; /* CYRILLIC CAP HARD SIGN */
621 case 0xCB: u=0x042B; break; /* CYRILLIC CAP YERU */
622 case 0xCC: u=0x042C; break; /* CYRILLIC CAP SOFT SIGN */
623 case 0xCD: u=0x042D; break; /* CYRILLIC CAP E */
624 case 0xCE: u=0x042E; break; /* CYRILLIC CAP YU */
625 case 0xCF: u=0x042F; break; /* CYRILLIC CAP YA */
626 case 0xD0: u=0x0430; break; /* CYRILLIC SMALL A */
627 case 0xD1: u=0x0431; break; /* CYRILLIC SMALL BE */
628 case 0xD2: u=0x0432; break; /* CYRILLIC SMALL VE */
629 case 0xD3: u=0x0433; break; /* CYRILLIC SMALL GHE */
630 case 0xD4: u=0x0434; break; /* CYRILLIC SMALL DE */
631 case 0xD5: u=0x0435; break; /* CYRILLIC SMALL IE */
632 case 0xD6: u=0x0436; break; /* CYRILLIC SMALL ZHE */
633 case 0xD7: u=0x0437; break; /* CYRILLIC SMALL ZE */
634 case 0xD8: u=0x0438; break; /* CYRILLIC SMALL I */
635 case 0xD9: u=0x0439; break; /* CYRILLIC SMALL SHORT I */
636 case 0xDA: u=0x043A; break; /* CYRILLIC SMALL KA */
637 case 0xDB: u=0x043B; break; /* CYRILLIC SMALL EL */
638 case 0xDC: u=0x043C; break; /* CYRILLIC SMALL EM */
639 case 0xDD: u=0x043D; break; /* CYRILLIC SMALL EN */
640 case 0xDE: u=0x043E; break; /* CYRILLIC SMALL O */
641 case 0xDF: u=0x043F; break; /* CYRILLIC SMALL PE */
642 case 0xE0: u=0x0440; break; /* CYRILLIC SMALL ER */
643 case 0xE1: u=0x0441; break; /* CYRILLIC SMALL ES */
644 case 0xE2: u=0x0442; break; /* CYRILLIC SMALL TE */
645 case 0xE3: u=0x0443; break; /* CYRILLIC SMALL U */
646 case 0xE4: u=0x0444; break; /* CYRILLIC SMALL EF */
647 case 0xE5: u=0x0445; break; /* CYRILLIC SMALL HA */
648 case 0xE6: u=0x0446; break; /* CYRILLIC SMALL TSE */
649 case 0xE7: u=0x0447; break; /* CYRILLIC SMALL CHE */
650 case 0xE8: u=0x0448; break; /* CYRILLIC SMALL SHA */
651 case 0xE9: u=0x0449; break; /* CYRILLIC SMALL SHCHA */
652 case 0xEA: u=0x044A; break; /* CYRILLIC SMALL HARD SIGN */
653 case 0xEB: u=0x044B; break; /* CYRILLIC SMALL YERU */
654 case 0xEC: u=0x044C; break; /* CYRILLIC SMALL SOFT SIGN */
655 case 0xED: u=0x044D; break; /* CYRILLIC SMALL E */
656 case 0xEE: u=0x044E; break; /* CYRILLIC SMALL YU */
657 case 0xEF: u=0x044F; break; /* CYRILLIC SMALL YA */
658 case 0xF0: u=0x2116; break; /* NUMERO SIGN */
659 case 0xF1: u=0x0451; break; /* CYRILLIC SMALL IO */
660 case 0xF2: u=0x0452; break; /* CYRILLIC SMALL DJE */
661 case 0xF3: u=0x0453; break; /* CYRILLIC SMALL GJE */
662 case 0xF4: u=0x0454; break; /* CYRILLIC SMALL UKRAINIAN IE */
663 case 0xF5: u=0x0455; break; /* CYRILLIC SMALL DZE */
664 case 0xF6: u=0x0456; break; /* CYRILLIC SMALL BYELORUSSIAN-UKRAINIAN I */
665 case 0xF7: u=0x0457; break; /* CYRILLIC SMALL YI */
666 case 0xF8: u=0x0458; break; /* CYRILLIC SMALL JE */
667 case 0xF9: u=0x0459; break; /* CYRILLIC SMALL LJE */
668 case 0xFA: u=0x045A; break; /* CYRILLIC SMALL NJE */
669 case 0xFB: u=0x045B; break; /* CYRILLIC SMALL TSHE */
670 case 0xFC: u=0x045C; break; /* CYRILLIC SMALL KJE */
671 case 0xFD: u=0x00A7; break; /* SECTION SIGN */
672 case 0xFE: u=0x045E; break; /* CYRILLIC SMALL SHORT U */
673 case 0xFF: u=0x045F; break; /* CYRILLIC SMALL DZHE */
676 case 6: /* ISO 8859-6: Arabic */
679 { case 0xAC: u=0x060C; break; /* ARABIC COMMA */
680 case 0xBB: u=0x061B; break; /* ARABIC SEMICOLON */
681 case 0xBF: u=0x061F; break; /* ARABIC QUESTION MARK */
682 case 0xC1: u=0x0621; break; /* ARABIC HAMZA */
683 case 0xC2: u=0x0622; break; /* ARABIC ALEF WITH MADDA ABOVE */
684 case 0xC3: u=0x0623; break; /* ARABIC ALEF WITH HAMZA ABOVE */
685 case 0xC4: u=0x0624; break; /* ARABIC WAW WITH HAMZA ABOVE */
686 case 0xC5: u=0x0625; break; /* ARABIC ALEF WITH HAMZA BELOW */
687 case 0xC6: u=0x0626; break; /* ARABIC YEH WITH HAMZA ABOVE */
688 case 0xC7: u=0x0627; break; /* ARABIC ALEF */
689 case 0xC8: u=0x0628; break; /* ARABIC BEH */
690 case 0xC9: u=0x0629; break; /* ARABIC TEH MARBUTA */
691 case 0xCA: u=0x062A; break; /* ARABIC TEH */
692 case 0xCB: u=0x062B; break; /* ARABIC THEH */
693 case 0xCC: u=0x062C; break; /* ARABIC JEEM */
694 case 0xCD: u=0x062D; break; /* ARABIC HAH */
695 case 0xCE: u=0x062E; break; /* ARABIC KHAH */
696 case 0xCF: u=0x062F; break; /* ARABIC DAL */
697 case 0xD0: u=0x0630; break; /* ARABIC THAL */
698 case 0xD1: u=0x0631; break; /* ARABIC REH */
699 case 0xD2: u=0x0632; break; /* ARABIC ZAIN */
700 case 0xD3: u=0x0633; break; /* ARABIC SEEN */
701 case 0xD4: u=0x0634; break; /* ARABIC SHEEN */
702 case 0xD5: u=0x0635; break; /* ARABIC SAD */
703 case 0xD6: u=0x0636; break; /* ARABIC DAD */
704 case 0xD7: u=0x0637; break; /* ARABIC TAH */
705 case 0xD8: u=0x0638; break; /* ARABIC ZAH */
706 case 0xD9: u=0x0639; break; /* ARABIC AIN */
707 case 0xDA: u=0x063A; break; /* ARABIC GHAIN */
708 case 0xE0: u=0x0640; break; /* ARABIC TATWEEL */
709 case 0xE1: u=0x0641; break; /* ARABIC FEH */
710 case 0xE2: u=0x0642; break; /* ARABIC QAF */
711 case 0xE3: u=0x0643; break; /* ARABIC KAF */
712 case 0xE4: u=0x0644; break; /* ARABIC LAM */
713 case 0xE5: u=0x0645; break; /* ARABIC MEEM */
714 case 0xE6: u=0x0646; break; /* ARABIC NOON */
715 case 0xE7: u=0x0647; break; /* ARABIC HEH */
716 case 0xE8: u=0x0648; break; /* ARABIC WAW */
717 case 0xE9: u=0x0649; break; /* ARABIC ALEF MAKSURA */
718 case 0xEA: u=0x064A; break; /* ARABIC YEH */
719 case 0xEB: u=0x064B; break; /* ARABIC FATHATAN */
720 case 0xEC: u=0x064C; break; /* ARABIC DAMMATAN */
721 case 0xED: u=0x064D; break; /* ARABIC KASRATAN */
722 case 0xEE: u=0x064E; break; /* ARABIC FATHA */
723 case 0xEF: u=0x064F; break; /* ARABIC DAMMA */
724 case 0xF0: u=0x0650; break; /* ARABIC KASRA */
725 case 0xF1: u=0x0651; break; /* ARABIC SHADDA */
726 case 0xF2: u=0x0652; break; /* ARABIC SUKUN */
729 case 7: /* ISO 8859-7: Greek */
732 { case 0xA1: u=0x02BD; break; /* MODIFIER REVERSED COMMA */
733 case 0xA2: u=0x02BC; break; /* MODIFIER APOSTROPHE */
734 case 0xAF: u=0x2015; break; /* HORIZONTAL BAR */
735 case 0xB4: u=0x0384; break; /* GREEK TONOS */
736 case 0xB5: u=0x0385; break; /* GREEK DIALYTIKA TONOS */
737 case 0xB6: u=0x0386; break; /* GREEK CAP ALPHA WITH TONOS */
738 case 0xB8: u=0x0388; break; /* GREEK CAP EPSILON WITH TONOS */
739 case 0xB9: u=0x0389; break; /* GREEK CAP ETA WITH TONOS */
740 case 0xBA: u=0x038A; break; /* GREEK CAP IOTA WITH TONOS */
741 case 0xBC: u=0x038C; break; /* GREEK CAP OMICRON WITH TONOS */
742 case 0xBE: u=0x038E; break; /* GREEK CAP UPSILON WITH TONOS */
743 case 0xBF: u=0x038F; break; /* GREEK CAP OMEGA WITH TONOS */
744 case 0xC0: u=0x0390; break; /* GREEK SMALL IOTA WITH DIALYTIKA AND TONOS */
745 case 0xC1: u=0x0391; break; /* GREEK CAP ALPHA */
746 case 0xC2: u=0x0392; break; /* GREEK CAP BETA */
747 case 0xC3: u=0x0393; break; /* GREEK CAP GAMMA */
748 case 0xC4: u=0x0394; break; /* GREEK CAP DELTA */
749 case 0xC5: u=0x0395; break; /* GREEK CAP EPSILON */
750 case 0xC6: u=0x0396; break; /* GREEK CAP ZETA */
751 case 0xC7: u=0x0397; break; /* GREEK CAP ETA */
752 case 0xC8: u=0x0398; break; /* GREEK CAP THETA */
753 case 0xC9: u=0x0399; break; /* GREEK CAP IOTA */
754 case 0xCA: u=0x039A; break; /* GREEK CAP KAPPA */
755 case 0xCB: u=0x039B; break; /* GREEK CAP LAMDA */
756 case 0xCC: u=0x039C; break; /* GREEK CAP MU */
757 case 0xCD: u=0x039D; break; /* GREEK CAP NU */
758 case 0xCE: u=0x039E; break; /* GREEK CAP XI */
759 case 0xCF: u=0x039F; break; /* GREEK CAP OMICRON */
760 case 0xD0: u=0x03A0; break; /* GREEK CAP PI */
761 case 0xD1: u=0x03A1; break; /* GREEK CAP RHO */
762 case 0xD3: u=0x03A3; break; /* GREEK CAP SIGMA */
763 case 0xD4: u=0x03A4; break; /* GREEK CAP TAU */
764 case 0xD5: u=0x03A5; break; /* GREEK CAP UPSILON */
765 case 0xD6: u=0x03A6; break; /* GREEK CAP PHI */
766 case 0xD7: u=0x03A7; break; /* GREEK CAP CHI */
767 case 0xD8: u=0x03A8; break; /* GREEK CAP PSI */
768 case 0xD9: u=0x03A9; break; /* GREEK CAP OMEGA */
769 case 0xDA: u=0x03AA; break; /* GREEK CAP IOTA WITH DIALYTIKA */
770 case 0xDB: u=0x03AB; break; /* GREEK CAP UPSILON WITH DIALYTIKA */
771 case 0xDC: u=0x03AC; break; /* GREEK SMALL ALPHA WITH TONOS */
772 case 0xDD: u=0x03AD; break; /* GREEK SMALL EPSILON WITH TONOS */
773 case 0xDE: u=0x03AE; break; /* GREEK SMALL ETA WITH TONOS */
774 case 0xDF: u=0x03AF; break; /* GREEK SMALL IOTA WITH TONOS */
775 case 0xE0: u=0x03B0; break; /* GREEK SMALL UPSILON WITH DIALYTIKA AND TONOS */
776 case 0xE1: u=0x03B1; break; /* GREEK SMALL ALPHA */
777 case 0xE2: u=0x03B2; break; /* GREEK SMALL BETA */
778 case 0xE3: u=0x03B3; break; /* GREEK SMALL GAMMA */
779 case 0xE4: u=0x03B4; break; /* GREEK SMALL DELTA */
780 case 0xE5: u=0x03B5; break; /* GREEK SMALL EPSILON */
781 case 0xE6: u=0x03B6; break; /* GREEK SMALL ZETA */
782 case 0xE7: u=0x03B7; break; /* GREEK SMALL ETA */
783 case 0xE8: u=0x03B8; break; /* GREEK SMALL THETA */
784 case 0xE9: u=0x03B9; break; /* GREEK SMALL IOTA */
785 case 0xEA: u=0x03BA; break; /* GREEK SMALL KAPPA */
786 case 0xEB: u=0x03BB; break; /* GREEK SMALL LAMDA */
787 case 0xEC: u=0x03BC; break; /* GREEK SMALL MU */
788 case 0xED: u=0x03BD; break; /* GREEK SMALL NU */
789 case 0xEE: u=0x03BE; break; /* GREEK SMALL XI */
790 case 0xEF: u=0x03BF; break; /* GREEK SMALL OMICRON */
791 case 0xF0: u=0x03C0; break; /* GREEK SMALL PI */
792 case 0xF1: u=0x03C1; break; /* GREEK SMALL RHO */
793 case 0xF2: u=0x03C2; break; /* GREEK SMALL FINAL SIGMA */
794 case 0xF3: u=0x03C3; break; /* GREEK SMALL SIGMA */
795 case 0xF4: u=0x03C4; break; /* GREEK SMALL TAU */
796 case 0xF5: u=0x03C5; break; /* GREEK SMALL UPSILON */
797 case 0xF6: u=0x03C6; break; /* GREEK SMALL PHI */
798 case 0xF7: u=0x03C7; break; /* GREEK SMALL CHI */
799 case 0xF8: u=0x03C8; break; /* GREEK SMALL PSI */
800 case 0xF9: u=0x03C9; break; /* GREEK SMALL OMEGA */
801 case 0xFA: u=0x03CA; break; /* GREEK SMALL IOTA WITH DIALYTIKA */
802 case 0xFB: u=0x03CB; break; /* GREEK SMALL UPSILON WITH DIALYTIKA */
803 case 0xFC: u=0x03CC; break; /* GREEK SMALL OMICRON WITH TONOS */
804 case 0xFD: u=0x03CD; break; /* GREEK SMALL UPSILON WITH TONOS */
805 case 0xFE: u=0x03CE; break; /* GREEK SMALL OMEGA WITH TONOS */
808 case 8: /* ISO 8859-8: Hebrew */
811 { case 0xAA: u=0x00D7; break; /* MULTIPLICATION SIGN */
812 case 0xAF: u=0x203E; break; /* OVERLINE */
813 case 0xBA: u=0x00F7; break; /* DIVISION SIGN */
814 case 0xDF: u=0x2017; break; /* DOUBLE LOW LINE */
815 case 0xE0: u=0x05D0; break; /* HEBREW ALEF */
816 case 0xE1: u=0x05D1; break; /* HEBREW BET */
817 case 0xE2: u=0x05D2; break; /* HEBREW GIMEL */
818 case 0xE3: u=0x05D3; break; /* HEBREW DALET */
819 case 0xE4: u=0x05D4; break; /* HEBREW HE */
820 case 0xE5: u=0x05D5; break; /* HEBREW VAV */
821 case 0xE6: u=0x05D6; break; /* HEBREW ZAYIN */
822 case 0xE7: u=0x05D7; break; /* HEBREW HET */
823 case 0xE8: u=0x05D8; break; /* HEBREW TET */
824 case 0xE9: u=0x05D9; break; /* HEBREW YOD */
825 case 0xEA: u=0x05DA; break; /* HEBREW FINAL KAF */
826 case 0xEB: u=0x05DB; break; /* HEBREW KAF */
827 case 0xEC: u=0x05DC; break; /* HEBREW LAMED */
828 case 0xED: u=0x05DD; break; /* HEBREW FINAL MEM */
829 case 0xEE: u=0x05DE; break; /* HEBREW MEM */
830 case 0xEF: u=0x05DF; break; /* HEBREW FINAL NUN */
831 case 0xF0: u=0x05E0; break; /* HEBREW NUN */
832 case 0xF1: u=0x05E1; break; /* HEBREW SAMEKH */
833 case 0xF2: u=0x05E2; break; /* HEBREW AYIN */
834 case 0xF3: u=0x05E3; break; /* HEBREW FINAL PE */
835 case 0xF4: u=0x05E4; break; /* HEBREW PE */
836 case 0xF5: u=0x05E5; break; /* HEBREW FINAL TSADI */
837 case 0xF6: u=0x05E6; break; /* HEBREW TSADI */
838 case 0xF7: u=0x05E7; break; /* HEBREW QOF */
839 case 0xF8: u=0x05E8; break; /* HEBREW RESH */
840 case 0xF9: u=0x05E9; break; /* HEBREW SHIN */
841 case 0xFA: u=0x05EA; break; /* HEBREW TAV */
844 case 9: /* ISO 8859-9: Latin5: west European without Icelandic */
848 case 0xD0: u=0x011E; break; /* LATIN CAP G WITH BREVE */
849 case 0xDD: u=0x0130; break; /* LATIN CAP I WITH DOT ABOVE */
850 case 0xDE: u=0x015e; break; /* LATIN CAP S WITH CEDILLA */
851 case 0xF0: u=0x011f; break; /* LATIN SMALL G WITH BREVE */
852 case 0xFD: u=0x0131; break; /* LATIN SMALL DOTLESS I */
853 case 0xFE: u=0x015f; break; /* LATIN SMALL S WITH CEDILLA */
860 /* ------------------------------------------------------------------------- */
861 /* (3) Unicode -> ZSCII and vice versa */
863 /* Need not be rapid, as the results are mostly cached. */
864 /* Unicode chars which can't be fitted into ZSCII are converted to the */
865 /* value 5 (the pad character used in the dictionary and elsewhere). */
866 /* ------------------------------------------------------------------------- */
/* Set to TRUE by zscii_unicode_map() each time it stores a mapping;
   make_unicode_zscii_map() puts it back to FALSE when character_set_setting
   is 0 or 1. */
int zscii_defn_modified;

/* Number of leading entries of zscii_to_unicode_grid currently in use:
   unicode_to_zscii() searches only this many, and new_zscii_character()
   appends at this index. */
int zscii_high_water_mark;
870 int32 zscii_to_unicode_grid[0x61];
872 static void zscii_unicode_map(int zscii, int32 unicode)
873 { if ((zscii < 155) || (zscii > 251))
874 { compiler_error("Attempted to map a Unicode character into the ZSCII \
875 set at an illegal position");
878 zscii_to_unicode_grid[zscii-155] = unicode;
879 zscii_defn_modified = TRUE;
/* Number of entries in the default ZSCII-to-Unicode table used for each
   value of character_set_setting (0 to 9); make_unicode_zscii_map() takes
   the initial zscii_high_water_mark from here. */
int default_zscii_highset_sizes[] = {
    69,     /* 0: uses default_zscii_to_unicode_c01 */
    69,     /* 1: uses default_zscii_to_unicode_c01 */
    81,     /* 2: default_zscii_to_unicode_c2 */
    71,     /* 3: default_zscii_to_unicode_c3 */
    82,     /* 4: default_zscii_to_unicode_c4 */
    92,     /* 5: default_zscii_to_unicode_c5 */
    48,     /* 6: default_zscii_to_unicode_c6 */
    71,     /* 7: default_zscii_to_unicode_c7 */
    27,     /* 8: default_zscii_to_unicode_c8 */
    62      /* 9: default_zscii_to_unicode_c9 */
};
884 int32 default_zscii_to_unicode_c01[]
885 = { /* (This ordering is important, unlike those for other char sets)
886 The 69 characters making up the default Unicode translation
887 table (see the Z-Machine Standard 1.0). */
889 0xe4, /* a-diaeresis */ 0xf6, /* o-diaeresis */ 0xfc, /* u-diaeresis */
890 0xc4, /* A-diaeresis */ 0xd6, /* O-diaeresis */ 0xdc, /* U-diaeresis */
891 0xdf, /* sz-ligature */ 0xbb, /* >> */ 0xab, /* << */
892 0xeb, /* e-diaeresis */ 0xef, /* i-diaeresis */ 0xff, /* y-diaeresis */
893 0xcb, /* E-diaeresis */ 0xcf, /* I-diaeresis */ 0xe1, /* a-acute */
894 0xe9, /* e-acute */ 0xed, /* i-acute */ 0xf3, /* o-acute */
895 0xfa, /* u-acute */ 0xfd, /* y-acute */ 0xc1, /* A-acute */
896 0xc9, /* E-acute */ 0xcd, /* I-acute */ 0xd3, /* O-acute */
897 0xda, /* U-acute */ 0xdd, /* Y-acute */ 0xe0, /* a-grave */
898 0xe8, /* e-grave */ 0xec, /* i-grave */ 0xf2, /* o-grave */
899 0xf9, /* u-grave */ 0xc0, /* A-grave */ 0xc8, /* E-grave */
900 0xcc, /* I-grave */ 0xd2, /* O-grave */ 0xd9, /* U-grave */
901 0xe2, /* a-circumflex */ 0xea, /* e-circumflex */
902 0xee, /* i-circumflex */ 0xf4, /* o-circumflex */
903 0xfb, /* u-circumflex */ 0xc2, /* A-circumflex */
904 0xca, /* E-circumflex */ 0xce, /* I-circumflex */
905 0xd4, /* O-circumflex */ 0xdb, /* U-circumflex */
906 0xe5, /* a-ring */ 0xc5, /* A-ring */
907 0xf8, /* o-slash */ 0xd8, /* O-slash */
908 0xe3, /* a-tilde */ 0xf1, /* n-tilde */ 0xf5, /* o-tilde */
909 0xc3, /* A-tilde */ 0xd1, /* N-tilde */ 0xd5, /* O-tilde */
910 0xe6, /* ae-ligature */ 0xc6, /* AE-ligature */
911 0xe7, /* c-cedilla */ 0xc7, /* C-cedilla */
912 0xfe, /* thorn */ 0xf0, /* eth */ 0xde, /* Thorn */ 0xd0, /* Eth */
913 0xa3, /* pound symbol */
914 0x0153, /* oe-ligature */ 0x0152, /* OE-ligature */
915 0xa1, /* inverted ! */ 0xbf /* inverted ? */ };
917 int32 default_zscii_to_unicode_c2[]
918 = { /* The 81 accented letters in Latin2 */
919 0x0104, 0x0141, 0x013D, 0x015A, 0x0160, 0x015E, 0x0164, 0x0179,
920 0x017D, 0x017B, 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139,
921 0x0106, 0x00C7, 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD,
922 0x00CE, 0x010E, 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150,
923 0x00D6, 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162,
924 0x0105, 0x0142, 0x013E, 0x015B, 0x0161, 0x015F, 0x0165, 0x017A,
925 0x017E, 0x017C, 0x00DF, 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4,
926 0x013A, 0x0107, 0x00E7, 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B,
927 0x00ED, 0x00EE, 0x010F, 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4,
928 0x0151, 0x00F6, 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD,
931 int32 default_zscii_to_unicode_c3[]
932 = { /* The 71 accented letters in Latin3 */
933 0x0126, 0x0124, 0x0130, 0x015E, 0x011E, 0x0134, 0x017B, 0x0127,
934 0x0125, 0x0131, 0x015F, 0x011F, 0x0135, 0x017C, 0x00C0, 0x00C1,
935 0x00C2, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA,
936 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x00D1, 0x00D2, 0x00D3,
937 0x00D4, 0x0120, 0x00D6, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC,
938 0x016C, 0x015C, 0x00DF, 0x00E0, 0x00E1, 0x00E2, 0x00E4, 0x010B,
939 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED,
940 0x00EE, 0x00EF, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6,
941 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D };
943 int32 default_zscii_to_unicode_c4[]
944 = { /* The 82 accented letters in Latin4 */
945 0x0104, 0x0138, 0x0156, 0x0128, 0x013B, 0x0160, 0x0112, 0x0122,
946 0x0166, 0x017D, 0x0105, 0x0157, 0x0129, 0x013C, 0x0161, 0x0113,
947 0x0123, 0x0167, 0x014A, 0x017E, 0x014B, 0x0100, 0x00C1, 0x00C2,
948 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, 0x010C, 0x00C9, 0x0118,
949 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A, 0x0110, 0x0145, 0x014C,
950 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D8, 0x0172, 0x00DA, 0x00DB,
951 0x00DC, 0x0168, 0x016A, 0x00DF, 0x0101, 0x00E1, 0x00E2, 0x00E3,
952 0x00E4, 0x00E5, 0x00E6, 0x012F, 0x010D, 0x00E9, 0x0119, 0x00EB,
953 0x0117, 0x00ED, 0x00EE, 0x012B, 0x0111, 0x0146, 0x014D, 0x0137,
954 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC,
957 int32 default_zscii_to_unicode_c5[]
958 = { /* The 92 accented letters in Cyrillic */
959 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408,
960 0x0409, 0x040A, 0x040B, 0x040C, 0x040E, 0x040F, 0x0410, 0x0411,
961 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419,
962 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, 0x0420, 0x0421,
963 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429,
964 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431,
965 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439,
966 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, 0x0440, 0x0441,
967 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449,
968 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x0451, 0x0452,
969 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A,
970 0x045B, 0x045C, 0x045E, 0x045F };
972 int32 default_zscii_to_unicode_c6[]
973 = { /* The 48 accented letters in Arabic */
974 0x060C, 0x061B, 0x061F, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625,
975 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D,
976 0x062E, 0x062F, 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635,
977 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642,
978 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A,
979 0x064B, 0x064C, 0x064D, 0x064E, 0x064F, 0x0650, 0x0651, 0x0652 };
981 int32 default_zscii_to_unicode_c7[]
982 = { /* The 71 accented letters in Greek */
983 0x0384, 0x0385, 0x0386, 0x0388, 0x0389, 0x038A, 0x038C, 0x038E,
984 0x038F, 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396,
985 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E,
986 0x039F, 0x03A0, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
987 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
988 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
989 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
990 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
991 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE };
993 int32 default_zscii_to_unicode_c8[]
994 = { /* The 27 accented letters in Hebrew */
995 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
996 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
997 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
998 0x05E8, 0x05E9, 0x05EA };
1000 int32 default_zscii_to_unicode_c9[]
1001 = { /* The 62 accented letters in Latin5 */
1002 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
1003 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
1004 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D8,
1005 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF, 0x00E0,
1006 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8,
1007 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x011F,
1008 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x00F9,
1009 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF };
1011 static void make_unicode_zscii_map(void)
1014 for (i=0; i<0x61; i++) zscii_to_unicode_grid[i] = '?';
1016 zscii_high_water_mark
1017 = default_zscii_highset_sizes[character_set_setting];
1019 for (i=0; i<zscii_high_water_mark; i++)
1020 { switch(character_set_setting)
1022 case 1: zscii_unicode_map(i+155, default_zscii_to_unicode_c01[i]);
1024 case 2: zscii_unicode_map(i+155, default_zscii_to_unicode_c2[i]);
1026 case 3: zscii_unicode_map(i+155, default_zscii_to_unicode_c3[i]);
1028 case 4: zscii_unicode_map(i+155, default_zscii_to_unicode_c4[i]);
1030 case 5: zscii_unicode_map(i+155, default_zscii_to_unicode_c5[i]);
1032 case 6: zscii_unicode_map(i+155, default_zscii_to_unicode_c6[i]);
1034 case 7: zscii_unicode_map(i+155, default_zscii_to_unicode_c7[i]);
1036 case 8: zscii_unicode_map(i+155, default_zscii_to_unicode_c8[i]);
1038 case 9: zscii_unicode_map(i+155, default_zscii_to_unicode_c9[i]);
1042 if (character_set_setting < 2) zscii_defn_modified = FALSE;
1043 make_iso_to_alphabet_grid();
1046 extern void new_zscii_character(int32 u, int plus_flag)
1048 if (u < 0 || u > 0xFFFF)
1049 error("Zcharacter table cannot contain Unicode characters beyond $FFFF");
1050 if (plus_flag == FALSE)
1051 zscii_high_water_mark = 0;
1052 if (zscii_high_water_mark == 0x61)
1053 error("No more room in the Zcharacter table");
1054 else zscii_unicode_map(155 + zscii_high_water_mark++, u);
/* To be called once a run of new_zscii_character() calls is over:
   rebuilds the ISO-to-alphabet grid so that it reflects the new table. */
extern void new_zscii_finished(void)
{
    make_iso_to_alphabet_grid();
}
1061 extern int unicode_to_zscii(int32 u)
1063 if (u < 0x7f) return u;
1064 for (i=0; i<zscii_high_water_mark; i++)
1065 if (zscii_to_unicode_grid[i] == u) return i+155;
1069 extern int32 zscii_to_unicode(int z)
1070 { if (z < 0x80) return z;
1071 if ((z >= 155) && (z <= 251)) return zscii_to_unicode_grid[z-155];
1075 /* ------------------------------------------------------------------------- */
1076 /* (4) Text -> Unicode */
1078 /* This routine is not used for ordinary text compilation as it is too */
1079 /* slow, but it's useful for handling @ string escapes, or to avoid writing */
1080 /* special code when speed is not especially required. */
1081 /* Note that the two string escapes which can define Unicode are: */
1083 /* @.. where .. is an accent */
1084 /* and @{...} where ... specifies a Unicode char in hexadecimal */
1085 /* (1 to 6 digits long) */
1087 /* If either syntax is malformed, an error is generated */
1088 /* and the Unicode (= ISO = ASCII) character value of '?' is returned */
1090 /* In Unicode mode (character_set_unicode is true), this handles UTF-8 */
1091 /* decoding as well as @-expansion. (So it's called when an '@' appears */
1092 /* *and* when a high-bit character appears.) */
1093 /* ------------------------------------------------------------------------- */
/* Set by every call to text_to_unicode(): the number of chars of the
   text which that call treated as making up the character returned. */
int textual_form_length;
1097 extern int32 text_to_unicode(char *text)
1101 { if (character_set_unicode)
1102 { if (text[0] & 0x80) /* 8-bit */
1103 { switch (text[0] & 0xF0)
1104 { case 0xf0: /* 4-byte UTF-8 string */
1105 textual_form_length = 4;
1106 if ((text[0] & 0xf8) != 0xf0)
1107 { error("Invalid 4-byte UTF-8 string.");
1110 if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80 || (text[3] & 0xc0) != 0x80)
1111 { error("Invalid 4-byte UTF-8 string.");
1114 return (text[0] & 0x07) << 18
1115 | (text[1] & 0x3f) << 12
1116 | (text[2] & 0x3f) << 6
1119 case 0xe0: /* 3-byte UTF-8 string */
1120 textual_form_length = 3;
1121 if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80)
1122 { error("Invalid 3-byte UTF-8 string.");
1125 return (text[0] & 0x0f) << 12
1126 | (text[1] & 0x3f) << 6
1129 case 0xc0: /* 2-byte UTF-8 string */
1131 textual_form_length = 2;
1132 if ((text[1] & 0xc0) != 0x80)
1133 { error("Invalid 2-byte UTF-8 string.");
1136 return (text[0] & 0x1f) << 6
1139 default: /* broken */
1140 error("Invalid UTF-8 string.");
1141 textual_form_length = 1;
1146 else /* nice 7-bit */
1147 { textual_form_length = 1;
1148 return (uchar) text[0];
1153 textual_form_length = 1;
1154 return iso_to_unicode((uchar) text[0]);
1158 if ((isdigit(text[1])) || (text[1] == '@'))
1159 { ebf_error("'@' plus an accent code or '@{...}'", text);
1160 textual_form_length = 1;
1165 { for (i=0; accents[i] != 0; i+=2)
1166 if ((text[1] == accents[i]) && (text[2] == accents[i+1]))
1167 { textual_form_length = 3;
1168 return default_zscii_to_unicode_c01[i/2];
1172 uac[0]='@'; uac[1]=text[1]; uac[2]=text[2]; uac[3]=0;
1173 error_named("No such accented character as", uac);
1179 while (text[++i] != '}')
1181 { error("'@{' without matching '}'");
1185 { error("At most six hexadecimal digits allowed in '@{...}'");
1188 d = character_digit_value[(uchar)text[i]];
1190 { error("'@{...}' may only contain hexadecimal digits");
1193 total = total*16 + d;
1195 while ((text[i] != '}') && (text[i] != 0)) i++;
1196 if (text[i] == '}') i++;
1197 textual_form_length = i;
1201 textual_form_length = 1;
1205 /* ------------------------------------------------------------------------- */
1206 /* (5) Zscii -> Text */
1208 /* Used for printing out dictionary contents into the text transcript file */
1209 /* or on-screen (in response to the Trace dictionary directive). */
1210 /* In either case, output uses the same ISO set as the source code. */
1211 /* ------------------------------------------------------------------------- */
1213 extern void zscii_to_text(char *text, int zscii)
1217 if ((zscii < 0x100) && (zscii_to_iso_grid[zscii] != 0))
1218 { text[0] = zscii_to_iso_grid[zscii]; text[1] = 0; return;
1221 unicode = zscii_to_unicode(zscii);
1223 if (default_zscii_to_unicode_c01[i] == unicode)
1225 text[1] = accents[2*i];
1226 text[2] = accents[2*i+1];
1227 text[3] = 0; return;
1229 sprintf(text, "@{%x}", unicode);
1232 /* ========================================================================= */
/* Return the printable name of character set number s: one of the nine
   ISO set names for s in 1 to 9, and "Plain ASCII" for any other value.
   The result is a string constant and must not be modified or freed. */
extern char *name_of_iso_set(int s)
{
    static char *iso_set_names[9] = {
        "Latin1", "Latin2", "Latin3", "Latin4", "Cyrillic",
        "Arabic", "Greek",  "Hebrew", "Latin5"
    };

    if ((s >= 1) && (s <= 9)) return iso_set_names[s - 1];
    return "Plain ASCII";
}
/* Rebuild the character translation tables: first the source-to-ISO grid,
   then the Unicode/ZSCII map (which itself goes on to rebuild the
   ISO-to-alphabet grid). */
extern void change_character_set(void)
{
    make_source_to_iso_grid();
    make_unicode_zscii_map();
}
1254 /* ------------------------------------------------------------------------- */
1255 /* Case translation of standard Roman letters within ISO */
1256 /* ------------------------------------------------------------------------- */
/* Convert, in place, every upper-case letter of the zero-terminated string
   str to lower case.  Only chars with values below 128 are examined;
   anything with the top bit set is left exactly as it is. */
extern void make_lower_case(char *str)
{
    char *p;

    for (p = str; *p != 0; p++)
    {
        unsigned char c = (unsigned char) *p;
        if (c >= 128) continue;
        if (isupper(c)) *p = tolower(c);
    }
}
/* Convert, in place, every lower-case letter of the zero-terminated string
   str to upper case.  Only chars with values below 128 are examined;
   anything with the top bit set is left exactly as it is. */
extern void make_upper_case(char *str)
{
    char *p;

    for (p = str; *p != 0; p++)
    {
        unsigned char c = (unsigned char) *p;
        if (c >= 128) continue;
        if (islower(c)) *p = toupper(c);
    }
}
1270 /* ========================================================================= */
1271 /* Data structure management routines */
1272 /* ------------------------------------------------------------------------- */
1274 extern void init_chars_vars(void)
1276 for (n=0; n<128; n++) character_digit_value[n] = 127;
1277 character_digit_value['0'] = 0;
1278 character_digit_value['1'] = 1;
1279 character_digit_value['2'] = 2;
1280 character_digit_value['3'] = 3;
1281 character_digit_value['4'] = 4;
1282 character_digit_value['5'] = 5;
1283 character_digit_value['6'] = 6;
1284 character_digit_value['7'] = 7;
1285 character_digit_value['8'] = 8;
1286 character_digit_value['9'] = 9;
1287 character_digit_value['a'] = 10;
1288 character_digit_value['b'] = 11;
1289 character_digit_value['c'] = 12;
1290 character_digit_value['d'] = 13;
1291 character_digit_value['e'] = 14;
1292 character_digit_value['f'] = 15;
1293 character_digit_value['A'] = 10;
1294 character_digit_value['B'] = 11;
1295 character_digit_value['C'] = 12;
1296 character_digit_value['D'] = 13;
1297 character_digit_value['E'] = 14;
1298 character_digit_value['F'] = 15;
1300 strcpy((char *) alphabet[0], "abcdefghijklmnopqrstuvwxyz");
1301 strcpy((char *) alphabet[1], "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
1302 strcpy((char *) alphabet[2], " ^0123456789.,!?_#'~/\\-:()");
1304 alphabet_modified = FALSE;
1306 for (n=0; n<78; n++) alphabet_used[n] = 'N';
1308 change_character_set();
/* Start-of-pass routine for this module: currently does nothing. */
extern void chars_begin_pass(void)
{
    return;
}
/* Array allocation routine for this module: currently does nothing. */
extern void chars_allocate_arrays(void)
{
    return;
}
/* Array deallocation routine for this module: currently does nothing. */
extern void chars_free_arrays(void)
{
    return;
}
1323 /* ========================================================================= */