1 /* ------------------------------------------------------------------------- */
2 /* "chars" : Character set mappings and the Z-machine alphabet table */
4 /* Copyright (c) Graham Nelson 1993 - 2020 */
6 /* This file is part of Inform. */
8 /* Inform is free software: you can redistribute it and/or modify */
9 /* it under the terms of the GNU General Public License as published by */
10 /* the Free Software Foundation, either version 3 of the License, or */
11 /* (at your option) any later version. */
13 /* Inform is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
16 /* GNU General Public License for more details. */
18 /* You should have received a copy of the GNU General Public License */
19 /* along with Inform. If not, see https://gnu.org/licenses/ */
21 /* ------------------------------------------------------------------------- */
22 /* Inform uses six different character representations: */
24 /* ASCII plain ASCII characters in range $20 to $7e */
25 /* (unsigned 7-bit number) */
26 /* Source raw bytes from source code */
27 /* (unsigned 8-bit number) */
28 /* ISO plain ASCII or ISO 8859-1 to -9, according to value */
29 /* character_set_setting == 0 or 1 to 9 */
30 /* in Unicode mode (character_set_unicode), individual UTF-8 bytes */
32 /* (unsigned 8-bit number) */
33 /* ZSCII the Z-machine's character set */
34 /* (unsigned 10-bit number) */
35 /* textual such as the text @'e to mean e-acute */
36 /* or @$03a3 to mean capital Greek sigma */
37 /* in Unicode mode, the operations manipulating multibyte */
38 /* UCS representations are included in text routines */
39 /* (sequence of ASCII characters) */
40 /* Unicode a unifying character set holding all possible characters */
41 /* Inform can ever deal with */
42 /* (unsigned 16-bit number) */
44 /* Conversion can always be made down this list, but generally not up. */
45 /* Note that all ASCII values are the same in any version of ISO */
48 /* There is a seventh form: sequences of 5-bit "Z-chars" which encode */
49 /* ZSCII into the story file in compressed form. Conversion of ZSCII to */
50 /* and from Z-char sequences, although it uses the alphabet table, is done in "text.c". */
52 /* ------------------------------------------------------------------------- */
53 /* The main data structures need to be modified in mid-compilation, but */
54 /* several of them depend on each other, and must remain consistent; */
55 /* and rebuilding one sometimes uses conversion routines depending on */
56 /* information held in the others: */
58 /* Structure If changed, need to rebuild: */
59 /* character_set_setting source_to_iso_grid[] */
60 /* zscii_to_unicode_grid[] */
61 /* zscii_to_iso_grid[] */
62 /* iso_to_unicode_grid[] */
63 /* alphabet[][] iso_to_alphabet_grid[] */
64 /* zscii_to_alphabet_grid[] */
65 /* zscii_to_unicode_grid[] iso_to_alphabet_grid[] */
66 /* source_to_iso_grid[] <nothing> */
67 /* iso_to_alphabet_grid[] <nothing> */
68 /* zscii_to_alphabet_grid[] <nothing> */
69 /* zscii_to_iso_grid[] <nothing> */
71 /* (zscii_to_iso_grid[] is made whenever iso_to_alphabet_grid[] is */
72 /* made but does not depend on alphabet[].) */
74 /* Conversion routine Makes use of: */
75 /* iso_to_unicode character_set_setting */
76 /* unicode_to_zscii character_set_setting */
77 /* zscii_to_unicode_grid[] */
78 /* zscii_to_unicode character_set_setting */
79 /* zscii_to_unicode_grid[] */
80 /* text_to_unicode <nothing> */
81 /* zscii_to_text character_set_setting */
82 /* zscii_to_unicode_grid[] */
83 /* zscii_to_iso_grid[] */
85 /* For example, if we want to change alphabet[][] then we can safely */
86 /* use any of the conversion routines while working on the change, but */
87 /* must rebuild the iso_to_alphabet_grid[] before allowing Inform to */
88 /* continue compiling. */
89 /* ------------------------------------------------------------------------- */
/* Character-conversion tables shared across the compiler, followed by the  */
/* stock list of accent codes used by this file.                            */
/* source_to_iso_grid[] is filled by make_source_to_iso_grid();             */
/* iso_to_unicode_grid[] is filled by make_iso_to_alphabet_grid(), which    */
/* stores iso_to_unicode(j) for every ISO code j.                           */
93 uchar source_to_iso_grid[0x100]; /* Filters source code into legal ISO */
95 int32 iso_to_unicode_grid[0x100]; /* Filters ISO into Unicode */
97 int character_digit_value[128]; /* Parsing of binary, decimal and hex */
/* The string below is a run of two-character accent codes (":a", "ss",     */
/* "'e", "`a", "^a", ...), written across two source lines joined by a      */
/* backslash continuation.  Compare the textual form @'e described in the   */
/* header comment of this file.                                             */
99 static char *accents = /* Standard 0.2 stock of accented... */
101 ":a:o:u:A:O:Uss>><<:e:i:y:E:I'a'e'i'o'u'y'A'E'I'O'U'Y`a`e`i`o`u\
102 `A`E`I`O`U^a^e^i^o^u^A^E^I^O^UoaoA/o/O~a~n~o~A~N~OaeAEcccCthetThEtLLoeOE!!??";
104 /* ...characters, numbered upwards */
107 /* ------------------------------------------------------------------------- */
/* alphabet[i][j] holds the ZSCII code of letter j in alphabet i; the long  */
/* comment after iso_to_alphabet_grid[] explains the special entries and    */
/* why the second dimension is 27 rather than 26.                           */
109 uchar alphabet[3][27]; /* The alphabet table. */
/* Set to TRUE by map_new_zchar() and new_alphabet() when they alter it.    */
111 int alphabet_modified; /* Has the default been changed? */
/* Indexed by alphabet position: map_new_zchar() addresses alphabet A2,     */
/* entry i, as alphabet_used[52+i].                                         */
113 char alphabet_used[78]; /* Flags (holding 'N' or 'Y') for
 which of the Z-alphabet letters
 have actually been used in encoded text */
117 /* ------------------------------------------------------------------------- */
/* Indexed by ISO character; the meaning of each value is set out in the    */
/* comment that follows this declaration.                                   */
119 int iso_to_alphabet_grid[0x100];
121 /* This array combines two conversion processes which have to run quickly:
122 an ISO character n is being converted for text purposes into a stream
123 of Z-chars (anything from 1 up to 8 of these). Unicode but non-ISO
124 characters are also converted from text, but far less often, and
125 different (and slower) methods are used to carry this out.
127 iso_to_alphabet_grid[n]
128 = i if the character exists in ZSCII and is located at
129 position i in the Z-machine alphabet (where 0 to 25
130 give positions in A0, 26 to 51 in A1 and 52 to 77 in A2);
132 -z if the character exists in ZSCII as value z, but is not
133 located anywhere in the Z-machine alphabet;
135 -5 if the character does not exist in ZSCII. (It will still
136 be printable using an 8-Z-char sequence to encode it in
137 Unicode form, but there's no ZSCII form.)
139 Note that ISO tilde ~ is interpreted as ZSCII double-quote ",
140 and ISO circumflex ^ is interpreted as ZSCII new-line, in accordance
141 with the Inform syntax for strings. This is automatic from the
142 structure of alphabet[][]:
144 alphabet[i][j] = the ZSCII code of letter j (0 to 25)
145 in alphabet i (0 to 2), except that:
149 alphabet[2][0] is ignored by the Z-machine and Inform
150 (char 0 in A2 is an escape)
151 alphabet[2][1] is ignored by the Z-machine
152 (char 1 in A2 means new-line)
153 but used by Inform to hold ISO circumflex
154 so that ^ is translated as new-line
155 alphabet[2][19] is used by Inform to hold ISO tilde
156 so that ~ is translated as ": after
157 compilation, when the alphabet table is
158 written into the Z-machine, this entry
159 is changed back to ".
161 Note that the alphabet can only hold ZSCII values between 0 and 255.
163 The array is dimensioned as [3][27], not [3][26], to make it easier to
164 initialise using strcpy (see below), but the zero entries [x][26] are not used. */
167 int zscii_to_alphabet_grid[0x100];
169 /* Uses the same value scheme as iso_to_alphabet_grid[], except that the
 index is a ZSCII character, not an ISO one.  make_iso_to_alphabet_grid()
 first sets entry j to -j and then overwrites the entries for characters
 found in alphabet[][] with their alphabet positions. */
171 int zscii_to_iso_grid[0x100]; /* Converts ZSCII between 0 and 255 to
172 codes in current ISO set: or to 0 if
173 code isn't in the current ISO set. */
/* Rebuilds zscii_to_iso_grid[], zscii_to_alphabet_grid[],                  */
/* iso_to_alphabet_grid[] and iso_to_unicode_grid[] from the current        */
/* contents of alphabet[][] and the current character_set_setting.          */
/* Takes no arguments and returns nothing; it works purely on the           */
/* file-level tables.                                                       */
/* NOTE(review): some lines of this routine (closing braces and the header  */
/* of the loop over j that encloses the final k loop) are not visible in    */
/* this view of the file.                                                   */
175 static void make_iso_to_alphabet_grid(void)
176 { int i, j, k; int z;
/* Pass 1: defaults.  No ZSCII code has an ISO equivalent yet, and ZSCII j  */
/* is recorded as -j, i.e. "exists as value j but is not in the alphabet".  */
178 for (j=0; j<0x100; j++)
179 { zscii_to_iso_grid[j] = 0;
180 zscii_to_alphabet_grid[j] = -j;
/* Pass 2: for each ISO code j, start from -5 ("no ZSCII form").            */
183 for (j=0; j<0x100; j++)
184 { iso_to_alphabet_grid[j]=-5;
/* Printable ASCII ($20 to $7e) has the same code in ISO and in ZSCII.      */
185 if ((j >= 0x20) && (j <= 0x7e))
186 { iso_to_alphabet_grid[j] = -j;
187 zscii_to_iso_grid[j] = j;
/* Codes $a1 to $ff: find the ZSCII value by going through Unicode.  The    */
/* reverse (ZSCII to ISO) entry is only recorded when a real ISO set is     */
/* selected, that is, when character_set_setting is not 0.                  */
189 if ((j >= 0xa1) && (j <= 0xff))
190 { z = unicode_to_zscii(iso_to_unicode(j));
191 if (character_set_setting != 0)
192 zscii_to_iso_grid[z] = j;
193 iso_to_alphabet_grid[j] = -z;
/* Cache the Unicode value of every ISO code.                               */
195 iso_to_unicode_grid[j] = iso_to_unicode(j);
/* Pass 3: record the alphabet position k + j*26 for each alphabet entry,   */
/* in both the ZSCII-indexed and the ISO-indexed grid.  k starts at 1       */
/* instead of 0 once j is no longer below 2, so alphabet[2][0] is skipped.  */
198 for (k=(j<2?0:1); k<26; k++)
199 { i=(int) ((alphabet[j])[k]);
200 zscii_to_alphabet_grid[i] = k + j*26;
201 iso_to_alphabet_grid[zscii_to_iso_grid[i]] = k + j*26;
205 extern void map_new_zchar(int32 unicode)
206 { /* Attempts to enter the given Unicode character into the "alphabet[]"
207 array, in place of one which has not so far been used in the
208 compilation of the current file. This may of course fail. */
/* unicode: the character to install.  An error is reported through         */
/* unicode_char_error() when unicode_to_zscii() gives 5 or a value of       */
/* 0x100 or more, since the alphabet cannot hold such a character.          */
/* NOTE(review): the declarations of i, j and zscii, the header of the      */
/* loop over i, and the return statements are not visible in this view.     */
212 zscii = unicode_to_zscii(unicode);
214 /* Out of ZSCII range? */
215 if ((zscii == 5) || (zscii >= 0x100))
216 { unicode_char_error(
217 "Character must first be entered into Zcharacter table:", unicode);
/* Nothing to do if the character already occupies an alphabet slot.        */
222 for (i=0;i<3;i++) for (j=0;j<26;j++)
223 if (alphabet[i][j] == zscii) return;
225 /* A0 and A1 are never changed. Try to find a place in alphabet A2:
227 xx0123456789.,!?_#'~/\-:()
228 ^^^^^^^^^^ ^^^^^ ^^^^^^
230 The letters marked ^ are considered to be replaceable, as long as
231 they haven't yet been used in any text already encoded, and haven't
232 already been replaced. The routine works along from the left, since
233 numerals are more of a luxury than punctuation. */
/* Positions 12, 13 and 19 of A2 are the unmarked full stop, comma and      */
/* tilde in the diagram above, and are never replaced.                      */
236 { if ((i == 12) || (i == 13) || (i == 19)) continue;
/* A slot still flagged 'N' is taken: it is flagged 'Y', the new character  */
/* is stored there, and the lookup grids are rebuilt to match.              */
237 if (alphabet_used[52+i] == 'N')
238 { alphabet_used[52+i] = 'Y';
239 alphabet[2][i] = zscii;
240 alphabet_modified = TRUE;
241 make_iso_to_alphabet_grid();
/* Replaces one of the three alphabets with the characters given in text.   */
/*   text:        the new alphabet, in Inform textual form (each character  */
/*                is decoded with text_to_unicode())                        */
/*   which_alph:  which alphabet to replace (0, 1 or 2)                     */
/* Characters with no ZSCII value below 0x100 are reported through          */
/* unicode_char_error() and not stored.                                     */
/* NOTE(review): several lines of this routine (branch and loop headers,    */
/* the WrongSizeError label, closing braces) are not visible in this view.  */
247 extern void new_alphabet(char *text, int which_alph)
249 /* Called three times in succession, with which_alph = 0, 1, 2 */
251 int i, j, zscii; int32 unicode;
253 alphabet_modified = TRUE;
/* In the branch below, filling starts at position 3 and alphabet[2][2] is  */
/* preset to '~'.  NOTE(review): the condition guarding it is not visible;  */
/* presumably which_alph == 2, given the 23-character message further       */
/* down - confirm.                                                          */
256 { i=3; alphabet[2][2] = '~';
/* Running out of text before the alphabet is full is a size error.         */
261 { if (text[j] == 0) goto WrongSizeError;
/* Decode the character at offset j, then advance j by                      */
/* textual_form_length (set elsewhere; presumably by text_to_unicode() -    */
/* confirm).                                                                */
263 unicode = text_to_unicode(text+j);
264 j += textual_form_length;
266 zscii = unicode_to_zscii(unicode);
267 if ((zscii == 5) || (zscii >= 0x100))
268 unicode_char_error("Character can't be used in alphabets unless \
269 entered into Zcharacter table", unicode);
270 else alphabet[which_alph][i] = zscii;
276 error("Alphabet string must give exactly 23 characters");
278 error("Alphabet string must give exactly 26 characters");
/* Duplicate check: count how often each ZSCII value occurs across all      */
/* three alphabets and report a value when it is met for the second time.   */
282 { int test_dups[0x100];
283 for (i=0; i<0x100; i++) test_dups[i] = 0;
284 for (i=0; i<3; i++) for (j=0; j<26; j++)
285 { if (test_dups[alphabet[i][j]]++ == 1)
286 unicode_char_error("Character duplicated in alphabet:",
287 zscii_to_unicode(alphabet[i][j]));
/* Rebuild the lookup grids so that they agree with the new alphabet.       */
290 make_iso_to_alphabet_grid();
/* Reads the character set mapping file named by Charset_Map, one line at   */
/* a time, and stores the values it lists into the table uccg.              */
/*   uccg:  table to fill; at most 256 entries are written                  */
/* A line starting with 'C' also sets character_set_setting.  Failure to    */
/* open the file, or a character set outside 0 to 9, is reported through    */
/* fatalerror_named().                                                      */
/* NOTE(review): the declarations of cs_buff, p and i, the switch header    */
/* and the code which advances p are not visible in this view.              */
294 static void read_source_to_iso_file(uchar *uccg)
295 { FILE *charset_file;
300 charset_file=fopen(Charset_Map, "r");
301 if (charset_file==NULL)
302 fatalerror_named("Couldn't open character set mapping", Charset_Map);
/* Read until end of file; a failed fgets() also ends the loop.             */
304 while (feof(charset_file)==0)
305 { if (fgets(cs_buff,256,charset_file)==0) break;
308 { case '!': /* Ignore comments in file */
310 case 'C': /* Set character set */
/* The character following 'C' is read as a single decimal digit.           */
311 character_set_setting = cs_buff[1]-'0';
312 if ((character_set_setting < 0) || (character_set_setting > 9))
313 { fatalerror_named("Character set in mapping must be 0 to 9",
319 while ((i<256) && (p!=NULL))
/* Each value is parsed as a decimal number and stored as the next entry.   */
321 uccg[i++] = (uchar)atoi(p);
329 fclose(charset_file);
332 /* ========================================================================= */
333 /* Conversion functions (without side effects) */
334 /* ------------------------------------------------------------------------- */
335 /* (1) Source -> ISO */
337 /* 00 remains 0 (meaning "end of file") */
338 /* TAB becomes SPACE */
339 /* 0c ("form feed") becomes '\n' */
340 /* 0d becomes '\n' */
341 /* other control characters become '?' */
343 /* 80 to 9f become '?' */
344 /* a0 (ISO "non-breaking space") becomes SPACE */
345 /* ad (ISO "soft hyphen") becomes '-' */
346 /* any character undefined in ISO is mapped to '?' */
347 /* In Unicode mode, characters 80 and upwards are preserved. */
349 /* ------------------------------------------------------------------------- */
/* Builds source_to_iso_grid[], following the rules listed in the           */
/* "(1) Source -> ISO" comment above.  Takes no arguments and returns       */
/* nothing.                                                                 */
/* NOTE(review): the "else", the case labels of the switch and the header   */
/* of the inner switch are not visible in this view, so the notes below     */
/* identify the case groups only tentatively.                               */
351 static void make_source_to_iso_grid(void)
352 { int n; uchar *uccg = (uchar *) source_to_iso_grid;
/* Start from the identity mapping.                                         */
354 for (n=0; n<0x100; n++) uccg[n] = (char) n;
/* A non-empty Charset_Map names a mapping file to read the table from.     */
356 if (Charset_Map[0] != '\0')
357 read_source_to_iso_file(uccg);
/* Built-in treatment of control characters: 0 stays 0, codes 10, 12 and    */
/* 13 become '\n', TAB becomes a space, and the remaining codes below 32,   */
/* together with 127, become '?'.  (Presumably the branch taken when no     */
/* mapping file is given - confirm.)                                        */
359 { source_to_iso_grid[0] = (char) 0;
360 for (n=1; n<32; n++) source_to_iso_grid[n] = '?';
361 source_to_iso_grid[10] = '\n';
362 source_to_iso_grid[12] = '\n';
363 source_to_iso_grid[13] = '\n';
364 source_to_iso_grid[127] = '?';
365 source_to_iso_grid[TAB_CHARACTER] = ' ';
/* In Unicode mode, characters 0x80 and upwards are preserved, as the       */
/* section comment above states.                                            */
367 if (character_set_unicode) /* No need to meddle with 8-bit for UTF-8 */
/* 0x80 to 0x9f become '?'; 0xa0 becomes a space; 0xad becomes '-'.         */
370 for (n=0x80; n<0xa0; n++) source_to_iso_grid[n] = '?';
371 source_to_iso_grid[0xa0] = ' ';
372 source_to_iso_grid[0xad] = '-';
/* Codes which the selected ISO set leaves undefined are mapped to '?'.     */
374 switch(character_set_setting)
/* First group: every code from 0xa0 upwards becomes '?'.                   */
376 for (n=0xa0; n<0x100; n++)
377 source_to_iso_grid[n] = '?';
/* Second group: keeps 0xa0, 0xa4, 0xac, 0xad, 0xbb and 0xbf, and blanks    */
/* the rest of 0xa0-0xc0 together with 0xdb-0xdf and 0xf3-0xff.  This is    */
/* consistent with the ISO 8859-6 table in iso_to_unicode() - presumably    */
/* case 6.                                                                  */
380 for (n=0xa0; n<0xc1; n++)
382 { case 0xa0: case 0xa4: case 0xac: case 0xad:
383 case 0xbb: case 0xbf: break;
384 default: source_to_iso_grid[n] = '?';
386 for (n=0xdb; n<0xe0; n++)
387 source_to_iso_grid[n] = '?';
388 for (n=0xf3; n<0x100; n++)
389 source_to_iso_grid[n] = '?';
/* Third group: six individual codes.  None of them has an entry in the     */
/* visible part of the ISO 8859-7 table in iso_to_unicode() - presumably    */
/* case 7.                                                                  */
392 source_to_iso_grid[0xa4] = '?';
393 source_to_iso_grid[0xa5] = '?';
394 source_to_iso_grid[0xaa] = '?';
395 source_to_iso_grid[0xae] = '?';
396 source_to_iso_grid[0xd2] = '?';
397 source_to_iso_grid[0xff] = '?';
/* Fourth group: 0xa1, 0xbf-0xde and 0xfb-0xff.  The case label is not      */
/* visible in this view.                                                    */
400 source_to_iso_grid[0xa1] = '?';
401 for (n=0xbf; n<0xdf; n++)
402 source_to_iso_grid[n] = '?';
403 for (n=0xfb; n<0x100; n++)
404 source_to_iso_grid[n] = '?';
410 /* ------------------------------------------------------------------------- */
411 /* (2) ISO -> Unicode */
413 /* Need not be rapid, as the results are mostly cached. */
414 /* Always succeeds. */
415 /* ------------------------------------------------------------------------- */
417 extern int iso_to_unicode(int iso)
419 switch(character_set_setting)
422 case 0: /* Plain ASCII only */
425 case 1: /* ISO 8859-1: Latin1: west European */
428 case 2: /* ISO 8859-2: Latin2: central European */
431 { case 0xA1: u=0x0104; break; /* LATIN CAP A WITH OGONEK */
432 case 0xA2: u=0x02D8; break; /* BREVE */
433 case 0xA3: u=0x0141; break; /* LATIN CAP L WITH STROKE */
434 case 0xA5: u=0x013D; break; /* LATIN CAP L WITH CARON */
435 case 0xA6: u=0x015A; break; /* LATIN CAP S WITH ACUTE */
436 case 0xA9: u=0x0160; break; /* LATIN CAP S WITH CARON */
437 case 0xAA: u=0x015E; break; /* LATIN CAP S WITH CEDILLA */
438 case 0xAB: u=0x0164; break; /* LATIN CAP T WITH CARON */
439 case 0xAC: u=0x0179; break; /* LATIN CAP Z WITH ACUTE */
440 case 0xAE: u=0x017D; break; /* LATIN CAP Z WITH CARON */
441 case 0xAF: u=0x017B; break; /* LATIN CAP Z WITH DOT ABOVE */
442 case 0xB1: u=0x0105; break; /* LATIN SMALL A WITH OGONEK */
443 case 0xB2: u=0x02DB; break; /* OGONEK */
444 case 0xB3: u=0x0142; break; /* LATIN SMALL L WITH STROKE */
445 case 0xB5: u=0x013E; break; /* LATIN SMALL L WITH CARON */
446 case 0xB6: u=0x015B; break; /* LATIN SMALL S WITH ACUTE */
447 case 0xB7: u=0x02C7; break; /* CARON */
448 case 0xB9: u=0x0161; break; /* LATIN SMALL S WITH CARON */
449 case 0xBA: u=0x015F; break; /* LATIN SMALL S WITH CEDILLA */
450 case 0xBB: u=0x0165; break; /* LATIN SMALL T WITH CARON */
451 case 0xBC: u=0x017A; break; /* LATIN SMALL Z WITH ACUTE */
452 case 0xBD: u=0x02DD; break; /* DOUBLE ACUTE ACCENT */
453 case 0xBE: u=0x017E; break; /* LATIN SMALL Z WITH CARON */
454 case 0xBF: u=0x017C; break; /* LATIN SMALL Z WITH DOT ABOVE */
455 case 0xC0: u=0x0154; break; /* LATIN CAP R WITH ACUTE */
456 case 0xC3: u=0x0102; break; /* LATIN CAP A WITH BREVE */
457 case 0xC5: u=0x0139; break; /* LATIN CAP L WITH ACUTE */
458 case 0xC6: u=0x0106; break; /* LATIN CAP C WITH ACUTE */
459 case 0xC8: u=0x010C; break; /* LATIN CAP C WITH CARON */
460 case 0xCA: u=0x0118; break; /* LATIN CAP E WITH OGONEK */
461 case 0xCC: u=0x011A; break; /* LATIN CAP E WITH CARON */
462 case 0xCF: u=0x010E; break; /* LATIN CAP D WITH CARON */
463 case 0xD0: u=0x0110; break; /* LATIN CAP D WITH STROKE */
464 case 0xD1: u=0x0143; break; /* LATIN CAP N WITH ACUTE */
465 case 0xD2: u=0x0147; break; /* LATIN CAP N WITH CARON */
466 case 0xD5: u=0x0150; break; /* LATIN CAP O WITH DOUBLE ACUTE */
467 case 0xD8: u=0x0158; break; /* LATIN CAP R WITH CARON */
468 case 0xD9: u=0x016E; break; /* LATIN CAP U WITH RING ABOVE */
469 case 0xDB: u=0x0170; break; /* LATIN CAP U WITH DOUBLE ACUTE */
470 case 0xDE: u=0x0162; break; /* LATIN CAP T WITH CEDILLA */
471 case 0xE0: u=0x0155; break; /* LATIN SMALL R WITH ACUTE */
472 case 0xE3: u=0x0103; break; /* LATIN SMALL A WITH BREVE */
473 case 0xE5: u=0x013A; break; /* LATIN SMALL L WITH ACUTE */
474 case 0xE6: u=0x0107; break; /* LATIN SMALL C WITH ACUTE */
475 case 0xE8: u=0x010D; break; /* LATIN SMALL C WITH CARON */
476 case 0xEA: u=0x0119; break; /* LATIN SMALL E WITH OGONEK */
477 case 0xEC: u=0x011B; break; /* LATIN SMALL E WITH CARON */
478 case 0xEF: u=0x010F; break; /* LATIN SMALL D WITH CARON */
479 case 0xF0: u=0x0111; break; /* LATIN SMALL D WITH STROKE */
480 case 0xF1: u=0x0144; break; /* LATIN SMALL N WITH ACUTE */
481 case 0xF2: u=0x0148; break; /* LATIN SMALL N WITH CARON */
482 case 0xF5: u=0x0151; break; /* LATIN SMALL O WITH DOUBLE ACUTE */
483 case 0xF8: u=0x0159; break; /* LATIN SMALL R WITH CARON */
484 case 0xF9: u=0x016F; break; /* LATIN SMALL U WITH RING ABOVE */
485 case 0xFB: u=0x0171; break; /* LATIN SMALL U WITH DOUBLE ACUTE */
486 case 0xFE: u=0x0163; break; /* LATIN SMALL T WITH CEDILLA */
487 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
490 case 3: /* ISO 8859-3: Latin3: central European */
493 { case 0xA1: u=0x0126; break; /* LATIN CAP H WITH STROKE */
494 case 0xA2: u=0x02D8; break; /* BREVE */
495 case 0xA6: u=0x0124; break; /* LATIN CAP H WITH CIRCUMFLEX */
496 case 0xA9: u=0x0130; break; /* LATIN CAP I WITH DOT ABOVE */
497 case 0xAA: u=0x015E; break; /* LATIN CAP S WITH CEDILLA */
498 case 0xAB: u=0x011E; break; /* LATIN CAP G WITH BREVE */
499 case 0xAC: u=0x0134; break; /* LATIN CAP J WITH CIRCUMFLEX */
500 case 0xAF: u=0x017B; break; /* LATIN CAP Z WITH DOT ABOVE */
501 case 0xB1: u=0x0127; break; /* LATIN SMALL H WITH STROKE */
502 case 0xB6: u=0x0125; break; /* LATIN SMALL H WITH CIRCUMFLEX */
503 case 0xB9: u=0x0131; break; /* LATIN SMALL DOTLESS I */
504 case 0xBA: u=0x015F; break; /* LATIN SMALL S WITH CEDILLA */
505 case 0xBB: u=0x011F; break; /* LATIN SMALL G WITH BREVE */
506 case 0xBC: u=0x0135; break; /* LATIN SMALL J WITH CIRCUMFLEX */
507 case 0xBF: u=0x017C; break; /* LATIN SMALL Z WITH DOT ABOVE */
508 case 0xC5: u=0x010A; break; /* LATIN CAP C WITH DOT ABOVE */
509 case 0xC6: u=0x0108; break; /* LATIN CAP C WITH CIRCUMFLEX */
510 case 0xD5: u=0x0120; break; /* LATIN CAP G WITH DOT ABOVE */
511 case 0xD8: u=0x011C; break; /* LATIN CAP G WITH CIRCUMFLEX */
512 case 0xDD: u=0x016C; break; /* LATIN CAP U WITH BREVE */
513 case 0xDE: u=0x015C; break; /* LATIN CAP S WITH CIRCUMFLEX */
514 case 0xE5: u=0x010B; break; /* LATIN SMALL C WITH DOT ABOVE */
515 case 0xE6: u=0x0109; break; /* LATIN SMALL C WITH CIRCUMFLEX */
516 case 0xF5: u=0x0121; break; /* LATIN SMALL G WITH DOT ABOVE */
517 case 0xF8: u=0x011D; break; /* LATIN SMALL G WITH CIRCUMFLEX */
518 case 0xFD: u=0x016D; break; /* LATIN SMALL U WITH BREVE */
519 case 0xFE: u=0x015D; break; /* LATIN SMALL S WITH CIRCUMFLEX */
520 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
523 case 4: /* ISO 8859-4: Latin4: central European */
526 { case 0xA1: u=0x0104; break; /* LATIN CAP A WITH OGONEK */
527 case 0xA2: u=0x0138; break; /* LATIN SMALL KRA */
528 case 0xA3: u=0x0156; break; /* LATIN CAP R WITH CEDILLA */
529 case 0xA5: u=0x0128; break; /* LATIN CAP I WITH TILDE */
530 case 0xA6: u=0x013B; break; /* LATIN CAP L WITH CEDILLA */
531 case 0xA9: u=0x0160; break; /* LATIN CAP S WITH CARON */
532 case 0xAA: u=0x0112; break; /* LATIN CAP E WITH MACRON */
533 case 0xAB: u=0x0122; break; /* LATIN CAP G WITH CEDILLA */
534 case 0xAC: u=0x0166; break; /* LATIN CAP T WITH STROKE */
535 case 0xAE: u=0x017D; break; /* LATIN CAP Z WITH CARON */
536 case 0xB1: u=0x0105; break; /* LATIN SMALL A WITH OGONEK */
537 case 0xB2: u=0x02DB; break; /* OGONEK */
538 case 0xB3: u=0x0157; break; /* LATIN SMALL R WITH CEDILLA */
539 case 0xB5: u=0x0129; break; /* LATIN SMALL I WITH TILDE */
540 case 0xB6: u=0x013C; break; /* LATIN SMALL L WITH CEDILLA */
541 case 0xB7: u=0x02C7; break; /* CARON */
542 case 0xB9: u=0x0161; break; /* LATIN SMALL S WITH CARON */
543 case 0xBA: u=0x0113; break; /* LATIN SMALL E WITH MACRON */
544 case 0xBB: u=0x0123; break; /* LATIN SMALL G WITH CEDILLA */
545 case 0xBC: u=0x0167; break; /* LATIN SMALL T WITH STROKE */
546 case 0xBD: u=0x014A; break; /* LATIN CAP ENG */
547 case 0xBE: u=0x017E; break; /* LATIN SMALL Z WITH CARON */
548 case 0xBF: u=0x014B; break; /* LATIN SMALL ENG */
549 case 0xC0: u=0x0100; break; /* LATIN CAP A WITH MACRON */
550 case 0xC7: u=0x012E; break; /* LATIN CAP I WITH OGONEK */
551 case 0xC8: u=0x010C; break; /* LATIN CAP C WITH CARON */
552 case 0xCA: u=0x0118; break; /* LATIN CAP E WITH OGONEK */
553 case 0xCC: u=0x0116; break; /* LATIN CAP E WITH DOT ABOVE */
554 case 0xCF: u=0x012A; break; /* LATIN CAP I WITH MACRON */
555 case 0xD0: u=0x0110; break; /* LATIN CAP D WITH STROKE */
556 case 0xD1: u=0x0145; break; /* LATIN CAP N WITH CEDILLA */
557 case 0xD2: u=0x014C; break; /* LATIN CAP O WITH MACRON */
558 case 0xD3: u=0x0136; break; /* LATIN CAP K WITH CEDILLA */
559 case 0xD9: u=0x0172; break; /* LATIN CAP U WITH OGONEK */
560 case 0xDD: u=0x0168; break; /* LATIN CAP U WITH TILDE */
561 case 0xDE: u=0x016A; break; /* LATIN CAP U WITH MACRON */
562 case 0xE0: u=0x0101; break; /* LATIN SMALL A WITH MACRON */
563 case 0xE7: u=0x012F; break; /* LATIN SMALL I WITH OGONEK */
564 case 0xE8: u=0x010D; break; /* LATIN SMALL C WITH CARON */
565 case 0xEA: u=0x0119; break; /* LATIN SMALL E WITH OGONEK */
566 case 0xEC: u=0x0117; break; /* LATIN SMALL E WITH DOT ABOVE */
567 case 0xEF: u=0x012B; break; /* LATIN SMALL I WITH MACRON */
568 case 0xF0: u=0x0111; break; /* LATIN SMALL D WITH STROKE */
569 case 0xF1: u=0x0146; break; /* LATIN SMALL N WITH CEDILLA */
570 case 0xF2: u=0x014D; break; /* LATIN SMALL O WITH MACRON */
571 case 0xF3: u=0x0137; break; /* LATIN SMALL K WITH CEDILLA */
572 case 0xF9: u=0x0173; break; /* LATIN SMALL U WITH OGONEK */
573 case 0xFD: u=0x0169; break; /* LATIN SMALL U WITH TILDE */
574 case 0xFE: u=0x016B; break; /* LATIN SMALL U WITH MACRON */
575 case 0xFF: u=0x02D9; break; /* DOT ABOVE */
578 case 5: /* ISO 8859-5: Cyrillic */
581 { case 0xA1: u=0x0401; break; /* CYRILLIC CAP IO */
582 case 0xA2: u=0x0402; break; /* CYRILLIC CAP DJE */
583 case 0xA3: u=0x0403; break; /* CYRILLIC CAP GJE */
584 case 0xA4: u=0x0404; break; /* CYRILLIC CAP UKRAINIAN IE */
585 case 0xA5: u=0x0405; break; /* CYRILLIC CAP DZE */
586 case 0xA6: u=0x0406; break; /* CYRILLIC CAP BYELORUSSIAN-UKRAINIAN I */
587 case 0xA7: u=0x0407; break; /* CYRILLIC CAP YI */
588 case 0xA8: u=0x0408; break; /* CYRILLIC CAP JE */
589 case 0xA9: u=0x0409; break; /* CYRILLIC CAP LJE */
590 case 0xAA: u=0x040A; break; /* CYRILLIC CAP NJE */
591 case 0xAB: u=0x040B; break; /* CYRILLIC CAP TSHE */
592 case 0xAC: u=0x040C; break; /* CYRILLIC CAP KJE */
593 case 0xAE: u=0x040E; break; /* CYRILLIC CAP SHORT U */
594 case 0xAF: u=0x040F; break; /* CYRILLIC CAP DZHE */
595 case 0xB0: u=0x0410; break; /* CYRILLIC CAP A */
596 case 0xB1: u=0x0411; break; /* CYRILLIC CAP BE */
597 case 0xB2: u=0x0412; break; /* CYRILLIC CAP VE */
598 case 0xB3: u=0x0413; break; /* CYRILLIC CAP GHE */
599 case 0xB4: u=0x0414; break; /* CYRILLIC CAP DE */
600 case 0xB5: u=0x0415; break; /* CYRILLIC CAP IE */
601 case 0xB6: u=0x0416; break; /* CYRILLIC CAP ZHE */
602 case 0xB7: u=0x0417; break; /* CYRILLIC CAP ZE */
603 case 0xB8: u=0x0418; break; /* CYRILLIC CAP I */
604 case 0xB9: u=0x0419; break; /* CYRILLIC CAP SHORT I */
605 case 0xBA: u=0x041A; break; /* CYRILLIC CAP KA */
606 case 0xBB: u=0x041B; break; /* CYRILLIC CAP EL */
607 case 0xBC: u=0x041C; break; /* CYRILLIC CAP EM */
608 case 0xBD: u=0x041D; break; /* CYRILLIC CAP EN */
609 case 0xBE: u=0x041E; break; /* CYRILLIC CAP O */
610 case 0xBF: u=0x041F; break; /* CYRILLIC CAP PE */
611 case 0xC0: u=0x0420; break; /* CYRILLIC CAP ER */
612 case 0xC1: u=0x0421; break; /* CYRILLIC CAP ES */
613 case 0xC2: u=0x0422; break; /* CYRILLIC CAP TE */
614 case 0xC3: u=0x0423; break; /* CYRILLIC CAP U */
615 case 0xC4: u=0x0424; break; /* CYRILLIC CAP EF */
616 case 0xC5: u=0x0425; break; /* CYRILLIC CAP HA */
617 case 0xC6: u=0x0426; break; /* CYRILLIC CAP TSE */
618 case 0xC7: u=0x0427; break; /* CYRILLIC CAP CHE */
619 case 0xC8: u=0x0428; break; /* CYRILLIC CAP SHA */
620 case 0xC9: u=0x0429; break; /* CYRILLIC CAP SHCHA */
621 case 0xCA: u=0x042A; break; /* CYRILLIC CAP HARD SIGN */
622 case 0xCB: u=0x042B; break; /* CYRILLIC CAP YERU */
623 case 0xCC: u=0x042C; break; /* CYRILLIC CAP SOFT SIGN */
624 case 0xCD: u=0x042D; break; /* CYRILLIC CAP E */
625 case 0xCE: u=0x042E; break; /* CYRILLIC CAP YU */
626 case 0xCF: u=0x042F; break; /* CYRILLIC CAP YA */
627 case 0xD0: u=0x0430; break; /* CYRILLIC SMALL A */
628 case 0xD1: u=0x0431; break; /* CYRILLIC SMALL BE */
629 case 0xD2: u=0x0432; break; /* CYRILLIC SMALL VE */
630 case 0xD3: u=0x0433; break; /* CYRILLIC SMALL GHE */
631 case 0xD4: u=0x0434; break; /* CYRILLIC SMALL DE */
632 case 0xD5: u=0x0435; break; /* CYRILLIC SMALL IE */
633 case 0xD6: u=0x0436; break; /* CYRILLIC SMALL ZHE */
634 case 0xD7: u=0x0437; break; /* CYRILLIC SMALL ZE */
635 case 0xD8: u=0x0438; break; /* CYRILLIC SMALL I */
636 case 0xD9: u=0x0439; break; /* CYRILLIC SMALL SHORT I */
637 case 0xDA: u=0x043A; break; /* CYRILLIC SMALL KA */
638 case 0xDB: u=0x043B; break; /* CYRILLIC SMALL EL */
639 case 0xDC: u=0x043C; break; /* CYRILLIC SMALL EM */
640 case 0xDD: u=0x043D; break; /* CYRILLIC SMALL EN */
641 case 0xDE: u=0x043E; break; /* CYRILLIC SMALL O */
642 case 0xDF: u=0x043F; break; /* CYRILLIC SMALL PE */
643 case 0xE0: u=0x0440; break; /* CYRILLIC SMALL ER */
644 case 0xE1: u=0x0441; break; /* CYRILLIC SMALL ES */
645 case 0xE2: u=0x0442; break; /* CYRILLIC SMALL TE */
646 case 0xE3: u=0x0443; break; /* CYRILLIC SMALL U */
647 case 0xE4: u=0x0444; break; /* CYRILLIC SMALL EF */
648 case 0xE5: u=0x0445; break; /* CYRILLIC SMALL HA */
649 case 0xE6: u=0x0446; break; /* CYRILLIC SMALL TSE */
650 case 0xE7: u=0x0447; break; /* CYRILLIC SMALL CHE */
651 case 0xE8: u=0x0448; break; /* CYRILLIC SMALL SHA */
652 case 0xE9: u=0x0449; break; /* CYRILLIC SMALL SHCHA */
653 case 0xEA: u=0x044A; break; /* CYRILLIC SMALL HARD SIGN */
654 case 0xEB: u=0x044B; break; /* CYRILLIC SMALL YERU */
655 case 0xEC: u=0x044C; break; /* CYRILLIC SMALL SOFT SIGN */
656 case 0xED: u=0x044D; break; /* CYRILLIC SMALL E */
657 case 0xEE: u=0x044E; break; /* CYRILLIC SMALL YU */
658 case 0xEF: u=0x044F; break; /* CYRILLIC SMALL YA */
659 case 0xF0: u=0x2116; break; /* NUMERO SIGN */
660 case 0xF1: u=0x0451; break; /* CYRILLIC SMALL IO */
661 case 0xF2: u=0x0452; break; /* CYRILLIC SMALL DJE */
662 case 0xF3: u=0x0453; break; /* CYRILLIC SMALL GJE */
663 case 0xF4: u=0x0454; break; /* CYRILLIC SMALL UKRAINIAN IE */
664 case 0xF5: u=0x0455; break; /* CYRILLIC SMALL DZE */
665 case 0xF6: u=0x0456; break; /* CYRILLIC SMALL BYELORUSSIAN-UKRAINIAN I */
666 case 0xF7: u=0x0457; break; /* CYRILLIC SMALL YI */
667 case 0xF8: u=0x0458; break; /* CYRILLIC SMALL JE */
668 case 0xF9: u=0x0459; break; /* CYRILLIC SMALL LJE */
669 case 0xFA: u=0x045A; break; /* CYRILLIC SMALL NJE */
670 case 0xFB: u=0x045B; break; /* CYRILLIC SMALL TSHE */
671 case 0xFC: u=0x045C; break; /* CYRILLIC SMALL KJE */
672 case 0xFD: u=0x00A7; break; /* SECTION SIGN */
673 case 0xFE: u=0x045E; break; /* CYRILLIC SMALL SHORT U */
674 case 0xFF: u=0x045F; break; /* CYRILLIC SMALL DZHE */
677 case 6: /* ISO 8859-6: Arabic */
680 { case 0xAC: u=0x060C; break; /* ARABIC COMMA */
681 case 0xBB: u=0x061B; break; /* ARABIC SEMICOLON */
682 case 0xBF: u=0x061F; break; /* ARABIC QUESTION MARK */
683 case 0xC1: u=0x0621; break; /* ARABIC HAMZA */
684 case 0xC2: u=0x0622; break; /* ARABIC ALEF WITH MADDA ABOVE */
685 case 0xC3: u=0x0623; break; /* ARABIC ALEF WITH HAMZA ABOVE */
686 case 0xC4: u=0x0624; break; /* ARABIC WAW WITH HAMZA ABOVE */
687 case 0xC5: u=0x0625; break; /* ARABIC ALEF WITH HAMZA BELOW */
688 case 0xC6: u=0x0626; break; /* ARABIC YEH WITH HAMZA ABOVE */
689 case 0xC7: u=0x0627; break; /* ARABIC ALEF */
690 case 0xC8: u=0x0628; break; /* ARABIC BEH */
691 case 0xC9: u=0x0629; break; /* ARABIC TEH MARBUTA */
692 case 0xCA: u=0x062A; break; /* ARABIC TEH */
693 case 0xCB: u=0x062B; break; /* ARABIC THEH */
694 case 0xCC: u=0x062C; break; /* ARABIC JEEM */
695 case 0xCD: u=0x062D; break; /* ARABIC HAH */
696 case 0xCE: u=0x062E; break; /* ARABIC KHAH */
697 case 0xCF: u=0x062F; break; /* ARABIC DAL */
698 case 0xD0: u=0x0630; break; /* ARABIC THAL */
699 case 0xD1: u=0x0631; break; /* ARABIC REH */
700 case 0xD2: u=0x0632; break; /* ARABIC ZAIN */
701 case 0xD3: u=0x0633; break; /* ARABIC SEEN */
702 case 0xD4: u=0x0634; break; /* ARABIC SHEEN */
703 case 0xD5: u=0x0635; break; /* ARABIC SAD */
704 case 0xD6: u=0x0636; break; /* ARABIC DAD */
705 case 0xD7: u=0x0637; break; /* ARABIC TAH */
706 case 0xD8: u=0x0638; break; /* ARABIC ZAH */
707 case 0xD9: u=0x0639; break; /* ARABIC AIN */
708 case 0xDA: u=0x063A; break; /* ARABIC GHAIN */
709 case 0xE0: u=0x0640; break; /* ARABIC TATWEEL */
710 case 0xE1: u=0x0641; break; /* ARABIC FEH */
711 case 0xE2: u=0x0642; break; /* ARABIC QAF */
712 case 0xE3: u=0x0643; break; /* ARABIC KAF */
713 case 0xE4: u=0x0644; break; /* ARABIC LAM */
714 case 0xE5: u=0x0645; break; /* ARABIC MEEM */
715 case 0xE6: u=0x0646; break; /* ARABIC NOON */
716 case 0xE7: u=0x0647; break; /* ARABIC HEH */
717 case 0xE8: u=0x0648; break; /* ARABIC WAW */
718 case 0xE9: u=0x0649; break; /* ARABIC ALEF MAKSURA */
719 case 0xEA: u=0x064A; break; /* ARABIC YEH */
720 case 0xEB: u=0x064B; break; /* ARABIC FATHATAN */
721 case 0xEC: u=0x064C; break; /* ARABIC DAMMATAN */
722 case 0xED: u=0x064D; break; /* ARABIC KASRATAN */
723 case 0xEE: u=0x064E; break; /* ARABIC FATHA */
724 case 0xEF: u=0x064F; break; /* ARABIC DAMMA */
725 case 0xF0: u=0x0650; break; /* ARABIC KASRA */
726 case 0xF1: u=0x0651; break; /* ARABIC SHADDA */
727 case 0xF2: u=0x0652; break; /* ARABIC SUKUN */
730 case 7: /* ISO 8859-7: Greek */
733 { case 0xA1: u=0x02BD; break; /* MODIFIER REVERSED COMMA */
734 case 0xA2: u=0x02BC; break; /* MODIFIER APOSTROPHE */
735 case 0xAF: u=0x2015; break; /* HORIZONTAL BAR */
736 case 0xB4: u=0x0384; break; /* GREEK TONOS */
737 case 0xB5: u=0x0385; break; /* GREEK DIALYTIKA TONOS */
738 case 0xB6: u=0x0386; break; /* GREEK CAP ALPHA WITH TONOS */
739 case 0xB8: u=0x0388; break; /* GREEK CAP EPSILON WITH TONOS */
740 case 0xB9: u=0x0389; break; /* GREEK CAP ETA WITH TONOS */
741 case 0xBA: u=0x038A; break; /* GREEK CAP IOTA WITH TONOS */
742 case 0xBC: u=0x038C; break; /* GREEK CAP OMICRON WITH TONOS */
743 case 0xBE: u=0x038E; break; /* GREEK CAP UPSILON WITH TONOS */
744 case 0xBF: u=0x038F; break; /* GREEK CAP OMEGA WITH TONOS */
745 case 0xC0: u=0x0390; break; /* GREEK SMALL IOTA WITH DIALYTIKA AND TONOS */
746 case 0xC1: u=0x0391; break; /* GREEK CAP ALPHA */
747 case 0xC2: u=0x0392; break; /* GREEK CAP BETA */
748 case 0xC3: u=0x0393; break; /* GREEK CAP GAMMA */
749 case 0xC4: u=0x0394; break; /* GREEK CAP DELTA */
750 case 0xC5: u=0x0395; break; /* GREEK CAP EPSILON */
751 case 0xC6: u=0x0396; break; /* GREEK CAP ZETA */
752 case 0xC7: u=0x0397; break; /* GREEK CAP ETA */
753 case 0xC8: u=0x0398; break; /* GREEK CAP THETA */
754 case 0xC9: u=0x0399; break; /* GREEK CAP IOTA */
755 case 0xCA: u=0x039A; break; /* GREEK CAP KAPPA */
756 case 0xCB: u=0x039B; break; /* GREEK CAP LAMDA */
757 case 0xCC: u=0x039C; break; /* GREEK CAP MU */
758 case 0xCD: u=0x039D; break; /* GREEK CAP NU */
759 case 0xCE: u=0x039E; break; /* GREEK CAP XI */
760 case 0xCF: u=0x039F; break; /* GREEK CAP OMICRON */
761 case 0xD0: u=0x03A0; break; /* GREEK CAP PI */
762 case 0xD1: u=0x03A1; break; /* GREEK CAP RHO */
763 case 0xD3: u=0x03A3; break; /* GREEK CAP SIGMA */
764 case 0xD4: u=0x03A4; break; /* GREEK CAP TAU */
765 case 0xD5: u=0x03A5; break; /* GREEK CAP UPSILON */
766 case 0xD6: u=0x03A6; break; /* GREEK CAP PHI */
767 case 0xD7: u=0x03A7; break; /* GREEK CAP CHI */
768 case 0xD8: u=0x03A8; break; /* GREEK CAP PSI */
769 case 0xD9: u=0x03A9; break; /* GREEK CAP OMEGA */
770 case 0xDA: u=0x03AA; break; /* GREEK CAP IOTA WITH DIALYTIKA */
771 case 0xDB: u=0x03AB; break; /* GREEK CAP UPSILON WITH DIALYTIKA */
772 case 0xDC: u=0x03AC; break; /* GREEK SMALL ALPHA WITH TONOS */
773 case 0xDD: u=0x03AD; break; /* GREEK SMALL EPSILON WITH TONOS */
774 case 0xDE: u=0x03AE; break; /* GREEK SMALL ETA WITH TONOS */
775 case 0xDF: u=0x03AF; break; /* GREEK SMALL IOTA WITH TONOS */
776 case 0xE0: u=0x03B0; break; /* GREEK SMALL UPSILON WITH DIALYTIKA AND TONOS */
777 case 0xE1: u=0x03B1; break; /* GREEK SMALL ALPHA */
778 case 0xE2: u=0x03B2; break; /* GREEK SMALL BETA */
779 case 0xE3: u=0x03B3; break; /* GREEK SMALL GAMMA */
780 case 0xE4: u=0x03B4; break; /* GREEK SMALL DELTA */
781 case 0xE5: u=0x03B5; break; /* GREEK SMALL EPSILON */
782 case 0xE6: u=0x03B6; break; /* GREEK SMALL ZETA */
783 case 0xE7: u=0x03B7; break; /* GREEK SMALL ETA */
784 case 0xE8: u=0x03B8; break; /* GREEK SMALL THETA */
785 case 0xE9: u=0x03B9; break; /* GREEK SMALL IOTA */
786 case 0xEA: u=0x03BA; break; /* GREEK SMALL KAPPA */
787 case 0xEB: u=0x03BB; break; /* GREEK SMALL LAMDA */
788 case 0xEC: u=0x03BC; break; /* GREEK SMALL MU */
789 case 0xED: u=0x03BD; break; /* GREEK SMALL NU */
790 case 0xEE: u=0x03BE; break; /* GREEK SMALL XI */
791 case 0xEF: u=0x03BF; break; /* GREEK SMALL OMICRON */
792 case 0xF0: u=0x03C0; break; /* GREEK SMALL PI */
793 case 0xF1: u=0x03C1; break; /* GREEK SMALL RHO */
794 case 0xF2: u=0x03C2; break; /* GREEK SMALL FINAL SIGMA */
795 case 0xF3: u=0x03C3; break; /* GREEK SMALL SIGMA */
796 case 0xF4: u=0x03C4; break; /* GREEK SMALL TAU */
797 case 0xF5: u=0x03C5; break; /* GREEK SMALL UPSILON */
798 case 0xF6: u=0x03C6; break; /* GREEK SMALL PHI */
799 case 0xF7: u=0x03C7; break; /* GREEK SMALL CHI */
800 case 0xF8: u=0x03C8; break; /* GREEK SMALL PSI */
801 case 0xF9: u=0x03C9; break; /* GREEK SMALL OMEGA */
802 case 0xFA: u=0x03CA; break; /* GREEK SMALL IOTA WITH DIALYTIKA */
803 case 0xFB: u=0x03CB; break; /* GREEK SMALL UPSILON WITH DIALYTIKA */
804 case 0xFC: u=0x03CC; break; /* GREEK SMALL OMICRON WITH TONOS */
805 case 0xFD: u=0x03CD; break; /* GREEK SMALL UPSILON WITH TONOS */
806 case 0xFE: u=0x03CE; break; /* GREEK SMALL OMEGA WITH TONOS */
809 case 8: /* ISO 8859-8: Hebrew */
812 { case 0xAA: u=0x00D7; break; /* MULTIPLICATION SIGN */
813 case 0xAF: u=0x203E; break; /* OVERLINE */
814 case 0xBA: u=0x00F7; break; /* DIVISION SIGN */
815 case 0xDF: u=0x2017; break; /* DOUBLE LOW LINE */
816 case 0xE0: u=0x05D0; break; /* HEBREW ALEF */
817 case 0xE1: u=0x05D1; break; /* HEBREW BET */
818 case 0xE2: u=0x05D2; break; /* HEBREW GIMEL */
819 case 0xE3: u=0x05D3; break; /* HEBREW DALET */
820 case 0xE4: u=0x05D4; break; /* HEBREW HE */
821 case 0xE5: u=0x05D5; break; /* HEBREW VAV */
822 case 0xE6: u=0x05D6; break; /* HEBREW ZAYIN */
823 case 0xE7: u=0x05D7; break; /* HEBREW HET */
824 case 0xE8: u=0x05D8; break; /* HEBREW TET */
825 case 0xE9: u=0x05D9; break; /* HEBREW YOD */
826 case 0xEA: u=0x05DA; break; /* HEBREW FINAL KAF */
827 case 0xEB: u=0x05DB; break; /* HEBREW KAF */
828 case 0xEC: u=0x05DC; break; /* HEBREW LAMED */
829 case 0xED: u=0x05DD; break; /* HEBREW FINAL MEM */
830 case 0xEE: u=0x05DE; break; /* HEBREW MEM */
831 case 0xEF: u=0x05DF; break; /* HEBREW FINAL NUN */
832 case 0xF0: u=0x05E0; break; /* HEBREW NUN */
833 case 0xF1: u=0x05E1; break; /* HEBREW SAMEKH */
834 case 0xF2: u=0x05E2; break; /* HEBREW AYIN */
835 case 0xF3: u=0x05E3; break; /* HEBREW FINAL PE */
836 case 0xF4: u=0x05E4; break; /* HEBREW PE */
837 case 0xF5: u=0x05E5; break; /* HEBREW FINAL TSADI */
838 case 0xF6: u=0x05E6; break; /* HEBREW TSADI */
839 case 0xF7: u=0x05E7; break; /* HEBREW QOF */
840 case 0xF8: u=0x05E8; break; /* HEBREW RESH */
841 case 0xF9: u=0x05E9; break; /* HEBREW SHIN */
842 case 0xFA: u=0x05EA; break; /* HEBREW TAV */
845 case 9: /* ISO 8859-9: Latin5: west European without Icelandic */
849 case 0xD0: u=0x011E; break; /* LATIN CAP G WITH BREVE */
850 case 0xDD: u=0x0130; break; /* LATIN CAP I WITH DOT ABOVE */
851 case 0xDE: u=0x015e; break; /* LATIN CAP S WITH CEDILLA */
852 case 0xF0: u=0x011f; break; /* LATIN SMALL G WITH BREVE */
853 case 0xFD: u=0x0131; break; /* LATIN SMALL DOTLESS I */
854 case 0xFE: u=0x015f; break; /* LATIN SMALL S WITH CEDILLA */
861 /* ------------------------------------------------------------------------- */
862 /* (3) Unicode -> ZSCII and vice versa */
864 /* Need not be rapid, as the results are mostly cached. */
865 /* Unicode chars which can't be fitted into ZSCII are converted to the */
866 /* value 5 (the pad character used in the dictionary and elsewhere). */
867 /* ------------------------------------------------------------------------- */
/* zscii_defn_modified:   set TRUE by zscii_unicode_map() each time an entry */
/*                        is stored; make_unicode_zscii_map() clears it      */
/*                        again when character_set_setting is below 2.       */
/* zscii_high_water_mark: number of high-set entries currently in use;       */
/*                        entry i corresponds to ZSCII code 155+i.           */
869 int zscii_defn_modified, zscii_high_water_mark;
/* Unicode value for each ZSCII code 155..251, indexed by (zscii - 155);     */
/* 0x61 = 97 slots.                                                          */
871 int32 zscii_to_unicode_grid[0x61];
/* Record that ZSCII code `zscii` stands for the Unicode value `unicode`     */
/* and flag the ZSCII definition as modified.  Codes outside 155..251 are    */
/* reported through compiler_error().                                        */
873 static void zscii_unicode_map(int zscii, int32 unicode)
874 { if ((zscii < 155) || (zscii > 251))
875 { compiler_error("Attempted to map a Unicode character into the ZSCII \
876 set at an illegal position");
/* NOTE(review): the lines between the error call and the store below are    */
/* not visible in this listing; confirm the error path returns first,        */
/* otherwise the store indexes outside zscii_to_unicode_grid.                */
879 zscii_to_unicode_grid[zscii-155] = unicode;
880 zscii_defn_modified = TRUE;
/* Number of entries in the default high set for each character_set_setting  */
/* 0..9; make_unicode_zscii_map() reads element [character_set_setting] as   */
/* the initial high water mark, so each value must equal the length of the   */
/* corresponding default_zscii_to_unicode_c* table.                          */
883 int default_zscii_highset_sizes[] = { 69, 69, 81, 71, 82, 92, 48, 71, 27, 62 };
/* Default high set loaded by make_unicode_zscii_map() for character set 1   */
/* (Latin1): entry i is the Unicode value given to ZSCII code 155+i.         */
/* text_to_unicode() and zscii_to_text() index this table in step with the   */
/* two-character codes held in `accents` (pair i <-> entry i), which is why  */
/* the order must not be changed.                                            */
885 int32 default_zscii_to_unicode_c01[]
886 = { /* (This ordering is important, unlike those for other char sets)
887 The 69 characters making up the default Unicode translation
888 table (see the Z-Machine Standard 1.0). */
890 0xe4, /* a-diaeresis */ 0xf6, /* o-diaeresis */ 0xfc, /* u-diaeresis */
891 0xc4, /* A-diaeresis */ 0xd6, /* O-diaeresis */ 0xdc, /* U-diaeresis */
892 0xdf, /* sz-ligature */ 0xbb, /* >> */ 0xab, /* << */
893 0xeb, /* e-diaeresis */ 0xef, /* i-diaeresis */ 0xff, /* y-diaeresis */
894 0xcb, /* E-diaeresis */ 0xcf, /* I-diaeresis */ 0xe1, /* a-acute */
895 0xe9, /* e-acute */ 0xed, /* i-acute */ 0xf3, /* o-acute */
896 0xfa, /* u-acute */ 0xfd, /* y-acute */ 0xc1, /* A-acute */
897 0xc9, /* E-acute */ 0xcd, /* I-acute */ 0xd3, /* O-acute */
898 0xda, /* U-acute */ 0xdd, /* Y-acute */ 0xe0, /* a-grave */
899 0xe8, /* e-grave */ 0xec, /* i-grave */ 0xf2, /* o-grave */
900 0xf9, /* u-grave */ 0xc0, /* A-grave */ 0xc8, /* E-grave */
901 0xcc, /* I-grave */ 0xd2, /* O-grave */ 0xd9, /* U-grave */
902 0xe2, /* a-circumflex */ 0xea, /* e-circumflex */
903 0xee, /* i-circumflex */ 0xf4, /* o-circumflex */
904 0xfb, /* u-circumflex */ 0xc2, /* A-circumflex */
905 0xca, /* E-circumflex */ 0xce, /* I-circumflex */
906 0xd4, /* O-circumflex */ 0xdb, /* U-circumflex */
907 0xe5, /* a-ring */ 0xc5, /* A-ring */
908 0xf8, /* o-slash */ 0xd8, /* O-slash */
909 0xe3, /* a-tilde */ 0xf1, /* n-tilde */ 0xf5, /* o-tilde */
910 0xc3, /* A-tilde */ 0xd1, /* N-tilde */ 0xd5, /* O-tilde */
911 0xe6, /* ae-ligature */ 0xc6, /* AE-ligature */
912 0xe7, /* c-cedilla */ 0xc7, /* C-cedilla */
913 0xfe, /* thorn */ 0xf0, /* eth */ 0xde, /* Thorn */ 0xd0, /* Eth */
914 0xa3, /* pound symbol */
915 0x0153, /* oe-ligature */ 0x0152, /* OE-ligature */
916 0xa1, /* inverted ! */ 0xbf /* inverted ? */ };
/* Default high set for character set 2 (Latin2): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.                                      */
/* NOTE(review): default_zscii_highset_sizes[2] is 81, but only 80 values    */
/* and no closing brace are visible in this listing; the final row is on a   */
/* line not shown and must be checked against the full file.                 */
918 int32 default_zscii_to_unicode_c2[]
919 = { /* The 81 accented letters in Latin2 */
920 0x0104, 0x0141, 0x013D, 0x015A, 0x0160, 0x015E, 0x0164, 0x0179,
921 0x017D, 0x017B, 0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139,
922 0x0106, 0x00C7, 0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD,
923 0x00CE, 0x010E, 0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150,
924 0x00D6, 0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162,
925 0x0105, 0x0142, 0x013E, 0x015B, 0x0161, 0x015F, 0x0165, 0x017A,
926 0x017E, 0x017C, 0x00DF, 0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4,
927 0x013A, 0x0107, 0x00E7, 0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B,
928 0x00ED, 0x00EE, 0x010F, 0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4,
929 0x0151, 0x00F6, 0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD,
/* Default high set for character set 3 (Latin3): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.  The 71 values here match            */
/* default_zscii_highset_sizes[3].                                           */
932 int32 default_zscii_to_unicode_c3[]
933 = { /* The 71 accented letters in Latin3 */
934 0x0126, 0x0124, 0x0130, 0x015E, 0x011E, 0x0134, 0x017B, 0x0127,
935 0x0125, 0x0131, 0x015F, 0x011F, 0x0135, 0x017C, 0x00C0, 0x00C1,
936 0x00C2, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA,
937 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x00D1, 0x00D2, 0x00D3,
938 0x00D4, 0x0120, 0x00D6, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC,
939 0x016C, 0x015C, 0x00DF, 0x00E0, 0x00E1, 0x00E2, 0x00E4, 0x010B,
940 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED,
941 0x00EE, 0x00EF, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6,
942 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D };
/* Default high set for character set 4 (Latin4): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.                                      */
/* NOTE(review): default_zscii_highset_sizes[4] is 82, but only 80 values    */
/* and no closing brace are visible in this listing; the final row is on a   */
/* line not shown and must be checked against the full file.                 */
944 int32 default_zscii_to_unicode_c4[]
945 = { /* The 82 accented letters in Latin4 */
946 0x0104, 0x0138, 0x0156, 0x0128, 0x013B, 0x0160, 0x0112, 0x0122,
947 0x0166, 0x017D, 0x0105, 0x0157, 0x0129, 0x013C, 0x0161, 0x0113,
948 0x0123, 0x0167, 0x014A, 0x017E, 0x014B, 0x0100, 0x00C1, 0x00C2,
949 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E, 0x010C, 0x00C9, 0x0118,
950 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A, 0x0110, 0x0145, 0x014C,
951 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D8, 0x0172, 0x00DA, 0x00DB,
952 0x00DC, 0x0168, 0x016A, 0x00DF, 0x0101, 0x00E1, 0x00E2, 0x00E3,
953 0x00E4, 0x00E5, 0x00E6, 0x012F, 0x010D, 0x00E9, 0x0119, 0x00EB,
954 0x0117, 0x00ED, 0x00EE, 0x012B, 0x0111, 0x0146, 0x014D, 0x0137,
955 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC,
/* Default high set for character set 5 (Cyrillic): make_unicode_zscii_map() */
/* assigns entry i to ZSCII code 155+i.  The 92 values here match            */
/* default_zscii_highset_sizes[5].  (The "accented letters" wording in the   */
/* inline comment is carried over from the Latin tables; the values are in   */
/* the range 0x0401..0x045F.)                                                */
958 int32 default_zscii_to_unicode_c5[]
959 = { /* The 92 accented letters in Cyrillic */
960 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408,
961 0x0409, 0x040A, 0x040B, 0x040C, 0x040E, 0x040F, 0x0410, 0x0411,
962 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419,
963 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, 0x0420, 0x0421,
964 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429,
965 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431,
966 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439,
967 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, 0x0440, 0x0441,
968 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449,
969 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x0451, 0x0452,
970 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045A,
971 0x045B, 0x045C, 0x045E, 0x045F };
/* Default high set for character set 6 (Arabic): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.  The 48 values here match            */
/* default_zscii_highset_sizes[6].  (The "accented letters" wording in the   */
/* inline comment is carried over from the Latin tables; the values are in   */
/* the range 0x060C..0x0652.)                                                */
973 int32 default_zscii_to_unicode_c6[]
974 = { /* The 48 accented letters in Arabic */
975 0x060C, 0x061B, 0x061F, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625,
976 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D,
977 0x062E, 0x062F, 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635,
978 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642,
979 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A,
980 0x064B, 0x064C, 0x064D, 0x064E, 0x064F, 0x0650, 0x0651, 0x0652 };
/* Default high set for character set 7 (Greek): make_unicode_zscii_map()    */
/* assigns entry i to ZSCII code 155+i.  The 71 values here match            */
/* default_zscii_highset_sizes[7].                                           */
982 int32 default_zscii_to_unicode_c7[]
983 = { /* The 71 accented letters in Greek */
984 0x0384, 0x0385, 0x0386, 0x0388, 0x0389, 0x038A, 0x038C, 0x038E,
985 0x038F, 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396,
986 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E,
987 0x039F, 0x03A0, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
988 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
989 0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
990 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
991 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
992 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE };
/* Default high set for character set 8 (Hebrew): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.  The 27 values here match            */
/* default_zscii_highset_sizes[8].  (The "accented letters" wording in the   */
/* inline comment is carried over from the Latin tables; the values are      */
/* 0x05D0..0x05EA.)                                                          */
994 int32 default_zscii_to_unicode_c8[]
995 = { /* The 27 accented letters in Hebrew */
996 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
997 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
998 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
999 0x05E8, 0x05E9, 0x05EA };
/* Default high set for character set 9 (Latin5): make_unicode_zscii_map()   */
/* assigns entry i to ZSCII code 155+i.  The 62 values here match            */
/* default_zscii_highset_sizes[9].                                           */
1001 int32 default_zscii_to_unicode_c9[]
1002 = { /* The 62 accented letters in Latin5 */
1003 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
1004 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
1005 0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D8,
1006 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF, 0x00E0,
1007 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, 0x00E8,
1008 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x011F,
1009 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x00F9,
1010 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF };
/* Reset the ZSCII <-> Unicode table to the defaults for the current         */
/* character_set_setting: every one of the 0x61 slots is first set to '?',   */
/* the high water mark is taken from default_zscii_highset_sizes, and the    */
/* matching default table is then mapped onto ZSCII codes 155, 156, ...      */
/* Finally the ISO-to-alphabet grid is rebuilt.                              */
1012 static void make_unicode_zscii_map(void)
1015 for (i=0; i<0x61; i++) zscii_to_unicode_grid[i] = '?';
/* NOTE(review): character_set_setting indexes the 10-element size table     */
/* without a range check here; confirm it is validated where it is set.      */
1017 zscii_high_water_mark
1018 = default_zscii_highset_sizes[character_set_setting];
1020 for (i=0; i<zscii_high_water_mark; i++)
1021 { switch(character_set_setting)
/* NOTE(review): the label for setting 0 and the break after each case are   */
/* on lines not shown in this listing; presumably 0 shares the c01 table     */
/* with 1 (both have size 69) -- confirm against the full file.              */
1023 case 1: zscii_unicode_map(i+155, default_zscii_to_unicode_c01[i]);
1025 case 2: zscii_unicode_map(i+155, default_zscii_to_unicode_c2[i]);
1027 case 3: zscii_unicode_map(i+155, default_zscii_to_unicode_c3[i]);
1029 case 4: zscii_unicode_map(i+155, default_zscii_to_unicode_c4[i]);
1031 case 5: zscii_unicode_map(i+155, default_zscii_to_unicode_c5[i]);
1033 case 6: zscii_unicode_map(i+155, default_zscii_to_unicode_c6[i]);
1035 case 7: zscii_unicode_map(i+155, default_zscii_to_unicode_c7[i]);
1037 case 8: zscii_unicode_map(i+155, default_zscii_to_unicode_c8[i]);
1039 case 9: zscii_unicode_map(i+155, default_zscii_to_unicode_c9[i]);
/* zscii_unicode_map() set the modified flag on every store above; for       */
/* settings 0 and 1 it is cleared again here.                                */
1043 if (character_set_setting < 2) zscii_defn_modified = FALSE;
1044 make_iso_to_alphabet_grid();
1047 extern void new_zscii_character(int32 u, int plus_flag)
1049 if (u < 0 || u > 0xFFFF)
1050 error("Zcharacter table cannot contain Unicode characters beyond $FFFF");
1051 if (plus_flag == FALSE)
1052 zscii_high_water_mark = 0;
1053 if (zscii_high_water_mark == 0x61)
1054 error("No more room in the Zcharacter table");
1055 else zscii_unicode_map(155 + zscii_high_water_mark++, u);
/* Rebuild the ISO-to-alphabet grid by calling make_iso_to_alphabet_grid(),  */
/* the same step make_unicode_zscii_map() performs after loading defaults.   */
/* Presumably called once a run of new_zscii_character() calls is complete   */
/* -- the caller is outside this file section.                               */
1058 extern void new_zscii_finished(void)
1059 { make_iso_to_alphabet_grid();
/* Translate Unicode value u to ZSCII.  Values below 0x7f map to themselves; */
/* anything else is searched for among the high-set entries currently in use */
/* and, if found at index i, returns 155+i.                                  */
/* NOTE(review): the fallback return is on a line not shown in this listing; */
/* the section header above says unconvertible characters become 5.          */
1062 extern int unicode_to_zscii(int32 u)
1064 if (u < 0x7f) return u;
1065 for (i=0; i<zscii_high_water_mark; i++)
1066 if (zscii_to_unicode_grid[i] == u) return i+155;
/* Translate ZSCII value z to Unicode.  Values below 0x80 map to themselves; */
/* codes 155..251 are looked up in zscii_to_unicode_grid.                    */
/* NOTE(review): the result for any other z is on a line not shown in this   */
/* listing.  Also note the identity threshold here (0x80) differs from the   */
/* one in unicode_to_zscii() (0x7f) -- confirm that is intended.             */
1070 extern int32 zscii_to_unicode(int z)
1071 { if (z < 0x80) return z;
1072 if ((z >= 155) && (z <= 251)) return zscii_to_unicode_grid[z-155];
1076 /* ------------------------------------------------------------------------- */
1077 /* (4) Text -> Unicode */
1079 /* This routine is not used for ordinary text compilation as it is too */
1080 /* slow, but it's useful for handling @ string escapes, or to avoid writing */
1081 /* special code when speed is not especially required. */
1082 /* Note that the two string escapes which can define Unicode are: */
1084 /* @.. where .. is an accent */
1085 /* and @{...} where ... specifies a Unicode char in hexadecimal */
1086 /* (1 to 6 digits long) */
1088 /* If either syntax is malformed, an error is generated */
1089 /* and the Unicode (= ISO = ASCII) character value of '?' is returned */
1091 /* In Unicode mode (character_set_unicode is true), this handles UTF-8 */
1092 /* decoding as well as @-expansion. (So it's called when an '@' appears */
1093 /* *and* when a high-bit character appears.) */
1094 /* ------------------------------------------------------------------------- */
/* Set by text_to_unicode() on each call: the number of chars of the input   */
/* text that made up the character just decoded.                             */
1096 int textual_form_length;
/* Decode the single character whose textual form starts at text[0], return  */
/* its Unicode value and leave the length of that form in                    */
/* textual_form_length.                                                      */
/* NOTE(review): many short lines of this routine (braces, returns, some     */
/* conditions) are missing from this listing; the comments below describe    */
/* only what the visible lines establish.                                    */
1098 extern int32 text_to_unicode(char *text)
1102 { if (character_set_unicode)
1103 { if (text[0] & 0x80) /* 8-bit */
/* UTF-8: the top four bits of the lead byte select the sequence length, and */
/* every following byte must have the form 10xxxxxx.                         */
/* NOTE(review): no visible check rejects overlong encodings, surrogate      */
/* values or 4-byte results above 0x10FFFF -- confirm whether that matters.  */
1104 { switch (text[0] & 0xF0)
1105 { case 0xf0: /* 4-byte UTF-8 string */
1106 textual_form_length = 4;
1107 if ((text[0] & 0xf8) != 0xf0)
1108 { error("Invalid 4-byte UTF-8 string.");
1111 if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80 || (text[3] & 0xc0) != 0x80)
1112 { error("Invalid 4-byte UTF-8 string.");
1115 return (text[0] & 0x07) << 18
1116 | (text[1] & 0x3f) << 12
1117 | (text[2] & 0x3f) << 6
1120 case 0xe0: /* 3-byte UTF-8 string */
1121 textual_form_length = 3;
1122 if ((text[1] & 0xc0) != 0x80 || (text[2] & 0xc0) != 0x80)
1123 { error("Invalid 3-byte UTF-8 string.");
1126 return (text[0] & 0x0f) << 12
1127 | (text[1] & 0x3f) << 6
/* NOTE(review): 2-byte lead bytes span 0xC0..0xDF, so a second case label   */
/* (0xd0) is presumably on the omitted line that follows -- confirm.         */
1130 case 0xc0: /* 2-byte UTF-8 string */
1132 textual_form_length = 2;
1133 if ((text[1] & 0xc0) != 0x80)
1134 { error("Invalid 2-byte UTF-8 string.");
1137 return (text[0] & 0x1f) << 6
1140 default: /* broken */
1141 error("Invalid UTF-8 string.");
1142 textual_form_length = 1;
1147 else /* nice 7-bit */
1148 { textual_form_length = 1;
1149 return (uchar) text[0];
/* Single-byte path: one source byte, converted through iso_to_unicode().    */
1154 textual_form_length = 1;
1155 return iso_to_unicode((uchar) text[0]);
/* '@' escapes.  A digit or a second '@' after the '@' is not a character    */
/* form handled here and is reported as an error.                            */
1159 if ((isdigit(text[1])) || (text[1] == '@'))
1160 { ebf_error("'@' plus an accent code or '@{...}'", text);
1161 textual_form_length = 1;
/* Accent form "@xy": `accents` holds two-character codes back to back, and  */
/* the pair at offset i corresponds to entry i/2 of the Latin1 default table.*/
1166 { for (i=0; accents[i] != 0; i+=2)
1167 if ((text[1] == accents[i]) && (text[2] == accents[i+1]))
1168 { textual_form_length = 3;
1169 return default_zscii_to_unicode_c01[i/2];
1173 uac[0]='@'; uac[1]=text[1]; uac[2]=text[2]; uac[3]=0;
1174 error_named("No such accented character as", uac);
/* Hexadecimal form "@{...}": accumulate digits up to the closing brace.     */
1180 while (text[++i] != '}')
1182 { error("'@{' without matching '}'");
1186 { error("At most six hexadecimal digits allowed in '@{...}'");
/* NOTE(review): the index below can be as large as 255, whereas             */
/* init_chars_vars() initialises only entries 0..127 of                      */
/* character_digit_value -- confirm the array size covers a high-bit byte.   */
1189 d = character_digit_value[(uchar)text[i]];
1191 { error("'@{...}' may only contain hexadecimal digits");
1194 total = total*16 + d;
/* Skip forward to just past the closing brace (or to the terminating NUL)   */
/* so the reported length covers the whole escape.                           */
1196 while ((text[i] != '}') && (text[i] != 0)) i++;
1197 if (text[i] == '}') i++;
1198 textual_form_length = i;
1202 textual_form_length = 1;
1206 /* ------------------------------------------------------------------------- */
1207 /* (5) Zscii -> Text */
1209 /* Used for printing out dictionary contents into the text transcript file */
1210 /* or on-screen (in response to the Trace dictionary directive). */
1211 /* In either case, output uses the same ISO set as the source code. */
1212 /* ------------------------------------------------------------------------- */
/* Write into `text` a NUL-terminated textual form of the ZSCII value        */
/* `zscii`.  In order of preference: the single ISO character recorded in    */
/* zscii_to_iso_grid; an accent escape when the character's Unicode value    */
/* appears in the Latin1 default table; otherwise the form "@{hex}".         */
/* The caller must supply a buffer large enough for the longest form.        */
/* NOTE(review): the loop header over the default table and the store to     */
/* text[0] are on lines not shown in this listing.                           */
1214 extern void zscii_to_text(char *text, int zscii)
1218 if ((zscii < 0x100) && (zscii_to_iso_grid[zscii] != 0))
1219 { text[0] = zscii_to_iso_grid[zscii]; text[1] = 0; return;
1222 unicode = zscii_to_unicode(zscii);
1224 if (default_zscii_to_unicode_c01[i] == unicode)
/* Entry i of the default table pairs with the two accent characters at      */
/* accents[2*i] and accents[2*i+1].                                          */
1226 text[1] = accents[2*i];
1227 text[2] = accents[2*i+1];
1228 text[3] = 0; return;
/* NOTE(review): %x expects an unsigned int; int32's underlying type is not  */
/* visible here -- confirm the argument type matches on all platforms.       */
1230 sprintf(text, "@{%x}", unicode);
1233 /* ========================================================================= */
/* Return a printable name for character set number s: 1 to 9 give the name  */
/* of the corresponding ISO 8859 set, and the final return supplies          */
/* "Plain ASCII" for everything else.  The strings are literals and must     */
/* not be modified or freed by the caller.                                   */
1235 extern char *name_of_iso_set(int s)
1237 { case 1: return "Latin1";
1238 case 2: return "Latin2";
1239 case 3: return "Latin3";
1240 case 4: return "Latin4";
1241 case 5: return "Cyrillic";
1242 case 6: return "Arabic";
1243 case 7: return "Greek";
1244 case 8: return "Hebrew";
1245 case 9: return "Latin5";
1247 return "Plain ASCII";
/* Rebuild the character-set dependent tables: first the source-to-ISO grid, */
/* then the default ZSCII <-> Unicode map (which in turn rebuilds the        */
/* ISO-to-alphabet grid).                                                    */
1250 extern void change_character_set(void)
1251 { make_source_to_iso_grid();
1252 make_unicode_zscii_map();
1255 /* ------------------------------------------------------------------------- */
1256 /* Case translation of standard Roman letters within ISO */
1257 /* ------------------------------------------------------------------------- */
/* Convert, in place, every upper-case letter in the NUL-terminated string   */
/* str to lower case.  Only bytes below 128 are considered; the test also    */
/* ensures isupper() is never handed a negative char value.                  */
1259 extern void make_lower_case(char *str)
1261 for (i=0; str[i]!=0; i++)
1262 if ((((uchar)str[i])<128) && (isupper(str[i]))) str[i]=tolower(str[i]);
/* Convert, in place, every lower-case letter in the NUL-terminated string   */
/* str to upper case.  Only bytes below 128 are considered; the test also    */
/* ensures islower() is never handed a negative char value.                  */
1265 extern void make_upper_case(char *str)
1267 for (i=0; str[i]!=0; i++)
1268 if ((((uchar)str[i])<128) && (islower(str[i]))) str[i]=toupper(str[i]);
1271 /* ========================================================================= */
1272 /* Data structure management routines */
1273 /* ------------------------------------------------------------------------- */
/* Initialise this module's tables: the digit-value lookup, the three        */
/* default alphabet rows and their usage flags, and finally the character    */
/* set dependent grids via change_character_set().                           */
1275 extern void init_chars_vars(void)
/* Every entry 0..127 starts at 127 (apparently the "not a digit" marker --  */
/* the test that reads it is not visible in this listing), then the decimal  */
/* and hexadecimal digits, in either case, receive their numeric values.     */
1277 for (n=0; n<128; n++) character_digit_value[n] = 127;
1278 character_digit_value['0'] = 0;
1279 character_digit_value['1'] = 1;
1280 character_digit_value['2'] = 2;
1281 character_digit_value['3'] = 3;
1282 character_digit_value['4'] = 4;
1283 character_digit_value['5'] = 5;
1284 character_digit_value['6'] = 6;
1285 character_digit_value['7'] = 7;
1286 character_digit_value['8'] = 8;
1287 character_digit_value['9'] = 9;
1288 character_digit_value['a'] = 10;
1289 character_digit_value['b'] = 11;
1290 character_digit_value['c'] = 12;
1291 character_digit_value['d'] = 13;
1292 character_digit_value['e'] = 14;
1293 character_digit_value['f'] = 15;
1294 character_digit_value['A'] = 10;
1295 character_digit_value['B'] = 11;
1296 character_digit_value['C'] = 12;
1297 character_digit_value['D'] = 13;
1298 character_digit_value['E'] = 14;
1299 character_digit_value['F'] = 15;
/* Default alphabet rows, 26 characters each.                                */
1301 strcpy((char *) alphabet[0], "abcdefghijklmnopqrstuvwxyz");
1302 strcpy((char *) alphabet[1], "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
1303 strcpy((char *) alphabet[2], " ^0123456789.,!?_#'~/\\-:()");
1305 alphabet_modified = FALSE;
/* 78 flags, i.e. one per character of the three 26-character rows above     */
/* (presumably; the declaration of alphabet_used is outside this listing).   */
1307 for (n=0; n<78; n++) alphabet_used[n] = 'N';
1309 change_character_set();
/* Start-of-pass hook for this module.  NOTE(review): the body is on lines   */
/* not shown in this listing.                                                */
1312 extern void chars_begin_pass(void)
/* Array-allocation hook for this module.  NOTE(review): the body is on      */
/* lines not shown in this listing.                                          */
1316 extern void chars_allocate_arrays(void)
/* Array-release hook for this module.  NOTE(review): the body is on lines   */
/* not shown in this listing.                                                */
1320 extern void chars_free_arrays(void)
1324 /* ========================================================================= */