💾 Archived View for runjimmyrunrunyoufuckerrun.com › src › foreign › pmw › src › string.c captured on 2021-12-17 at 13:26:06.
View Raw
More Information
-=-=-=-=-=-=-
/*************************************************
*    The PMS Music Typesetter - 2nd incarnation  *
*************************************************/
/* Copyright (c) Philip Hazel, 1991 - 2020 */
/* Written by Philip Hazel, starting November 1991 */
/* This file last modified: July 2020 */
/* This file contains code for reading and processing strings. */
#include "pmwhdr.h"
#include "readhdr.h"
#define string_start_size 64   /* Initial buffer size used by string_read() */
/* Flag set while string escapes are being checked, so that the circumflex to
@ escape translation done at input time is handled correctly. */
static BOOL in_string_check = FALSE;
/* Escape letters and, in the second table, one numeric value per letter (15
of each). NOTE(review): these look like parallel arrays mapping an escape
letter to a music-font character code - confirm against the code that uses
them, which is not visible in this part of the file. */
static uschar *music_escapes = US"bsmcQq#$%><udlr";
static uschar music_escape_values[] = {
49, 50, 51, 53, 55, 57, 37, 39, 40, 122, 121, 126, 124, 123, 125 };
/* One entry in the table of 2-character escape sequences. */
typedef struct esctabstr
{
int escape;    /* Search key: (first character << 8) + second character */
int unicode;   /* Unicode code point that the escape stands for */
}
esctabstr;
/* Table of 2-character escape sequences. We use @ here instead of circumflex,
because there's been a translation at input time. This table must be in order
because it is searched by binary chop.

Some available accented characters are omitted until I think of a suitable
escape for them. They are characters with these accents:

  dotaccent   - dot has been used for dieresis since the start of PMW
  commaaccent - comma has been used for cedilla ditto
  hungrumlaut - doublequote isn't available in PMW strings
  ogonek
*/
static esctabstr esctab[] = {
/* Each key is (letter << 8) + accent character. Entries must stay in
ascending key order: upper case letters first, then lower case, and within a
letter by the character code of the accent. */
{ ('A' << 8) + '\'', 0x00c1 }, /* Aacute */
{ ('A' << 8) + '-', 0x0100 }, /* Amacron */
{ ('A' << 8) + '.', 0x00c4 }, /* Adieresis */
{ ('A' << 8) + '@', 0x00c2 }, /* Acircumflex */
{ ('A' << 8) + '`', 0x00c0 }, /* Agrave */
{ ('A' << 8) + 'o', 0x00c5 }, /* Aring */
{ ('A' << 8) + 'u', 0x0102 }, /* Abreve */
{ ('A' << 8) + '~', 0x00c3 }, /* Atilde */
{ ('C' << 8) + '\'', 0x0106 }, /* Cacute */
{ ('C' << 8) + ')', 0x00a9 }, /* Copyright */
{ ('C' << 8) + ',', 0x00c7 }, /* Ccedilla */
{ ('C' << 8) + '@', 0x0108 }, /* Ccircumflex */
{ ('C' << 8) + 'v', 0x010c }, /* Ccaron */
{ ('D' << 8) + '-', 0x0110 }, /* Dcroat */
{ ('D' << 8) + 'v', 0x010e }, /* Dcaron */
{ ('E' << 8) + '\'', 0x00c9 }, /* Eacute */
{ ('E' << 8) + '-', 0x0112 }, /* Emacron */
{ ('E' << 8) + '.', 0x00cb }, /* Edieresis */
{ ('E' << 8) + '@', 0x00ca }, /* Ecircumflex */
{ ('E' << 8) + '`', 0x00c8 }, /* Egrave */
{ ('E' << 8) + 'u', 0x0114 }, /* Ebreve */
{ ('E' << 8) + 'v', 0x011a }, /* Ecaron */
{ ('G' << 8) + '@', 0x011c }, /* Gcircumflex */
{ ('G' << 8) + 'u', 0x011e }, /* Gbreve */
{ ('H' << 8) + '@', 0x0124 }, /* Hcircumflex */
{ ('I' << 8) + '\'', 0x00cd }, /* Iacute */
{ ('I' << 8) + '-', 0x012a }, /* Imacron */
{ ('I' << 8) + '.', 0x00cf }, /* Idieresis */
{ ('I' << 8) + '@', 0x00ce }, /* Icircumflex */
{ ('I' << 8) + '`', 0x00cc }, /* Igrave */
{ ('I' << 8) + 'u', 0x012C }, /* Ibreve */
{ ('I' << 8) + '~', 0x0128 }, /* Itilde */
{ ('J' << 8) + '@', 0x0134 }, /* Jcircumflex */
{ ('L' << 8) + '\'', 0x0139 }, /* Lacute */
{ ('L' << 8) + '/', 0x0141 }, /* Lslash */
{ ('L' << 8) + 'v', 0x013d }, /* Lcaron */
{ ('N' << 8) + '\'', 0x0143 }, /* Nacute */
{ ('N' << 8) + 'v', 0x0147 }, /* Ncaron */
{ ('N' << 8) + '~', 0x00d1 }, /* Ntilde */
{ ('O' << 8) + '\'', 0x00d3 }, /* Oacute */
{ ('O' << 8) + '-', 0x014c }, /* Omacron */
{ ('O' << 8) + '.', 0x00d6 }, /* Odieresis */
{ ('O' << 8) + '/', 0x00d8 }, /* Oslash */
{ ('O' << 8) + '@', 0x00d4 }, /* Ocircumflex */
{ ('O' << 8) + '`', 0x00d2 }, /* Ograve */
{ ('O' << 8) + 'u', 0x014e }, /* Obreve */
{ ('O' << 8) + '~', 0x00d5 }, /* Otilde */
{ ('R' << 8) + '\'', 0x0154 }, /* Racute */
{ ('R' << 8) + 'v', 0x0158 }, /* Rcaron */
{ ('S' << 8) + '\'', 0x015a }, /* Sacute */
{ ('S' << 8) + ',', 0x015e }, /* Scedilla */
{ ('S' << 8) + '@', 0x015c }, /* Scircumflex */
{ ('S' << 8) + 'v', 0x0160 }, /* Scaron */
{ ('T' << 8) + ',', 0x0162 }, /* Tcedilla */
{ ('T' << 8) + 'v', 0x0164 }, /* Tcaron */
{ ('U' << 8) + '\'', 0x00da }, /* Uacute */
{ ('U' << 8) + '-', 0x016a }, /* Umacron */
{ ('U' << 8) + '.', 0x00dc }, /* Udieresis */
{ ('U' << 8) + '@', 0x00db }, /* Ucircumflex */
{ ('U' << 8) + '`', 0x00d9 }, /* Ugrave */
{ ('U' << 8) + 'o', 0x016e }, /* Uring */
{ ('U' << 8) + 'u', 0x016c }, /* Ubreve */
{ ('U' << 8) + '~', 0x0168 }, /* Utilde */
{ ('W' << 8) + '@', 0x0174 }, /* Wcircumflex */
{ ('Y' << 8) + '\'', 0x00dd }, /* Yacute */
{ ('Y' << 8) + '.', 0x0178 }, /* Ydieresis */
{ ('Y' << 8) + '@', 0x0176 }, /* Ycircumflex */
{ ('Z' << 8) + '\'', 0x0179 }, /* Zacute */
{ ('Z' << 8) + 'v', 0x017d }, /* Zcaron */
{ ('a' << 8) + '\'', 0x00e1 }, /* aacute */
{ ('a' << 8) + '-', 0x0101 }, /* amacron */
{ ('a' << 8) + '.', 0x00e4 }, /* adieresis */
{ ('a' << 8) + '@', 0x00e2 }, /* acircumflex */
{ ('a' << 8) + '`', 0x00e0 }, /* agrave */
{ ('a' << 8) + 'o', 0x00e5 }, /* aring */
{ ('a' << 8) + 'u', 0x0103 }, /* abreve */
{ ('a' << 8) + '~', 0x00e3 }, /* atilde */
{ ('c' << 8) + '\'', 0x0107 }, /* cacute */
{ ('c' << 8) + ')', 0x00a9 }, /* copyright */
{ ('c' << 8) + ',', 0x00e7 }, /* ccedilla */
{ ('c' << 8) + '@', 0x0109 }, /* ccircumflex */
{ ('c' << 8) + 'v', 0x010d }, /* ccaron */
{ ('d' << 8) + '-', 0x0111 }, /* dcroat */
{ ('d' << 8) + 'v', 0x010f }, /* dcaron */
{ ('e' << 8) + '\'', 0x00e9 }, /* eacute */
{ ('e' << 8) + '-', 0x0113 }, /* emacron */
{ ('e' << 8) + '.', 0x00eb }, /* edieresis */
{ ('e' << 8) + '@', 0x00ea }, /* ecircumflex */
{ ('e' << 8) + '`', 0x00e8 }, /* egrave */
{ ('e' << 8) + 'u', 0x0115 }, /* ebreve */
{ ('e' << 8) + 'v', 0x011b }, /* ecaron */
{ ('g' << 8) + '@', 0x011d }, /* gcircumflex */
{ ('g' << 8) + 'u', 0x011f }, /* gbreve */
{ ('h' << 8) + '@', 0x0125 }, /* hcircumflex */
{ ('i' << 8) + '\'', 0x00ed }, /* iacute */
{ ('i' << 8) + '-', 0x012b }, /* imacron */
{ ('i' << 8) + '.', 0x00ef }, /* idieresis */
{ ('i' << 8) + '@', 0x00ee }, /* icircumflex */
{ ('i' << 8) + '`', 0x00ec }, /* igrave */
{ ('i' << 8) + 'u', 0x012d }, /* ibreve */
{ ('i' << 8) + '~', 0x0129 }, /* itilde */
{ ('j' << 8) + '@', 0x0135 }, /* jcircumflex */
{ ('l' << 8) + '\'', 0x013a }, /* lacute */
{ ('l' << 8) + '/', 0x0142 }, /* lslash */
{ ('l' << 8) + 'v', 0x013e }, /* lcaron */
{ ('n' << 8) + '\'', 0x0144 }, /* nacute */
{ ('n' << 8) + 'v', 0x0148 }, /* ncaron */
{ ('n' << 8) + '~', 0x00f1 }, /* ntilde */
{ ('o' << 8) + '\'', 0x00f3 }, /* oacute */
{ ('o' << 8) + '-', 0x014d }, /* omacron */
{ ('o' << 8) + '.', 0x00f6 }, /* odieresis */
{ ('o' << 8) + '/', 0x00f8 }, /* oslash */
{ ('o' << 8) + '@', 0x00f4 }, /* ocircumflex */
{ ('o' << 8) + '`', 0x00f2 }, /* ograve */
{ ('o' << 8) + 'u', 0x014f }, /* obreve */
{ ('o' << 8) + '~', 0x00f5 }, /* otilde */
{ ('r' << 8) + '\'', 0x0155 }, /* racute */
{ ('r' << 8) + 'v', 0x0159 }, /* rcaron */
{ ('s' << 8) + '\'', 0x015b }, /* sacute */
{ ('s' << 8) + ',', 0x015f }, /* scedilla */
{ ('s' << 8) + '@', 0x015d }, /* scircumflex */
{ ('s' << 8) + 'v', 0x0161 }, /* scaron */
{ ('t' << 8) + ',', 0x0163 }, /* tcedilla */
{ ('t' << 8) + 'v', 0x0165 }, /* tcaron */
{ ('u' << 8) + '\'', 0x00fa }, /* uacute */
{ ('u' << 8) + '-', 0x016b }, /* umacron */
{ ('u' << 8) + '.', 0x00fc }, /* udieresis */
{ ('u' << 8) + '@', 0x00fb }, /* ucircumflex */
{ ('u' << 8) + '`', 0x00f9 }, /* ugrave */
{ ('u' << 8) + 'o', 0x016f }, /* uring */
{ ('u' << 8) + 'u', 0x016d }, /* ubreve */
{ ('u' << 8) + '~', 0x0169 }, /* utilde */
{ ('w' << 8) + '@', 0x0175 }, /* wcircumflex */
{ ('y' << 8) + '\'', 0x00fd }, /* yacute */
{ ('y' << 8) + '.', 0x00ff }, /* ydieresis */
{ ('y' << 8) + '@', 0x0177 }, /* ycircumflex */
{ ('z' << 8) + '\'', 0x017a }, /* zacute */
{ ('z' << 8) + 'v', 0x017e }, /* zcaron */
};
/* Number of entries in esctab */
static int esctabcount = sizeof(esctab)/sizeof(esctabstr);
/*************************************************
*          Check for a UTF-8 character           *
*************************************************/

/* Given a pointer to a byte in a zero-terminated string, check to see if it is
the start of a UTF-8 character, and if so, return the length.

Argument:  pointer to the first byte
Returns:   the length of the character (1 - 6) or -1 if invalid UTF-8 start
*/

static int
check_utf8(uschar *pp)
{
/* Validates the UTF-8 character starting at pp[0]. Returns its total length
in bytes, or -1 if the bytes do not form an acceptable UTF-8 character. Bytes
are examined strictly left to right and checking stops at the first bad one. */

int lead = pp[0];
int second;
int extra;
int i;

/* A byte below 0x80 is a complete character on its own; a byte in the range
0x80-0xbf is a continuation byte and cannot start a character. */

if (lead < 0x80) return 1;
if (lead < 0xc0) return -1;

/* The table gives the number of bytes that follow this lead byte. */

extra = utf8_table4[lead & 0x3f];

/* There is always at least one following byte; it must look like 10xxxxxx. */

second = pp[1];
if ((second & 0xc0) != 0x80) return -1;

/* A two-byte character has nothing further to inspect: it is valid unless
the lead byte is xx00 000x, which would be an overlong encoding. */

if (extra == 1)
  {
  return ((lead & 0x3e) == 0)? -1 : 2;
  }

/* Overlong checks for the longer forms:
     2 extra bytes: 1110 0000, xx0x xxxx
     3 extra bytes: 1111 0000, xx00 xxxx
     4 extra bytes: 1111 1000, xx00 0xxx
     5 extra bytes: 1111 1100, xx00 00xx, and 0xfe/0xff are never valid */

if (extra == 2 && lead == 0xe0 && (second & 0x20) == 0) return -1;
if (extra == 3 && lead == 0xf0 && (second & 0x30) == 0) return -1;
if (extra == 4 && lead == 0xf8 && (second & 0x38) == 0) return -1;
if (extra == 5 &&
    (lead == 0xfe || lead == 0xff ||
      (lead == 0xfc && (second & 0x3c) == 0)))
  return -1;

/* Every byte after the second must also be a continuation byte. */

for (i = 2; i <= extra; i++)
  {
  if ((pp[i] & 0xc0) != 0x80) return -1;
  }

return extra + 1;
}
/*************************************************
*         Read a string of any length            *
*************************************************/

/* The string may extend over more than one line; newlines count as spaces.
Strings are expected to be in UTF-8 format, but for backwards compatibility,
any non-UTF-8 bytes are taken as single 8-bit characters and converted to
UTF-8.

Arguments:  none
Returns:    pointer to store containing the string
*/

uschar *
string_read(void)
{
/* Reads a double-quoted string from the current input position into a
buffer obtained from store_Xget(), growing the buffer as needed. Newlines
inside the string are stored as spaces. Bytes with the top bit set are copied
unchanged when they form a valid UTF-8 character; otherwise the single byte is
converted to UTF-8 via misc_ord2utf8().

Returns the zero-terminated string, or NULL after reporting an error if the
opening quote is missing (ERR10) or input ends before the closing quote
(ERR23). In the NULL cases no buffer is left allocated. */

int p = 0;                      /* Number of bytes stored so far */
int size = string_start_size;   /* Current size of the buffer */
uschar *s;

sigch();
if (read_ch != '\"')
  {
  error_moan(ERR10, "String in quotes");
  return NULL;
  }

s = store_Xget(size);
next_ch();

while (read_ch != '\"' && read_ch != EOF)
  {
  /* Handle bytes with the top bit set */

  if (read_ch > 0x7f)
    {
    uschar buffer[8];
    uschar *pp = read_chptr - 1;    /* Starting byte */
    int n = check_utf8(pp);         /* Length of UTF-8 character */

    /* This byte is not the start of a UTF-8 character; convert it to a UTF-8
    string. */

    if (n < 1)
      {
      n = misc_ord2utf8(read_ch, buffer);
      pp = buffer;
      }

    /* This is a UTF-8 character; advance the character pointer past the
    bytes that are copied directly from the input here. */

    else read_chptr += n - 1;

    /* Add the bytes to the string. */

    memcpy(s+p, pp, n);
    p += n;
    }

  /* Bytes without the top bit are always one-byte characters. */

  else s[p++] = (read_ch == '\n')? ' ' : read_ch;

  /* Ensure there's enough room for another full-length UTF-8 character (up
  to 6 bytes) plus the terminating zero. The buffer doubles until it is
  larger than 1024 bytes and then grows by 1024 at a time. */

  if (p >= size-6)
    {
    int increment = (size > 1024)? 1024 : size;
    uschar *ss = store_Xget(size + increment);
    memcpy(ss, s, size);
    store_free(s);
    s = ss;
    size += increment;
    }

  next_ch();
  }

s[p] = 0;

/* Input ended before the closing quote. The buffer is not handed back to the
caller in this case, so release it here instead of leaking it. */

if (read_ch == EOF)
  {
  error_moan(ERR23);
  store_free(s);
  return NULL;
  }

next_ch();   /* Move past the closing quote */
return s;
}
/*************************************************
*   Check the escapes in a string & transpose    *
*************************************************/

/* This is called for all but PostScript strings on reading, so that any errors
are given at that time. We set a flag (in_string_check) while handling escapes
to ensure that the \a^ -> \a@ fudge works correctly. Character codes are
checked for supported values, and those that are not supported in standardly
encoded fonts are converted to use the Symbol font where possible.

Another job of this function is to look for note letter transpositions in the
string and to carry them out at this time. Other escapes are re-processed later,
at output time.

Argument:  the string, in dynamic store
Returns:   the string, may be modified and/or copied
*/

uschar *
string_check(uschar *s)
{
int c;
uschar *ss, *tt;
/* If there are any transposed note names in the string, we must build a new
string with the transpositions done. */
if ((tt = Ustrstr(s, "\\t")) != NULL)
{
int p = tt - s;
int size = Ustrlen(s) * 2 + 14;
ss = store_Xget(size);
memcpy(ss, s, p);
for (s = tt; *s != 0; s++)
{
int i, pitch, abspitch;
int note, acc;
if (p >= size - 10)
{
int increment = (size > 1024)? 1024 : size;
uschar *sss = store_Xget(size + increment);
memcpy(sss, ss, size);
store_free(ss);
ss = sss;
size += increment;
}
if (*s != '\\' || s[1] != 't' || s[2] < 'A' || s[2] > 'G')
{
ss[p++] = *s;
continue;
}
note = s[2];
s += 2;
acc = ac_none;
if (s[1] == '#') acc = ac_sharp;
else if (s[1] == '