entity.c (16516B)
/* $Id$ */
/*
 * Copyright (c) 2020, Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include "config.h"

#if HAVE_SYS_QUEUE
# include <sys/queue.h>
#endif

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lowdown.h"
#include "extern.h"

/*
 * One named HTML entity along with its numeric codepoint and its
 * representation (if any) in the roff and LaTeX output formats.
 */
struct ent {
	const char	*iso; /* html entity */
	uint32_t	 unicode; /* decimal unicode */
	const char	*nroff; /* -ms/-man */
	const char	*tex; /* latex */
	/**
	 * For latex: if zero, escape as-is.  If just TEX_ENT_ASCII,
	 * don't escape at all.  If just TEX_ENT_MATH, pass as math mode
	 * escaped.  If both TEX_ENT_ASCII and TEX_ENT_MATH, pass as
	 * math mode and don't escape.
	 */
	unsigned char	 texflags;
};

/*
 * Table of known named entities.
 * The table is terminated by an entry whose "iso" member is NULL.
 * The "nroff" and/or "tex" members are NULL where this table provides
 * no representation for that output format.
 */
static const struct ent ents[] = {
	{ "AElig", 198, "AE", "AE{}", 0 },
	{ "Aacute", 193, "'A", "'{A}", 0 },
	{ "Acirc", 194, "^A", "^{A}", 0 },
	{ "Agrave", 192, "`A", "`{A}", 0 },
	{ "Alpha", 913, "*A", "A", TEX_ENT_ASCII },
	{ "Aring", 197, "oA", "AA{}", 0 },
	{ "Atilde", 195, "~A", "~{A}", 0 },
	{ "Auml", 196, ":A", "\"{A}", 0 },
	{ "Beta", 914, "*B", "B", TEX_ENT_ASCII },
	{ "Ccedil", 199, ",C", "c{C}", 0 },
	{ "Chi", 935, "*X", "X", TEX_ENT_ASCII },
	{ "Dagger", 8225, "dd", "ddag{}", 0 },
	{ "Delta", 916, "*D", "Delta", TEX_ENT_MATH },
	{ "ETH", 208, "-D", "DH{}", 0 },
	{ "Eacute", 201, "'E", "'{E}", 0 },
	{ "Ecirc", 202, "^E", "^{E}", 0 },
	{ "Egrave", 200, "`E", "`{E}", 0 },
	{ "Epsilon", 917, "*E", "E", TEX_ENT_ASCII },
	{ "Eta", 919, "*Y", "H", TEX_ENT_ASCII },
	{ "Euml", 203, ":E", "\"{E}", 0 },
	{ "Gamma", 915, "*G", "Gamma", TEX_ENT_MATH },
	{ "Iacute", 205, "'I", "'{I}", 0 },
	{ "Icirc", 206, "^I", "^{I}", 0 },
	{ "Igrave", 204, "`I", "`{I}", 0 },
	{ "Iota", 921, "*I", "I", TEX_ENT_ASCII },
	{ "Iuml", 207, ":I", "\"{I}", 0 },
	{ "Kappa", 922, "*K", "K", TEX_ENT_ASCII },
	{ "Lambda", 923, "*L", "Lambda", TEX_ENT_MATH },
	{ "Mu", 924, "*M", "M", TEX_ENT_ASCII },
	{ "Ntilde", 209, "~N", "~{N}", 0 },
	{ "Nu", 925, "*N", "N", TEX_ENT_ASCII },
	{ "OElig", 338, "OE", "OE{}", 0 },
	{ "Oacute", 211, "'O", "'{O}", 0 },
	{ "Ocirc", 212, "^O", "^{O}", 0 },
	{ "Ograve", 210, "`O", "`{O}", 0 },
	{ "Omega", 937, "*W", "Omega", TEX_ENT_MATH },
	{ "Omicron", 927, "*O", "O", TEX_ENT_ASCII },
	{ "Oslash", 216, "/O", "O{}", 0 },
	{ "Otilde", 213, "~O", "~{O}", 0 },
	{ "Ouml", 214, ":O", "\"{O}", 0 },
	{ "Phi", 934, "*F", "Phi", TEX_ENT_MATH },
	{ "Pi", 928, "*P", "Pi", TEX_ENT_MATH },
	{ "Prime", 8243, NULL, "^{\\prime\\prime}", TEX_ENT_MATH | TEX_ENT_ASCII },
	{ "Psi", 936, "*Q", "Psi", TEX_ENT_MATH },
	{ "Rho", 929, "*R", "P", TEX_ENT_ASCII },
	{ "Scaron", 352, "vS", "v{S}", 0 },
	{ "Sigma", 931, "*S", "Sigma", TEX_ENT_MATH },
	{ "THORN", 222, "TP", "TH{}", 0 },
	{ "Tau", 932, "*T", "T", TEX_ENT_ASCII },
	{ "Theta", 920, "*H", "Theta", TEX_ENT_MATH },
	{ "Uacute", 218, "'U", "'{U}", 0 },
	{ "Ucirc", 219, "^U", "^{U}", 0 },
	{ "Ugrave", 217, "`U", "`{U}", 0 },
	{ "Upsilon", 933, "*U", "Upsilon", TEX_ENT_MATH },
	{ "Uuml", 220, ":U", "\"{U}", 0 },
	{ "Xi", 926, "*C", "Xi", TEX_ENT_MATH },
	{ "Yacute", 221, "'Y", "'{Y}", 0 },
	{ "Yuml", 376, ":Y", "\"{Y}", 0 },
	{ "Zeta", 918, "*Z", "Z", TEX_ENT_ASCII },
	{ "aacute", 225, "'a", "'{a}", 0 },
	{ "acirc", 226, "^a", "^{a}", 0 },
	{ "acute", 180, "'", "'{}", 0 },
	{ "aelig", 230, "ae", "ae{}", 0 },
	{ "agrave", 224, "`a", "`{a}", 0 },
	{ "alefsym", 8501, "Ah", "aleph", TEX_ENT_MATH },
	{ "alpha", 945, "*a", "alpha", TEX_ENT_MATH },
	{ "amp", 38, NULL, "&{}", 0 },
	{ "and", 8743, "AN", "wedge", TEX_ENT_MATH },
	{ "ang", 8736, "/_", "angle", TEX_ENT_MATH },
	{ "aring", 229, "oa", "aa{}", 0 },
	{ "asymp", 8776, "|=", "asymp", TEX_ENT_MATH },
	{ "atilde", 227, "~a", "~{a}", 0 },
	{ "auml", 228, ":a", "\"{a}", 0 },
	{ "bdquo", 8222, NULL, NULL, 0 }, /* XXX */
	{ "beta", 946, "*b", "beta", TEX_ENT_MATH },
	{ "brvbar", 166, NULL, "textbrokenbar{}", 0 },
	{ "bull", 8226, "bu", "textbullet{}", 0 },
	{ "cap", 8745, "ca", "cap", TEX_ENT_MATH },
	{ "ccedil", 231, ",c", "c{c}", 0 },
	{ "cedil", 184, "ac", "c{}", 0 },
	{ "cent", 162, "ct", "textcent{}", 0 },
	{ "chi", 967, "*x", "chi", TEX_ENT_MATH },
	{ "circ", 710, "a^", "^{}", 0 },
	{ "cong", 8773, "=~", "cong", TEX_ENT_MATH },
	{ "copy", 169, "co", "copyright{}", 0 },
	{ "crarr", 8629, NULL, NULL, 0 }, /* XXX */
	{ "cup", 8746, "cu", "cup", TEX_ENT_MATH },
	{ "curren", 164, NULL, "textcurrency{}", 0 },
	{ "dArr", 8659, NULL, "Downarrow", TEX_ENT_MATH },
	{ "dagger", 8224, "dg", "dag{}", 0 },
	{ "darr", 8595, "da", "downarrow", TEX_ENT_MATH },
	{ "deg", 176, "de", "textdegree{}", 0 },
	{ "delta", 948, "*d", "delta", TEX_ENT_MATH },
	{ "divide", 247, "tdi", "div", TEX_ENT_MATH },
	{ "eacute", 233, "'e", "'{e}", 0 },
	{ "ecirc", 234, "^e", "^{e}", 0 },
	{ "egrave", 232, "`e", "`{e}", 0 },
	{ "empty", 8709, "es", "emptyset", TEX_ENT_MATH },
	{ "emsp", 8195, NULL, "hspace{1em}", 0 },
	{ "ensp", 8194, NULL, "hspace{0.5em}", 0 },
	{ "epsilon", 949, "*e", "epsilon", TEX_ENT_MATH },
	{ "equiv", 8801, "==", "equiv", TEX_ENT_MATH },
	{ "eta", 951, "*y", "eta", TEX_ENT_MATH },
	{ "eth", 240, "Sd", "dh{}", 0 },
	{ "euml", 235, ":e", "\"{e}", 0 },
	{ "euro", 8364, "Eu", "texteuro{}", 0 },
	{ "exist", 8707, "te", "exists", TEX_ENT_MATH },
	{ "fnof", 402, NULL, "f", TEX_ENT_MATH },
	{ "forall", 8704, NULL, "forall", TEX_ENT_MATH },
	{ "frac12", 189, "12", "sfrac{1}{2}", TEX_ENT_MATH },
	{ "frac14", 188, "14", "sfrac{1}{4}", TEX_ENT_MATH },
	{ "frac34", 190, "34", "sfrac{3}{4}", TEX_ENT_MATH },
	{ "frasl", 8260, NULL, NULL, 0 }, /* XXX */
	{ "gamma", 947, "*g", "gamma", TEX_ENT_MATH },
	{ "ge", 8805, ">=", "geq", TEX_ENT_MATH },
	{ "gt", 62, NULL, "textgreater{}", 0 },
	{ "hArr", 8660, NULL, "Leftrightarrow", TEX_ENT_MATH },
	{ "harr", 8596, "<>", "leftrightarrow", TEX_ENT_MATH },
	{ "hellip", 8230, NULL, "ldots{}", 0 },
	{ "iacute", 237, "'i", "'{i}", 0 },
	{ "icirc", 238, "^i", "^{i}", 0 },
	{ "iexcl", 161, "r!", "textexclamdown{}", 0 },
	{ "igrave", 236, "`i", "`{i}", 0 },
	{ "image", 8465, NULL, "Im", TEX_ENT_MATH },
	{ "infin", 8734, "if", "infty", TEX_ENT_MATH },
	{ "int", 8747, "integral", "int", TEX_ENT_MATH },
	{ "iota", 953, "*i", "iota", TEX_ENT_MATH },
	{ "iquest", 191, "r?", "textquestiondown{}", 0 },
	{ "isin", 8712, NULL, "in", TEX_ENT_MATH },
	{ "iuml", 239, ":i", "\"{i}", 0 },
	{ "kappa", 954, "*k", "kappa", TEX_ENT_MATH },
	{ "lArr", 8656, NULL, "Leftarrow", TEX_ENT_MATH },
	{ "lambda", 955, "*l", "lambda", TEX_ENT_MATH },
	{ "lang", 9001, "la", "langle", TEX_ENT_MATH },
	{ "laquo", 171, "Fo", "guillemetleft{}", 0 },
	{ "larr", 8592, "<-", "leftarrow", TEX_ENT_MATH },
	{ "lceil", 8968, NULL, "lceil", TEX_ENT_MATH },
	{ "ldquo", 8220, "lq", "``", TEX_ENT_ASCII },
	{ "le", 8804, NULL, "leq", TEX_ENT_MATH },
	{ "lfloor", 8970, "lf", "lfloor", TEX_ENT_MATH },
	{ "lowast", 8727, NULL, "_\\ast", TEX_ENT_MATH },
	{ "lrm", 8206, NULL, NULL, 0 }, /* XXX */
	{ "lsaquo", 8249, NULL, NULL, 0 },
	{ "lsquo", 8216, "oq", "`", TEX_ENT_ASCII },
	{ "lt", 60, NULL, "textless{}", 0 },
	{ "macr", 175, NULL, "={}", 0 },
	{ "mdash", 8212, "em", "---", TEX_ENT_ASCII },
	{ "micro", 181, NULL, "textmu{}", 0 },
	{ "middot", 183, NULL, "textperiodcentered{}", 0 },
	{ "minus", 8722, "mi", "-{}", 0 },
	{ "mu", 956, "*m", "mu", TEX_ENT_MATH },
	{ "nabla", 8711, NULL, "nabla", TEX_ENT_MATH },
	{ "nbsp", 160, "~", "~", TEX_ENT_ASCII },
	{ "ndash", 8211, "en", "--", TEX_ENT_ASCII },
	{ "ne", 8800, "!=", "not=", TEX_ENT_MATH },
	{ "ni", 8715, NULL, "ni", TEX_ENT_MATH },
	{ "not", 172, "no", "lnot", TEX_ENT_MATH },
	{ "notin", 8713, NULL, "not\\in", TEX_ENT_MATH },
	{ "nsub", 8836, NULL, "not\\subset", TEX_ENT_MATH },
	{ "ntilde", 241, "~n", "~{n}", 0 },
	{ "nu", 957, "*n", "nu", TEX_ENT_MATH },
	{ "oacute", 243, "'o", "'{o}", 0 },
	{ "ocirc", 244, "^o", "^{o}", 0 },
	{ "oelig", 339, "oe", "oe{}", 0 },
	{ "ograve", 242, "`o", "`{o}", 0 },
	{ "oline", 8254, NULL, "ominus", TEX_ENT_MATH },
	{ "omega", 969, "*w", "omega", TEX_ENT_MATH },
	{ "omicron", 959, "*o", "o", TEX_ENT_ASCII },
	{ "oplus", 8853, NULL, "oplus", TEX_ENT_MATH },
	{ "or", 8744, "OR", "vee", TEX_ENT_MATH },
	{ "ordf", 170, NULL, "textordfeminine{}", 0 },
	{ "ordm", 186, NULL, "textordmasculine{}", 0 },
	{ "oslash", 248, "/o", "o{}", 0 },
	{ "otilde", 245, "~o", "~{o}", 0 },
	{ "otimes", 8855, NULL, "otimes", TEX_ENT_MATH },
	{ "ouml", 246, ":o", "\"{o}", 0 },
	{ "para", 182, NULL, "P{}", 0 },
	{ "part", 8706, "pd", "partial", TEX_ENT_MATH },
	{ "permil", 8240, NULL, "textperthousand{}", 0 },
	{ "perp", 8869, NULL, "perp", TEX_ENT_MATH },
	{ "phi", 966, "*f", "phi", TEX_ENT_MATH },
	{ "pi", 960, "*p", "pi", TEX_ENT_MATH },
	{ "piv", 982, "+p", "varpi", TEX_ENT_MATH },
	{ "plusmn", 177, "+-", "pm", TEX_ENT_MATH },
	{ "pound", 163, NULL, "pounds{}", 0 },
	{ "prime", 8242, NULL, "^\\prime{}", TEX_ENT_MATH | TEX_ENT_ASCII },
	{ "prod", 8719, "product", "prod", TEX_ENT_MATH },
	{ "prop", 8733, NULL, "propto", TEX_ENT_MATH },
	{ "psi", 968, "*q", "psi", TEX_ENT_MATH },
	{ "quot", 34, NULL, "\"", TEX_ENT_ASCII },
	{ "rArr", 8658, NULL, "Rightarrow", TEX_ENT_MATH },
	{ "radic", 8730, NULL, "surd", TEX_ENT_MATH },
	{ "rang", 9002, "ra", "rangle", TEX_ENT_MATH },
	{ "raquo", 187, "Fc", "guillemotright{}", 0 },
	{ "rarr", 8594, "->", "rightarrow", TEX_ENT_MATH },
	{ "rceil", 8969, NULL, "rceil", TEX_ENT_MATH },
	{ "rdquo", 8221, "rq", "''", TEX_ENT_ASCII },
	{ "real", 8476, NULL, "Re", TEX_ENT_MATH },
	{ "reg", 174, "rg", "textregistered{}", 0 },
	{ "rfloor", 8971, "rf", "rfloor", TEX_ENT_MATH },
	{ "rho", 961, "*r", "rho", TEX_ENT_MATH },
	{ "rlm", 8207, NULL, NULL, 0 }, /* XXX */
	{ "rsaquo", 8250, NULL, NULL, 0 }, /* XXX */
	{ "rsquo", 8217, "cq", "'", TEX_ENT_ASCII },
	{ "sbquo", 8218, NULL, NULL, 0 }, /* XXX */
	{ "scaron", 353, "vs", "v{s}", 0 },
	{ "sdot", 8901, NULL, "cdot", TEX_ENT_MATH },
	{ "sect", 167, "sc", "S{}", 0 },
	{ "shy", 173, NULL, "-{}", 0 },
	{ "sigma", 963, "*s", "sigma", TEX_ENT_MATH },
	{ "sigmaf", 962, "ts", "varsigma", TEX_ENT_MATH }, /* XXX?? */
	{ "sim", 8764, "ap", "sim", TEX_ENT_MATH },
	{ "sub", 8834, "sb", "subset", TEX_ENT_MATH },
	{ "sube", 8838, "ib", "subseteq", TEX_ENT_MATH },
	{ "sum", 8721, "sum", "sum", TEX_ENT_MATH },
	{ "sup", 8835, "sp", "supset", TEX_ENT_MATH },
	{ "sup1", 185, "S1", "$^1$", TEX_ENT_ASCII },
	{ "sup2", 178, "S2", "$^2$", TEX_ENT_ASCII },
	{ "sup3", 179, "S3", "$^3$", TEX_ENT_ASCII },
	{ "supe", 8839, "ip", "supseteq", TEX_ENT_MATH },
	{ "szlig", 223, "ss", "ss{}", 0 },
	{ "tau", 964, "*t", "tau", TEX_ENT_MATH },
	{ "there4", 8756, "3d", "therefore", TEX_ENT_MATH },
	{ "theta", 952, "*h", "theta", TEX_ENT_MATH },
	{ "thetasym", 977, "+h", "vartheta", TEX_ENT_MATH }, /* XXX?? */
	{ "thinsp", 8201, NULL, "hspace{0.167em}", 0 },
	{ "thorn", 254, "Tp", "th{}", 0 },
	{ "tilde", 732, "ti", "~{}", 0 },
	{ "times", 215, "mu", "times", TEX_ENT_MATH },
	{ "trade", 8482, "tm", "texttrademark{}", 0 },
	{ "uArr", 8657, NULL, "Uparrow", TEX_ENT_MATH },
	{ "uacute", 250, "'u", "'{u}", 0 },
	{ "uarr", 8593, "ua", "uparrow", TEX_ENT_MATH },
	{ "ucirc", 251, "^u", "^{u}", 0 },
	{ "ugrave", 249, "`u", "`{u}", 0 },
	{ "uml", 168, "ad", "\"{}", 0 },
	{ "upsih", 978, NULL, NULL, 0 }, /* XXX */
	{ "upsilon", 965, "*u", "upsilon", TEX_ENT_MATH },
	{ "uuml", 252, ":u", "\"{u}", 0 },
	{ "weierp", 8472, "wp", "wp", TEX_ENT_MATH },
	{ "xi", 958, "*c", "xi", TEX_ENT_MATH },
	{ "yacute", 253, "'y", "'{y}", 0 },
	{ "yen", 165, "Ye", "textyen{}", 0 },
	{ "yuml", 255, ":y", "\"{y}", 0 },
	{ "zeta", 950, "*z", "zeta", TEX_ENT_MATH },
	{ "zwj", 8205, NULL, NULL, 0 }, /* XXX */
	{ "zwnj", 8204, NULL, NULL, 0 }, /* XXX */
	{ NULL, 0, NULL, NULL, 0 }
};

/*
 * Parse a numeric entity, either decimal ("&#NNN;") or hexadecimal
 * ("&#xHHH;" or "&#XHHH;"), into its value.
 * The caller must already have established that "buf" begins with "&#"
 * and ends with ';'.
 * Return the value or -1 if there are no digits, too many digits,
 * non-digit characters, or the value exceeds INT32_MAX.
 */
static int32_t
entity_find_num(const struct lowdown_buf *buf)
{
	char			 b[32];
	char			*ep;
	const char		*digits;
	unsigned long long	 ulval;
	size_t			 start, len;
	int			 base;

	if (buf->size < 4)
		return -1;

	/* Work out where the digits begin and which ones are valid. */

	if (buf->data[2] == 'x' || buf->data[2] == 'X') {
		start = 3;
		base = 16;
		digits = "0123456789abcdefABCDEF";
	} else {
		start = 2;
		base = 10;
		digits = "0123456789";
	}

	/* Require at least one digit before the trailing ';'. */

	if (buf->size < start + 2)
		return -1;
	len = buf->size - start - 1;
	if (len > sizeof(b) - 1)
		return -1;

	memcpy(b, buf->data + start, len);
	b[len] = '\0';

	/*
	 * strtoull(3) would otherwise accept leading white-space, a
	 * sign, and (in base 16) a "0x" prefix, none of which are
	 * valid in a numeric entity.
	 * This also catches an embedded NUL, at which strspn(3) stops.
	 */

	if (strspn(b, digits) != len)
		return -1;

	errno = 0;
	ulval = strtoull(b, &ep, base);
	if (*ep != '\0')
		return -1;
	if (errno == ERANGE || ulval > INT32_MAX)
		return -1;

	return (int32_t)ulval;
}

/*
 * Look up a named entity ("&name;") in the entity table.
 * The caller must already have established that "buf" has the leading
 * '&' and trailing ';'.
 * Return the table entry or NULL if the name is too long or not found.
 */
static const struct ent *
entity_find_named(const struct lowdown_buf *buf)
{
	char	 b[32];
	size_t	 i;

	/*
	 * Copy into NUL-terminated buffer for easy strcmp().
	 * We omit the leading '&' and trailing ';'.
	 */

	if (buf->size - 2 > sizeof(b) - 1)
		return NULL;
	memcpy(b, buf->data + 1, buf->size - 2);
	b[buf->size - 2] = '\0';

	/* TODO: can be trivially sped up by using a binary search. */

	for (i = 0; ents[i].iso != NULL; i++)
		if (strcmp(b, ents[i].iso) == 0)
			return &ents[i];

	return NULL;
}

/*
 * Look up the first table entry having the given codepoint.
 * Return the table entry or NULL if not found.
 */
static const struct ent *
entity_find_code(int32_t code)
{
	size_t	 i;

	for (i = 0; ents[i].iso != NULL; i++)
		if ((int32_t)ents[i].unicode == code)
			return &ents[i];

	return NULL;
}

/*
 * Basic sanity of HTML entity.
 * Needs to be &xyz;
 * Return zero on failure, non-zero on success.
 */
static int
entity_sane(const struct lowdown_buf *buf)
{

	if (buf->size < 3 ||
	    buf->data[0] != '&' ||
	    buf->data[buf->size - 1] != ';')
		return 0;

	return 1;
}

/*
 * Look up an entity and return its decimal value or -1 on failure (bad
 * formatting or couldn't find entity).
 * Handles both numeric (decimal and hex) and common named ones.
 */
int32_t
entity_find_iso(const struct lowdown_buf *buf)
{
	const struct ent	*e;

	if (!entity_sane(buf))
		return -1;

	if (buf->data[1] == '#')
		return entity_find_num(buf);

	if ((e = entity_find_named(buf)) == NULL)
		return -1;

	assert(e->unicode < INT32_MAX);
	return e->unicode;
}

/**
 * Look for the roff entity corresponding to "buf".  It will either
 * return a special character (which must be escaped using the usual
 * \(xx or whatever) or NULL.  If NULL and "iso" is -1, the character
 * couldn't be found.  If NULL and "iso" is >= 0, "iso" is a unicode
 * character number that must be further escaped.
 */
const char *
entity_find_nroff(const struct lowdown_buf *buf, int32_t *iso)
{
	const struct ent	*e;

	*iso = -1;

	if (!entity_sane(buf))
		return NULL;

	/* Numeric: keep the parsed value even if not in the table. */

	if (buf->data[1] == '#') {
		if ((*iso = entity_find_num(buf)) == -1)
			return NULL;
		e = entity_find_code(*iso);
		return e == NULL ? NULL : e->nroff;
	}

	if ((e = entity_find_named(buf)) == NULL)
		return NULL;

	assert(e->unicode < INT32_MAX);
	*iso = e->unicode;
	return e->nroff;
}

/*
 * Looks for the TeX entity corresponding to "buf".
 * If "buf" is a numerical code, looks it up by number; if an HTML (ISO)
 * code, looks it up by that.
 * Returns the entity or NULL on failure.
 * Sets the TeX flags in "fl" whenever a table entry is found, which
 * includes entries having no TeX representation (NULL is returned for
 * those).  Otherwise "fl" is left untouched.
 */
const char *
entity_find_tex(const struct lowdown_buf *buf, unsigned char *fl)
{
	const struct ent	*e;
	int32_t			 unicode;

	if (!entity_sane(buf))
		return NULL;

	if (buf->data[1] == '#') {
		if ((unicode = entity_find_num(buf)) == -1)
			return NULL;
		e = entity_find_code(unicode);
	} else
		e = entity_find_named(buf);

	if (e == NULL)
		return NULL;

	assert(e->unicode < INT32_MAX);
	*fl = e->texflags;
	return e->tex;
}