smartypants.c (11311B)
/* $Id$ */ /* * Copyright (c) 2020 Kristaps Dzonsons <kristaps@bsd.lv> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "config.h" #if HAVE_SYS_QUEUE # include <sys/queue.h> #endif #include <assert.h> #include <ctype.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "lowdown.h" #include "extern.h" enum entity { ENT_COPY, ENT_REG, ENT_TMARK, ENT_SMARK, ENT_ELLIP, ENT_MDASH, ENT_NDASH, ENT_LDQUO, ENT_RDQUO, ENT_LSQUO, ENT_RSQUO, ENT_FRAC14, ENT_FRAC12, ENT_FRAC34, ENT__MAX }; enum type { TYPE_ROOT, /* root (LOWDOWN_ROOT) */ TYPE_BLOCK, /* block-level */ TYPE_SPAN, /* span-level */ TYPE_OPAQUE, /* skip */ TYPE_TEXT /* text (LOWDOWN_NORMAL_TEXT) */ }; struct sym { const char *key; /* input in markdown */ enum entity ent; /* output entity */ }; struct smarty { int left_wb; /* left wordbreak */ }; static const char *ents[ENT__MAX] = { "©", /* ENT_COPY */ "®", /* ENT_REG */ "™", /* ENT_TMARK */ "℠", /* ENT_SMARK */ "…", /* ENT_ELLIP */ "—", /* ENT_MDASH */ "–", /* ENT_NDASH */ "“", /* ENT_LDQUO */ "”", /* ENT_RDQUO */ "‘", /* ENT_LSQUO */ "’", /* ENT_RSQUO */ "¼", /* ENT_FRAC14 */ "½", /* ENT_FRAC12 */ "¾", /* ENT_FRAC34 */ }; /* * Order is important: check the longest subset first. * (So basically "---" comes before "--".) */ static const struct sym syms[] = { { "(c)", ENT_COPY }, { "(C)", ENT_COPY }, { "(r)", ENT_REG }, { "(R)", ENT_REG }, { "(tm)", ENT_TMARK }, { "(TM)", ENT_TMARK }, { "(sm)", ENT_SMARK }, { "(SM)", ENT_SMARK }, { "...", ENT_ELLIP }, { ". . .", ENT_ELLIP }, { "---", ENT_MDASH }, { "--", ENT_NDASH }, { NULL, ENT__MAX } }; /* * Symbols that require word-break on both sides. * Again, order is important: longest-first. */ static const struct sym syms2[] = { { "1/4th", ENT_FRAC14 }, { "1/4", ENT_FRAC14 }, { "3/4ths", ENT_FRAC34 }, { "3/4th", ENT_FRAC34 }, { "3/4", ENT_FRAC34 }, { "1/2", ENT_FRAC12 }, { NULL, ENT__MAX } }; static const enum type types[LOWDOWN__MAX] = { TYPE_ROOT, /* LOWDOWN_ROOT */ TYPE_OPAQUE, /* LOWDOWN_BLOCKCODE */ TYPE_BLOCK, /* LOWDOWN_BLOCKQUOTE */ TYPE_BLOCK, /* LOWDOWN_DEFINITION */ TYPE_BLOCK, /* LOWDOWN_DEFINITION_TITLE */ TYPE_BLOCK, /* LOWDOWN_DEFINITION_DATA */ TYPE_BLOCK, /* LOWDOWN_HEADER */ TYPE_BLOCK, /* LOWDOWN_HRULE */ TYPE_BLOCK, /* LOWDOWN_LIST */ TYPE_BLOCK, /* LOWDOWN_LISTITEM */ TYPE_BLOCK, /* LOWDOWN_PARAGRAPH */ TYPE_BLOCK, /* LOWDOWN_TABLE_BLOCK */ TYPE_BLOCK, /* LOWDOWN_TABLE_HEADER */ TYPE_BLOCK, /* LOWDOWN_TABLE_BODY */ TYPE_BLOCK, /* LOWDOWN_TABLE_ROW */ TYPE_BLOCK, /* LOWDOWN_TABLE_CELL */ TYPE_OPAQUE, /* LOWDOWN_BLOCKHTML */ TYPE_OPAQUE, /* LOWDOWN_LINK_AUTO */ TYPE_OPAQUE, /* LOWDOWN_CODESPAN */ TYPE_SPAN, /* LOWDOWN_DOUBLE_EMPHASIS */ TYPE_SPAN, /* LOWDOWN_EMPHASIS */ TYPE_SPAN, /* LOWDOWN_HIGHLIGHT */ TYPE_SPAN, /* LOWDOWN_IMAGE */ TYPE_SPAN, /* LOWDOWN_LINEBREAK */ TYPE_SPAN, /* LOWDOWN_LINK */ TYPE_SPAN, /* LOWDOWN_TRIPLE_EMPHASIS */ TYPE_SPAN, /* LOWDOWN_STRIKETHROUGH */ TYPE_SPAN, /* LOWDOWN_SUPERSCRIPT */ TYPE_BLOCK, /* LOWDOWN_FOOTNOTE */ TYPE_OPAQUE, /* LOWDOWN_MATH_BLOCK */ TYPE_OPAQUE, /* LOWDOWN_RAW_HTML */ TYPE_OPAQUE, /* LOWDOWN_ENTITY */ TYPE_TEXT, /* LOWDOWN_NORMAL_TEXT */ TYPE_BLOCK, /* LOWDOWN_DOC_HEADER */ TYPE_BLOCK, /* LOWDOWN_META */ }; /* * Given the sequence in "n" starting at "start" and ending at "end", * split "n" around the sequence and replace it with "entity". * This behaves properly if the leading or trailing sequence is * zero-length. * It may modify the subtree rooted at the parent of "n". * Return zero on failure (memory), non-zero on success. */ static int smarty_entity(struct lowdown_node *n, size_t *maxn, size_t start, size_t end, enum entity entity) { struct lowdown_node *nn, *nent; assert(n->type == LOWDOWN_NORMAL_TEXT); /* Allocate the subsequent entity. */ nent = calloc(1, sizeof(struct lowdown_node)); if (nent == NULL) return 0; TAILQ_INSERT_AFTER(&n->parent->children, n, nent, entries); nent->id = (*maxn)++; nent->type = LOWDOWN_ENTITY; nent->parent = n->parent; TAILQ_INIT(&nent->children); nent->rndr_entity.text.data = strdup(ents[entity]); if (nent->rndr_entity.text.data == NULL) return 0; nent->rndr_entity.text.size = strlen(ents[entity]); /* Allocate the remaining bits, if applicable. */ if (n->rndr_normal_text.text.size - end > 0) { nn = calloc(1, sizeof(struct lowdown_node)); if (nn == NULL) return 0; TAILQ_INSERT_AFTER(&n->parent->children, nent, nn, entries); nn->id = (*maxn)++; nn->type = LOWDOWN_NORMAL_TEXT; nn->parent = n->parent; TAILQ_INIT(&nn->children); nn->rndr_normal_text.text.size = n->rndr_normal_text.text.size - end; nn->rndr_normal_text.text.data = malloc(nn->rndr_normal_text.text.size); if (nn->rndr_normal_text.text.data == NULL) return 0; memcpy(nn->rndr_normal_text.text.data, n->rndr_normal_text.text.data + end, nn->rndr_normal_text.text.size); } n->rndr_normal_text.text.size = start; return 1; } /* * Whether the character to the left of a word constitutes a word break * on its left side. * This is any space or opening punctuation. */ static int smarty_is_wb_l(char c) { return isspace((unsigned char)c) || '(' == c || '[' == c; } /* * Whether the character to the right of a word constitutes a word * break. * This is any space or punctuation. */ static int smarty_is_wb_r(char c) { return isspace((unsigned char)c) || ispunct((unsigned char)c); } /* * Recursive scan for next white-space. * If "skip" is set, we're on the starting node and shouldn't do a check * for white-space in ourselves. */ static int smarty_right_wb_r(const struct lowdown_node *n, int skip) { const struct lowdown_buf *b; const struct lowdown_node *nn; /* Check type of node. */ if (types[n->type] == TYPE_BLOCK) return 1; if (types[n->type] == TYPE_OPAQUE) return 0; if (!skip && types[n->type] == TYPE_TEXT && n->rndr_normal_text.text.size) { assert(n->type == LOWDOWN_NORMAL_TEXT); b = &n->rndr_normal_text.text; return smarty_is_wb_r(b->data[0]); } /* First scan down. */ if ((nn = TAILQ_FIRST(&n->children)) != NULL) return smarty_right_wb_r(nn, 0); /* Now scan back up. */ do { /* FIXME: don't go up to block. */ if ((nn = TAILQ_NEXT(n, entries)) != NULL) return smarty_right_wb_r(nn, 0); } while ((n = n->parent) != NULL); return 1; } /* * See if the character to the right of position "pos" in node "n" marks * the end of a word. * This may require us to traverse the node graph if we're on a node * boundary as well. */ static int smarty_right_wb(const struct lowdown_node *n, size_t pos) { const struct lowdown_buf *b; assert(n->type == LOWDOWN_NORMAL_TEXT); b = &n->rndr_normal_text.text; if (pos + 1 <= b->size) return smarty_is_wb_r(b->data[pos]); return smarty_right_wb_r(n, 1); } /* * FIXME: this can be faster with a table-based lookup instead of the * switch statement. * Returns >1 if a left-quote entity was inserted as the next node * of the parse tree, <0 on failure, otherwise return zero. */ static int smarty_hbuf(struct lowdown_node *n, size_t *maxn, struct lowdown_buf *b, struct smarty *s) { size_t i = 0, j, sz; assert(n->type == LOWDOWN_NORMAL_TEXT); for (i = 0; i < b->size; i++) { switch (b->data[i]) { case '.': case '(': case '-': /* Symbols that don't need wordbreak. */ for (j = 0; syms[j].key != NULL; j++) { sz = strlen(syms[j].key); if (i + sz - 1 >= b->size) continue; if (memcmp(syms[j].key, &b->data[i], sz)) continue; if (!smarty_entity(n, maxn, i, i + sz, syms[j].ent)) return -1; return 0; } break; case '"': /* Left-wb and right-wb differ. */ if (!s->left_wb) { if (!smarty_right_wb(n, i + 1)) break; if (!smarty_entity(n, maxn, i, i + 1, ENT_RDQUO)) return -1; return 0; } if (!smarty_entity (n, maxn, i, i + 1, ENT_LDQUO)) return -1; return 1; case '\'': /* Left-wb and right-wb differ. */ if (!s->left_wb) { if (!smarty_entity(n, maxn, i, i + 1, ENT_RSQUO)) return -1; return 0; } if (!smarty_entity (n, maxn, i, i + 1, ENT_LSQUO)) return -1; return 1; case '1': case '3': /* Symbols that require wb. */ if (!s->left_wb) break; for (j = 0; syms2[j].key != NULL; j++) { sz = strlen(syms2[j].key); if (i + sz - 1 >= b->size) continue; if (memcmp(syms2[j].key, &b->data[i], sz)) continue; if (!smarty_right_wb(n, i + sz)) continue; if (!smarty_entity(n, maxn, i, i + sz, syms2[j].ent)) return -1; return 0; } break; default: break; } s->left_wb = smarty_is_wb_l(b->data[i]); } return 0; } static int smarty_block(struct lowdown_node *, size_t *, enum lowdown_type); static int smarty_span(struct lowdown_node *root, size_t *maxn, struct smarty *s, enum lowdown_type type) { struct lowdown_node *n; int c; TAILQ_FOREACH(n, &root->children, entries) switch (types[n->type]) { case TYPE_TEXT: assert(n->type == LOWDOWN_NORMAL_TEXT); c = smarty_hbuf(n, maxn, &n->rndr_normal_text.text, s); if (c < 0) return 0; if (c > 0) n = TAILQ_NEXT(n, entries); break; case TYPE_SPAN: if (!smarty_span(n, maxn, s, type)) return 0; break; case TYPE_OPAQUE: s->left_wb = 0; break; case TYPE_BLOCK: if (!smarty_block(n, maxn, type)) return 0; break; case TYPE_ROOT: abort(); } return 1; } static int smarty_block(struct lowdown_node *root, size_t *maxn, enum lowdown_type type) { struct smarty s; struct lowdown_node *n; int c; s.left_wb = 1; TAILQ_FOREACH(n, &root->children, entries) switch (types[n->type]) { case TYPE_ROOT: case TYPE_BLOCK: if (!smarty_block(n, maxn, type)) return 0; break; case TYPE_TEXT: assert(n->type == LOWDOWN_NORMAL_TEXT); c = smarty_hbuf(n, maxn, &n->rndr_normal_text.text, &s); if (c < 0) return 0; if (c > 0) n = TAILQ_NEXT(n, entries); break; case TYPE_SPAN: if (!smarty_span(n, maxn, &s, type)) return 0; break; case TYPE_OPAQUE: s.left_wb = 0; break; default: break; } s.left_wb = 1; return 1; } int smarty(struct lowdown_node *n, size_t maxn, enum lowdown_type type) { if (n == NULL) return 1; assert(types[n->type] == TYPE_ROOT); return smarty_block(n, &maxn, type); }