diff.c (38103B)
/* $Id$ */
/*
 * Copyright (c) 2017--2021 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include "config.h"

#if HAVE_SYS_QUEUE
# include <sys/queue.h>
#endif
#include <sys/types.h>

#include <assert.h>
#include <ctype.h>
#include <float.h>
#include <math.h>
#if HAVE_MD5
# include <md5.h>
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lowdown.h"
#include "libdiff.h"
#include "extern.h"

/*
 * If "node" is not NULL, this represents our match attempts for a
 * single node in a node tree.  We basically use "optmatch" and "opt" to
 * keep trying to find the most optimal candidate in the other tree,
 * which ends up being "match".
 */
struct xnode {
	char sig[MD5_DIGEST_STRING_LENGTH]; /* MD5 of subtree + attrs */
	double weight; /* queue weight */
	const struct lowdown_node *node; /* basis node */
	const struct lowdown_node *match; /* matching node */
	size_t opt; /* match optimality */
	const struct lowdown_node *optmatch; /* current optimal */
};

/*
 * A map of all nodes in the current tree by their ID.  A map can have
 * holes (in which case the xnode's "node" is NULL) since we collapse
 * adjacent text nodes as a preprocess.
 */
struct xmap {
	struct xnode *nodes; /* holey table, indexed by node id */
	size_t maxsize; /* size of "nodes" */
	size_t maxid; /* max node id */
	size_t maxnodes; /* non-NULL count */
	double maxweight; /* largest node weight seen */
};

/*
 * Queue of nodes.  This is used in creating the priority queue of next
 * nodes to parse.
 */
struct pnode {
	const struct lowdown_node *node; /* priority node */
	TAILQ_ENTRY(pnode) entries;
};

/*
 * Convenience structure to hold maps we use when merging together the
 * trees.
 */
struct merger {
	const struct xmap *xoldmap; /* source xnodes */
	const struct xmap *xnewmap; /* destination xnodes */
	size_t id; /* maxid in new tree */
};

TAILQ_HEAD(pnodeq, pnode);

/*
 * A node used in computing the shortest edit script.
 */
struct sesnode {
	char *buf; /* buffer */
	size_t bufsz; /* length of buffer (less NUL) */
	int tailsp; /* whether there's trailing space */
	int headsp; /* whether there's leading space */
};

/* Fold the contents of buffer "v" into the running digest "ctx". */
static void
MD5Updatebuf(MD5_CTX *ctx, const struct lowdown_buf *v)
{
	assert(v != NULL);
	MD5Update(ctx, (const uint8_t *)v->data, v->size);
}

/* Fold "sz" raw bytes at "v" into the running digest "ctx". */
static void
MD5Updatev(MD5_CTX *ctx, const void *v, size_t sz)
{
	assert(v != NULL);
	MD5Update(ctx, (const unsigned char *)v, sz);
}

/*
 * If this returns non-zero, the node should be considered opaque and
 * we will not do any difference processing within it.  It will still be
 * marked with weight and signature from child nodes and interior data.
 */
static int
is_opaque(const struct lowdown_node *n)
{
	assert(n != NULL);
	return n->type == LOWDOWN_TABLE_BLOCK || n->type == LOWDOWN_META;
}

/*
 * Assign signatures and weights.
 * This is defined by "Phase 2" in sec. 5.2., along with the specific
 * heuristics given in the "Tuning" section.
 * We use the MD5 algorithm for computing hashes.
 * Returns the weight of the node rooted at "n".
 * If "parent" is not NULL, its hash is updated with the hash computed
 * for the current "n" and its children.
 * Return <0 on failure.
 */
static double
assign_sigs(MD5_CTX *parent, struct xmap *map,
	const struct lowdown_node *n, int ign)
{
	const struct lowdown_node *nn;
	ssize_t weight = -1; /* -1: no leaf text, use child weights */
	MD5_CTX ctx;
	double v = 0.0, vv;
	struct xnode *xn;
	struct xnode xntmp;
	void *pp;
	int ign_chld = ign;

	/*
	 * Get our node slot unless we're ignoring the node.
	 * Ignoring comes when a parent in our chain is opaque.
	 */
	if (!ign) {
		/* Grow the holey table so "n->id" fits (with slack). */
		if (n->id >= map->maxsize) {
			pp = recallocarray(map->nodes, map->maxsize,
				n->id + 64, sizeof(struct xnode));
			if (pp == NULL)
				return -1.0;
			map->nodes = pp;
			map->maxsize = n->id + 64;
		}
		xn = &map->nodes[n->id];
		assert(xn->node == NULL);
		assert(xn->weight == 0.0);
		xn->node = n;
		if (n->id > map->maxid)
			map->maxid = n->id;
		assert(map->maxid < map->maxsize);
		map->maxnodes++;
		/* Children of opaque nodes get no slots of their own. */
		ign_chld = is_opaque(n);
	}

	/* Recursive step: hash type, then hash/weigh children. */

	MD5Init(&ctx);
	MD5Updatev(&ctx, &n->type, sizeof(enum lowdown_rndrt));

	TAILQ_FOREACH(nn, &n->children, entries) {
		if ((vv = assign_sigs(&ctx, map, nn, ign_chld)) < 0.0)
			return vv;
		v += vv;
	}

	/* Re-assign "xn": child might have reallocated. */

	memset(&xntmp, 0, sizeof(struct xnode));
	xn = ign ? &xntmp : &map->nodes[n->id];
	xn->weight = v;

	/*
	 * Compute our weight.
	 * The weight is either the log of the contained text length for
	 * leaf nodes or the accumulated sub-element weight for
	 * non-terminal nodes plus one.
	 */
	switch (n->type) {
	case LOWDOWN_BLOCKCODE:
		weight = n->rndr_blockcode.text.size;
		break;
	case LOWDOWN_BLOCKHTML:
		weight = n->rndr_blockhtml.text.size;
		break;
	case LOWDOWN_LINK_AUTO:
		weight = n->rndr_autolink.link.size;
		break;
	case LOWDOWN_CODESPAN:
		weight = n->rndr_codespan.text.size;
		break;
	case LOWDOWN_META:
		weight = n->rndr_meta.key.size;
		break;
	case LOWDOWN_IMAGE:
		weight = n->rndr_image.link.size +
			n->rndr_image.title.size +
			n->rndr_image.dims.size +
			n->rndr_image.alt.size;
		break;
	case LOWDOWN_RAW_HTML:
		weight = n->rndr_raw_html.text.size;
		break;
	case LOWDOWN_NORMAL_TEXT:
		weight = n->rndr_normal_text.text.size;
		break;
	case LOWDOWN_ENTITY:
		weight = n->rndr_entity.text.size;
		break;
	default:
		break;
	}

	/* Weight can be zero if text size is zero. */

	if (weight >= 0)
		xn->weight = 1.0 + (weight == 0 ? 0.0 : log(weight));
	else
		xn->weight += 1.0;

	/*
	 * Augment our signature from our attributes.
	 * This depends upon the node.
	 * Avoid using attributes that are "mutable" relative to the
	 * generated output, e.g., list display numbers.
	 */
	switch (n->type) {
	case LOWDOWN_LIST:
		MD5Updatev(&ctx, &n->rndr_list.flags,
			sizeof(enum hlist_fl));
		break;
	case LOWDOWN_LISTITEM:
		MD5Updatev(&ctx, &n->rndr_listitem.flags,
			sizeof(enum hlist_fl));
		MD5Updatev(&ctx, &n->rndr_listitem.num,
			sizeof(size_t));
		break;
	case LOWDOWN_HEADER:
		MD5Updatev(&ctx, &n->rndr_header.level,
			sizeof(size_t));
		break;
	case LOWDOWN_NORMAL_TEXT:
		MD5Updatebuf(&ctx, &n->rndr_normal_text.text);
		break;
	case LOWDOWN_META:
		MD5Updatebuf(&ctx, &n->rndr_meta.key);
		break;
	case LOWDOWN_ENTITY:
		MD5Updatebuf(&ctx, &n->rndr_entity.text);
		break;
	case LOWDOWN_LINK_AUTO:
		MD5Updatebuf(&ctx, &n->rndr_autolink.link);
		MD5Updatev(&ctx, &n->rndr_autolink.type,
			sizeof(enum halink_type));
		break;
	case LOWDOWN_RAW_HTML:
		MD5Updatebuf(&ctx, &n->rndr_raw_html.text);
		break;
	case LOWDOWN_LINK:
		MD5Updatebuf(&ctx, &n->rndr_link.link);
		MD5Updatebuf(&ctx, &n->rndr_link.title);
		break;
	case LOWDOWN_BLOCKCODE:
		MD5Updatebuf(&ctx, &n->rndr_blockcode.text);
		MD5Updatebuf(&ctx, &n->rndr_blockcode.lang);
		break;
	case LOWDOWN_CODESPAN:
		MD5Updatebuf(&ctx, &n->rndr_codespan.text);
		break;
	case LOWDOWN_TABLE_HEADER:
		MD5Updatev(&ctx, &n->rndr_table_header.columns,
			sizeof(size_t));
		break;
	case LOWDOWN_TABLE_CELL:
		MD5Updatev(&ctx, &n->rndr_table_cell.flags,
			sizeof(enum htbl_flags));
		MD5Updatev(&ctx, &n->rndr_table_cell.col,
			sizeof(size_t));
		break;
	case LOWDOWN_IMAGE:
		MD5Updatebuf(&ctx, &n->rndr_image.link);
		MD5Updatebuf(&ctx, &n->rndr_image.title);
		MD5Updatebuf(&ctx, &n->rndr_image.dims);
		MD5Updatebuf(&ctx, &n->rndr_image.alt);
		break;
	case LOWDOWN_MATH_BLOCK:
		MD5Updatev(&ctx, &n->rndr_math.blockmode,
			sizeof(int));
		break;
	case LOWDOWN_BLOCKHTML:
		MD5Updatebuf(&ctx, &n->rndr_blockhtml.text);
		break;
	default:
		break;
	}

	MD5End(&ctx, xn->sig);

	/* Fold our signature into the parent's digest (less the NUL). */

	if (parent != NULL)
		MD5Update(parent, (uint8_t *)xn->sig,
			MD5_DIGEST_STRING_LENGTH - 1);

	if (xn->weight > map->maxweight)
		map->maxweight = xn->weight;

	assert(isfinite(xn->weight));
	assert(isnormal(xn->weight));
	assert(xn->weight > 0.0);
	return xn->weight;
}

/*
 * Enqueue "n" into a priority queue "pq".
 * Priority is given to weights; and if weights are equal, then
 * proximity to the parse root given by a pre-order identity.
 * FIXME: use a priority heap.
 * Return zero on failure, non-zero on success.
 */
static int
pqueue(const struct lowdown_node *n, struct xmap *map,
	struct pnodeq *pq)
{
	struct pnode *p, *pp;
	struct xnode *xnew, *xold;

	if ((p = malloc(sizeof(struct pnode))) == NULL)
		return 0;
	p->node = n;
	xnew = &map->nodes[n->id];
	assert(xnew != NULL);
	assert(xnew->node != NULL);

	/* Find the first queued node we do not out-weigh. */

	TAILQ_FOREACH(pp, pq, entries) {
		xold = &map->nodes[pp->node->id];
		assert(xold->node != NULL);
		if (xnew->weight >= xold->weight)
			break;
	}
	if (pp == NULL) {
		TAILQ_INSERT_TAIL(pq, p, entries);
		return 1;
	} else if (xnew->weight > xold->weight) {
		TAILQ_INSERT_BEFORE(pp, p, entries);
		return 1;
	}

	/* Equal weight: break ties by pre-order identifier. */

	for (; pp != NULL; pp = TAILQ_NEXT(pp, entries)) {
		assert(p->node->id != pp->node->id);
		if (p->node->id < pp->node->id)
			break;
	}
	if (pp == NULL)
		TAILQ_INSERT_TAIL(pq, p, entries);
	else
		TAILQ_INSERT_BEFORE(pp, p, entries);
	return 1;
}

/*
 * Candidate optimality between "xnew" and "xold" as described in "Phase
 * 3" of sec. 5.2.
 * This also uses the heuristic described in "Tuning" for how many
 * levels to search upward.
 */
static size_t
optimality(struct xnode *xnew, struct xmap *xnewmap,
	struct xnode *xold, struct xmap *xoldmap)
{
	size_t opt = 1, d, i = 0;

	/* Height: log(n) * W/W_0 or at least 1. */

	d = ceil(log(xnewmap->maxnodes) *
		xnew->weight / xnewmap->maxweight);
	if (d == 0)
		d = 1;

	/* FIXME: are we supposed to bound to "d"? */

	/* Count matched ancestor pairs along both parent chains. */

	while (xnew->node->parent != NULL &&
	       xold->node->parent != NULL && i < d) {
		xnew = &xnewmap->nodes[xnew->node->parent->id];
		xold = &xoldmap->nodes[xold->node->parent->id];
		if (xnew->match != NULL && xnew->match == xold->node)
			opt++;
		i++;
	}
	return opt;
}

/*
 * Compute the candidacy of "xnew" to "xold" as described in "Phase 3"
 * of sec. 5.2 and using the optimality() function as a basis.
 * If "xnew" does not have a match assigned (no prior candidacy), assign
 * it immediately to "xold".
 * If it does, then compute the optimality and select the greater of the
 * two optimalities.
 * As an extension to the paper, if the optimalities are equal, use the
 * "closer" node to the current identifier.
 */
static void
candidate(struct xnode *xnew, struct xmap *xnewmap,
	struct xnode *xold, struct xmap *xoldmap)
{
	size_t opt;
	long long dnew, dold;

	assert(xnew->node != NULL);
	assert(xold->node != NULL);

	/* First candidate: accept unconditionally. */

	if (xnew->optmatch == NULL) {
		xnew->optmatch = xold->node;
		xnew->opt = optimality(xnew, xnewmap, xold, xoldmap);
		return;
	}

	opt = optimality(xnew, xnewmap, xold, xoldmap);

	if (opt == xnew->opt) {
		/*
		 * Use a simple norm over the identifier space.
		 * Choose the lesser of the norms.
		 */
		dold = llabs((long long)
			(xnew->optmatch->id - xnew->node->id));
		dnew = llabs((long long)
			(xold->node->id - xnew->node->id));
		if (dold > dnew) {
			xnew->optmatch = xold->node;
			xnew->opt = opt;
		}
	} else if (opt > xnew->opt) {
		xnew->optmatch = xold->node;
		xnew->opt = opt;
	}
}

/*
 * Do the two internal nodes equal each other?
 * This depends upon the node type.
 * By default, all similarly-labelled (typed) nodes are equal.
 */
static int
match_eq(const struct lowdown_node *n1,
	const struct lowdown_node *n2)
{

	if (n1->type != n2->type)
		return 0;

	/* Some types compare immutable attributes as well. */

	switch (n1->type) {
	case LOWDOWN_LINK:
		if (!hbuf_eq(&n1->rndr_link.link,
		    &n2->rndr_link.link))
			return 0;
		if (!hbuf_eq(&n1->rndr_link.title,
		    &n2->rndr_link.title))
			return 0;
		break;
	case LOWDOWN_HEADER:
		if (n1->rndr_header.level != n2->rndr_header.level)
			return 0;
		break;
	case LOWDOWN_META:
		if (!hbuf_eq(&n1->rndr_meta.key,
		    &n2->rndr_meta.key))
			return 0;
		break;
	case LOWDOWN_LISTITEM:
		if (n1->rndr_listitem.num != n2->rndr_listitem.num)
			return 0;
		if (n1->rndr_listitem.flags != n2->rndr_listitem.flags)
			return 0;
		break;
	default:
		break;
	}
	return 1;
}

/*
 * Return non-zero if this node is the only child.
 */
static int
match_singleton(const struct lowdown_node *n)
{

	if (n->parent == NULL)
		return 1;
	/* Next and prev are both NULL (hence equal) iff no siblings. */
	return TAILQ_NEXT(n, entries) ==
	       TAILQ_PREV(n, lowdown_nodeq, entries);
}

/*
 * Algorithm to "propogate up" according to "Phase 3" of sec. 5.2.
 * This also uses the heuristic described in "Tuning" for how many
 * levels to search upward.
 * I augment this by making singleton children pass upward.
 * FIXME: right now, this doesn't clobber existing upward matches.  Is
 * that correct behaviour?
 */
static void
match_up(struct xnode *xnew, struct xmap *xnewmap,
	struct xnode *xold, struct xmap *xoldmap)
{
	size_t d, i = 0;

	/* Height: log(n) * W/W_0 or at least 1. */

	d = ceil(log(xnewmap->maxnodes) *
		xnew->weight / xnewmap->maxweight);
	if (d == 0)
		d = 1;

	/* Match ancestor pairs upward while the "labels" agree. */

	while (xnew->node->parent != NULL &&
	       xold->node->parent != NULL && i < d) {
		/* Are the "labels" the same? */
		if (!match_eq(xnew->node->parent, xold->node->parent))
			break;
		xnew = &xnewmap->nodes[xnew->node->parent->id];
		xold = &xoldmap->nodes[xold->node->parent->id];
		if (xold->match != NULL || xnew->match != NULL)
			break;
		xnew->match = xold->node;
		xold->match = xnew->node;
		i++;
	}

	/* Only pass singletons up if we exhausted the full height. */

	if (i != d)
		return;

	/*
	 * Pass up singletons.
	 * This is an extension of the algorithm.
	 */
	while (xnew->node->parent != NULL &&
	       xold->node->parent != NULL) {
		if (!match_singleton(xnew->node) ||
		    !match_singleton(xold->node))
			break;
		if (!match_eq(xnew->node->parent, xold->node->parent))
			break;
		xnew = &xnewmap->nodes[xnew->node->parent->id];
		xold = &xoldmap->nodes[xold->node->parent->id];
		if (xold->match != NULL || xnew->match != NULL)
			break;
		xnew->match = xold->node;
		xold->match = xnew->node;
	}
}

/*
 * Algorithm that "propogates down" according to "Phase 3" of sec. 5.2.
 * This (recursively) makes sure that a matched tree has all of the
 * subtree nodes also matched.
 */
static void
match_down(struct xnode *xnew, struct xmap *xnewmap,
	struct xnode *xold, struct xmap *xoldmap)
{
	struct lowdown_node *nnew, *nold;

	/*
	 * If we're matching into a component that has already been
	 * matched, we're in the subtree proper (the subtree root is
	 * checked that it's not already matched) and the fact that this
	 * is within a match indicates we're more the "larger" of the
	 * matches, so unset its match status.
	 */
	if (xold->match != NULL) {
		assert(xold->node ==
			xnewmap->nodes[xold->match->id].match);
		xnewmap->nodes[xold->match->id].match = NULL;
		xold->match = NULL;
	}

	assert(xnew->match == NULL);
	assert(xold->match == NULL);
	xnew->match = xold->node;
	xold->match = xnew->node;

	/* Never descend into opaque subtrees. */

	if (is_opaque(xnew->node)) {
		assert(is_opaque(xold->node));
		return;
	}

	/*
	 * Matched signatures guarantee structurally identical
	 * subtrees, so walk the child lists in lock-step.
	 */
	nnew = TAILQ_FIRST(&xnew->node->children);
	nold = TAILQ_FIRST(&xold->node->children);
	while (nnew != NULL) {
		assert(NULL != nold);
		xnew = &xnewmap->nodes[nnew->id];
		xold = &xoldmap->nodes[nold->id];
		match_down(xnew, xnewmap, xold, xoldmap);
		nnew = TAILQ_NEXT(nnew, entries);
		nold = TAILQ_NEXT(nold, entries);
	}
	assert(nold == NULL);
}

/*
 * Clone a single node and all of its "attributes".
 * That is, its type and "leaf node" data.
 * Assign the identifier as given.
 * Note that some attributes, such as the table column array, aren't
 * copied.
 * We'll re-create those later.
*/ static struct lowdown_node * node_clone(const struct lowdown_node *v, size_t id) { struct lowdown_node *n; int rc = 1; size_t i; if ((n = calloc(1, sizeof(struct lowdown_node))) == NULL) return NULL; TAILQ_INIT(&n->children); n->type = v->type; n->id = id; switch (n->type) { case LOWDOWN_DEFINITION: n->rndr_definition.flags = v->rndr_definition.flags; break; case LOWDOWN_META: rc = hbuf_clone(&v->rndr_meta.key, &n->rndr_meta.key); break; case LOWDOWN_LIST: n->rndr_list.flags = v->rndr_list.flags; break; case LOWDOWN_LISTITEM: n->rndr_listitem.flags = v->rndr_listitem.flags; n->rndr_listitem.num = v->rndr_listitem.num; break; case LOWDOWN_HEADER: n->rndr_header.level = v->rndr_header.level; break; case LOWDOWN_NORMAL_TEXT: rc = hbuf_clone(&v->rndr_normal_text.text, &n->rndr_normal_text.text); break; case LOWDOWN_ENTITY: rc = hbuf_clone(&v->rndr_entity.text, &n->rndr_entity.text); break; case LOWDOWN_LINK_AUTO: rc = hbuf_clone(&v->rndr_autolink.link, &n->rndr_autolink.link); n->rndr_autolink.type = v->rndr_autolink.type; break; case LOWDOWN_RAW_HTML: rc = hbuf_clone(&v->rndr_raw_html.text, &n->rndr_raw_html.text); break; case LOWDOWN_LINK: rc = hbuf_clone(&v->rndr_link.link, &n->rndr_link.link) && hbuf_clone(&v->rndr_link.title, &n->rndr_link.title); break; case LOWDOWN_BLOCKCODE: rc = hbuf_clone(&v->rndr_blockcode.text, &n->rndr_blockcode.text) && hbuf_clone(&v->rndr_blockcode.lang, &n->rndr_blockcode.lang); break; case LOWDOWN_CODESPAN: rc = hbuf_clone(&v->rndr_codespan.text, &n->rndr_codespan.text); break; case LOWDOWN_TABLE_BLOCK: n->rndr_table.columns = v->rndr_table.columns; break; case LOWDOWN_TABLE_HEADER: n->rndr_table_header.columns = v->rndr_table_header.columns; n->rndr_table_header.flags = calloc (n->rndr_table_header.columns, sizeof(enum htbl_flags)); if (n->rndr_table_header.flags == NULL) return NULL; for (i = 0; i < n->rndr_table_header.columns; i++) n->rndr_table_header.flags[i] = v->rndr_table_header.flags[i]; break; case LOWDOWN_TABLE_CELL: 
n->rndr_table_cell.flags = v->rndr_table_cell.flags; n->rndr_table_cell.col = v->rndr_table_cell.col; n->rndr_table_cell.columns = v->rndr_table_cell.columns; break; case LOWDOWN_IMAGE: rc = hbuf_clone(&v->rndr_image.link, &n->rndr_image.link) && hbuf_clone(&v->rndr_image.title, &n->rndr_image.title) && hbuf_clone(&v->rndr_image.dims, &n->rndr_image.dims) && hbuf_clone(&v->rndr_image.alt, &n->rndr_image.alt); break; case LOWDOWN_MATH_BLOCK: n->rndr_math.blockmode = v->rndr_math.blockmode; break; case LOWDOWN_BLOCKHTML: rc = hbuf_clone(&v->rndr_blockhtml.text, &n->rndr_blockhtml.text); break; default: break; } if (!rc) { lowdown_node_free(n); n = NULL; } return n; } /* * Take the sub-tree "v" and clone it and all of the nodes beneath it, * returning the cloned node. * This starts using identifiers at "id". */ static struct lowdown_node * node_clonetree(const struct lowdown_node *v, size_t *id) { struct lowdown_node *n, *nn; const struct lowdown_node *vv; if ((n = node_clone(v, *id++)) == NULL) return NULL; TAILQ_FOREACH(vv, &v->children, entries) { if ((nn = node_clonetree(vv, id)) == NULL) goto out; TAILQ_INSERT_TAIL(&n->children, nn, entries); nn->parent = n; } return n; out: lowdown_node_free(n); return NULL; } /* * Count the number of words in a normal-text node. */ static size_t node_countwords(const struct lowdown_node *n) { const char *cp; size_t i = 0, sz, words = 0; cp = n->rndr_normal_text.text.data; sz = n->rndr_normal_text.text.size; /* Skip leading space. */ while (i < sz && isspace((unsigned char)cp[i])) i++; /* First go through word, then trailing space. */ while (i < sz) { assert(!isspace((unsigned char)cp[i])); words++; while (i < sz && !isspace((unsigned char)cp[i])) i++; while (i < sz && isspace((unsigned char)cp[i])) i++; } return words; } /* * Like node_countwords(), except dupping individual words into a * structure. * Return zero on failure (memory), non-zero on success. 
 */
static int
node_tokenise(const struct lowdown_node *n, struct sesnode *toks,
	size_t toksz, char **savep)
{
	char *cp;
	size_t i = 0, sz, words = 0;

	/* "*savep" receives the backing buffer; caller frees it. */

	*savep = NULL;
	if (toksz == 0)
		return 1;

	sz = n->rndr_normal_text.text.size;
	*savep = cp = malloc(sz + 1);
	if (cp == NULL)
		return 0;
	memcpy(cp, n->rndr_normal_text.text.data, sz);
	cp[sz] = '\0';
	*savep = cp;

	/* Record leading space, then skip past it. */

	if (i < sz)
		toks[0].headsp = isspace((unsigned char)cp[0]);
	while (i < sz && isspace((unsigned char)cp[i]))
		i++;

	/* NUL-terminate each word in place and note its extent. */

	while (i < sz) {
		assert(words < toksz);
		assert(!isspace((unsigned char)cp[i]));
		toks[words].buf = &cp[i];
		toks[words].bufsz = 0;
		while (i < sz && !isspace((unsigned char)cp[i])) {
			toks[words].bufsz++;
			i++;
		}
		words++;
		if (i == sz)
			break;
		toks[words - 1].tailsp = 1;
		assert(isspace((unsigned char)cp[i]));
		cp[i++] = '\0';
		while (i < sz && isspace((unsigned char)cp[i]))
			i++;
	}
	return 1;
}

/*
 * Word comparator for diff(): returns non-zero when the two tokens
 * are equal (same length, same bytes), zero otherwise.
 */
static int
node_word_cmp(const void *p1, const void *p2)
{
	const struct sesnode *l1 = p1, *l2 = p2;

	if (l1->bufsz != l2->bufsz)
		return 0;
	return 0 == strncmp(l1->buf, l2->buf, l1->bufsz);
}

/*
 * Return zero on failure (memory), non-zero on success.
 */
static int
node_lcs(const struct lowdown_node *nold,
	const struct lowdown_node *nnew,
	struct lowdown_node *n, size_t *id)
{
	const struct sesnode *tmp;
	struct lowdown_node *nn;
	struct sesnode *newtok = NULL, *oldtok = NULL;
	char *newtokbuf = NULL, *oldtokbuf = NULL;
	size_t i, newtoksz, oldtoksz;
	struct diff d;
	int rc = 0;

	memset(&d, 0, sizeof(struct diff));

	/* Tokenise both text nodes into word arrays. */

	newtoksz = node_countwords(nnew);
	oldtoksz = node_countwords(nold);

	/* NOTE(review): calloc(0, ...) may return NULL and be treated
	 * as failure here when a text node is all whitespace — confirm
	 * against upstream behaviour. */
	newtok = calloc(newtoksz, sizeof(struct sesnode));
	if (newtok == NULL)
		goto out;
	oldtok = calloc(oldtoksz, sizeof(struct sesnode));
	if (oldtok == NULL)
		goto out;

	if (!node_tokenise(nnew, newtok, newtoksz, &newtokbuf))
		goto out;
	if (!node_tokenise(nold, oldtok, oldtoksz, &oldtokbuf))
		goto out;

	/* Word-level shortest edit script between old and new. */

	if (!diff(&d, node_word_cmp, sizeof(struct sesnode),
	    oldtok, oldtoksz, newtok, newtoksz))
		goto out;

	/*
	 * Emit one LOWDOWN_NORMAL_TEXT node per SES entry, plus
	 * single-space text nodes for recorded head/tail spacing.
	 */
	for (i = 0; i < d.sessz; i++) {
		tmp = d.ses[i].e;
		if (tmp->headsp) {
			nn = calloc(1, sizeof(struct lowdown_node));
			if (nn == NULL)
				goto out;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			TAILQ_INIT(&nn->children);
			nn->type = LOWDOWN_NORMAL_TEXT;
			nn->id = (*id)++;
			nn->parent = n;
			nn->rndr_normal_text.text.size = 1;
			nn->rndr_normal_text.text.data = strdup(" ");
			if (nn->rndr_normal_text.text.data == NULL)
				goto out;
		}
		nn = calloc(1, sizeof(struct lowdown_node));
		if (nn == NULL)
			goto out;
		TAILQ_INSERT_TAIL(&n->children, nn, entries);
		TAILQ_INIT(&nn->children);
		nn->type = LOWDOWN_NORMAL_TEXT;
		nn->id = (*id)++;
		nn->parent = n;
		nn->rndr_normal_text.text.size = tmp->bufsz;
		nn->rndr_normal_text.text.data =
			calloc(1, tmp->bufsz + 1);
		if (nn->rndr_normal_text.text.data == NULL)
			goto out;
		memcpy(nn->rndr_normal_text.text.data,
			tmp->buf, tmp->bufsz);
		nn->chng = DIFF_DELETE == d.ses[i].type ?
			LOWDOWN_CHNG_DELETE :
			DIFF_ADD == d.ses[i].type ?
			LOWDOWN_CHNG_INSERT : LOWDOWN_CHNG_NONE;
		if (tmp->tailsp) {
			nn = calloc(1, sizeof(struct lowdown_node));
			if (nn == NULL)
				goto out;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			TAILQ_INIT(&nn->children);
			nn->type = LOWDOWN_NORMAL_TEXT;
			nn->id = (*id)++;
			nn->parent = n;
			nn->rndr_normal_text.text.size = 1;
			nn->rndr_normal_text.text.data = strdup(" ");
			if (nn->rndr_normal_text.text.data == NULL)
				goto out;
		}
	}
	rc = 1;
out:
	free(d.ses);
	free(d.lcs);
	free(newtok);
	free(oldtok);
	free(newtokbuf);
	free(oldtokbuf);
	return rc;
}

/*
 * Merge the new tree "nnew" with the old "nold" using a depth-first
 * algorithm.
 * The produced tree will show the new tree with deleted nodes from the
 * old and inserted ones.
 * It will also show moved nodes by delete/add pairs.
 * This uses "Phase 5" semantics, but implements the merge algorithm
 * without notes from the paper.
 */
static struct lowdown_node *
node_merge(const struct lowdown_node *nold,
	const struct lowdown_node *nnew,
	struct merger *parms)
{
	const struct xnode *xnew, *xold;
	struct lowdown_node *n, *nn;
	const struct lowdown_node *nnold;
	const struct xmap *xoldmap = parms->xoldmap,
	      *xnewmap = parms->xnewmap;

	/*
	 * Invariant: the current nodes are matched.
	 * Start by putting that node into the current output.
	 */
	assert(nnew != NULL && nold != NULL );
	xnew = &xnewmap->nodes[nnew->id];
	xold = &xoldmap->nodes[nold->id];
	assert(xnew->match != NULL);
	assert(xold->match != NULL);
	assert(xnew->match == xold->node);

	if ((n = node_clone(nnew, parms->id++)) == NULL)
		goto err;

	/* Now walk through the children on both sides. */

	nold = TAILQ_FIRST(&nold->children);
	nnew = TAILQ_FIRST(&nnew->children);

	while (nnew != NULL) {
		/*
		 * Begin by flushing out all of the nodes that have been
		 * deleted from the old tree at this level.
		 * According to the paper, deleted nodes have no match.
		 * These will leave us with old nodes that are in the
		 * new tree (not necessarily at this level, though).
		 * Text nodes are withheld for the LCS step below.
		 */
		while (nold != NULL) {
			xold = &xoldmap->nodes[nold->id];
			if (xold->match != NULL ||
			    LOWDOWN_NORMAL_TEXT == nold->type)
				break;
			if ((nn = node_clonetree
			    (nold, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_DELETE;
			nold = TAILQ_NEXT(nold, entries);
		}

		/*
		 * Now flush inserted nodes.
		 * According to the paper, these have no match.
		 * This leaves us with nodes that are matched somewhere
		 * (not necessarily at this level) with the old.
		 */
		while (nnew != NULL) {
			xnew = &xnewmap->nodes[nnew->id];
			if (xnew->match != NULL ||
			    LOWDOWN_NORMAL_TEXT == nnew->type)
				break;
			if ((nn = node_clonetree
			    (nnew, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_INSERT;
			nnew = TAILQ_NEXT(nnew, entries);
		}

		/*
		 * If both nodes are text nodes, then we want to run the
		 * LCS algorithm on them.
		 * This is an extension of the BULD algorithm.
		 */
		if (nold != NULL && nnew != NULL &&
		    nold->type == LOWDOWN_NORMAL_TEXT &&
		    xold->match == NULL &&
		    nnew->type == LOWDOWN_NORMAL_TEXT &&
		    xnew->match == NULL) {
			if (!node_lcs(nold, nnew, n, &parms->id))
				goto err;
			nold = TAILQ_NEXT(nold, entries);
			nnew = TAILQ_NEXT(nnew, entries);
		}

		/* Flush any remaining unmatched old (text) nodes. */

		while (nold != NULL) {
			xold = &xoldmap->nodes[nold->id];
			if (xold->match != NULL)
				break;
			if ((nn = node_clonetree
			    (nold, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_DELETE;
			nold = TAILQ_NEXT(nold, entries);
		}

		/* Flush any remaining unmatched new (text) nodes. */

		while (nnew != NULL) {
			xnew = &xnewmap->nodes[nnew->id];
			if (xnew->match != NULL)
				break;
			if ((nn = node_clonetree
			    (nnew, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_INSERT;
			nnew = TAILQ_NEXT(nnew, entries);
		}

		/* Nothing more to do at this level? */

		if (nnew == NULL)
			break;

		/*
		 * Now we take the current new node and see if it's a
		 * match with a node in the current level.
		 * If it is, then we can flush out old nodes (moved,
		 * which we call deleted and re-inserted) until we get
		 * to the matching one.
		 * Then we'll be in lock-step with the old tree.
		 */
		xnew = &xnewmap->nodes[nnew->id];
		assert(xnew->match != NULL);

		/* Scan ahead to find a matching old. */

		for (nnold = nold; nnold != NULL ; ) {
			xold = &xoldmap->nodes[nnold->id];
			if (xnew->node == xold->match)
				break;
			nnold = TAILQ_NEXT(nnold, entries);
		}

		/*
		 * We did not find a match.
		 * This means that the new node has been moved from
		 * somewhere else in the tree.
		 */
		if (nnold == NULL) {
			if ((nn = node_clonetree
			    (nnew, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_INSERT;
			nnew = TAILQ_NEXT(nnew, entries);
			continue;
		}

		/* Match found: flush old nodes til the match. */

		while (nold != NULL) {
			xold = &xoldmap->nodes[nold->id];
			if (xnew->node == xold->match)
				break;
			if ((nn = node_clonetree
			    (nold, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
			nn->chng = LOWDOWN_CHNG_DELETE;
			nold = TAILQ_NEXT(nold, entries);
		}
		assert(nold != NULL);

		/*
		 * Now we're in lock-step.
		 * Do the recursive step between the matched pair.
		 * Then continue on to the next nodes.
		 * Opaque nodes are copied wholesale, not recursed.
		 */
		if (is_opaque(nnew)) {
			assert(is_opaque(nold));
			if ((nn = node_clonetree
			    (nnew, &parms->id)) == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
		} else {
			assert(!is_opaque(nold));
			nn = node_merge(nold, nnew, parms);
			if (nn == NULL)
				goto err;
			TAILQ_INSERT_TAIL(&n->children, nn, entries);
			nn->parent = n;
		}

		nold = TAILQ_NEXT(nold, entries);
		nnew = TAILQ_NEXT(nnew, entries);
	}

	/* Flush remaining old nodes. */

	while (nold != NULL) {
		if ((nn = node_clonetree
		    (nold, &parms->id)) == NULL)
			goto err;
		TAILQ_INSERT_TAIL(&n->children, nn, entries);
		nn->parent = n;
		nn->chng = LOWDOWN_CHNG_DELETE;
		nold = TAILQ_NEXT(nold, entries);
	}

	return n;
err:
	lowdown_node_free(n);
	return NULL;
}

/*
 * Optimise from top down.
 * This works by selecting matching non-terminal nodes, both adjacent
 * (i.e., children of the same adjacent nodes), and seeing if their
 * immediate siblings may be matched by label.
 * This works well when looking at pure-paragraph changes.
 */
static void
node_optimise_topdown(const struct lowdown_node *n,
	struct xmap *newmap, struct xmap *oldmap)
{
	struct xnode *xn, *xmatch, *xnchild, *xmchild,
		*xnnext, *xmnext;
	const struct lowdown_node *match, *nchild, *mchild,
		*nnext, *mnext;

	/* Only descend through matched non-opaque interior nodes. */

	if (is_opaque(n) || TAILQ_EMPTY(&n->children))
		return;

	xn = &newmap->nodes[n->id];
	assert(xn != NULL);

	if ((match = xn->match) == NULL)
		return;

	xmatch = &oldmap->nodes[match->id];
	assert(xmatch != NULL);

	TAILQ_FOREACH(nchild, &n->children, entries) {
		if (is_opaque(nchild) ||
		    TAILQ_EMPTY(&nchild->children))
			continue;
		xnchild = &newmap->nodes[nchild->id];
		assert(xnchild != NULL);
		if ((mchild = xnchild->match) == NULL)
			continue;
		/* The match must live under our own matched parent. */
		if (mchild->parent->id != match->id)
			continue;
		xmchild = &oldmap->nodes[mchild->id];
		assert(xmchild != NULL);

		/*
		 * Do we have a non-terminal sibling after us without a
		 * match?
		 */
		if ((nnext = TAILQ_NEXT(nchild, entries)) == NULL)
			continue;
		if (is_opaque(nnext) ||
		    TAILQ_EMPTY(&nnext->children))
			continue;
		xnnext = &newmap->nodes[nnext->id];
		assert(xnnext != NULL);
		if (xnnext->match != NULL)
			continue;

		if ((mnext = TAILQ_NEXT(mchild, entries)) == NULL)
			continue;
		if (is_opaque(mnext) ||
		    TAILQ_EMPTY(&mnext->children))
			continue;
		xmnext = &oldmap->nodes[mnext->id];
		assert(xmnext != NULL);
		if (xmnext->match != NULL)
			continue;

		/* Same label on both siblings: pair them up. */

		if (!match_eq(nnext, mnext))
			continue;

		xnnext->match = mnext;
		xmnext->match = nnext;
	}

	TAILQ_FOREACH(nchild, &n->children, entries)
		node_optimise_topdown(nchild, newmap, oldmap);
}

/*
 * Optimise bottom-up over all un-matched nodes: examine all the
 * children of the un-matched nodes and see which of their matches, if
 * found, are under a root that's the same node as we are.
* This lets us compute the largest fraction of un-matched nodes' * children that are in the same tree. * If that fraction is >50%, then we consider that the subtrees are * matched. */ static void node_optimise_bottomup(const struct lowdown_node *n, struct xmap *newmap, struct xmap *oldmap) { const struct lowdown_node *nn, *on, *nnn, *maxn = NULL; double w, maxw = 0.0, tw = 0.0; /* Ignore opaque nodes. */ if (is_opaque(n) || TAILQ_EMPTY(&n->children)) return; /* Do a depth-first pre-order search. */ TAILQ_FOREACH(nn, &n->children, entries) { tw += newmap->nodes[nn->id].weight; node_optimise_bottomup(nn, newmap, oldmap); } /* * We're now at a non-leaf node. * If we're already matched, then move on. */ if (newmap->nodes[n->id].match != NULL) return; TAILQ_FOREACH(nn, &n->children, entries) { if (newmap->nodes[nn->id].match == NULL) continue; if ((on = newmap->nodes[nn->id].match->parent) == NULL) continue; if (on == maxn) continue; if (!match_eq(n, on)) continue; /* * We've now established "on" as the parent of the * matched node, and that "on" is equivalent. * See what fraction of on's children are matched to our * children. * FIXME: this will harmlessly (except in time) look at * the same parent multiple times. */ w = 0.0; TAILQ_FOREACH(nnn, &n->children, entries) { if (newmap->nodes[nnn->id].match == NULL) continue; if (on != newmap->nodes[nnn->id].match->parent) continue; w += newmap->nodes[nnn->id].weight; } /* Is this the highest fraction? */ if (w > maxw) { maxw = w; maxn = on; } } /* See if we found any similar sub-trees. */ if (maxn == NULL) return; /* * Our magic breakpoint is 50%. * If the matched sub-tree has a greater than 50% match by * weight, then set us as a match! 
*/ if (maxw / tw >= 0.5) { newmap->nodes[n->id].match = maxn; oldmap->nodes[maxn->id].match = n; } } struct lowdown_node * lowdown_diff(const struct lowdown_node *nold, const struct lowdown_node *nnew, size_t *maxn) { struct xmap xoldmap, xnewmap; struct xnode *xnew, *xold; struct pnodeq pq; struct pnode *p; const struct lowdown_node *n, *nn; struct lowdown_node *comp = NULL; size_t i; struct merger parms; memset(&xoldmap, 0, sizeof(struct xmap)); memset(&xnewmap, 0, sizeof(struct xmap)); TAILQ_INIT(&pq); /* * First, assign signatures and weights. * See "Phase 2", sec 5.2. */ if (assign_sigs(NULL, &xoldmap, nold, 0) < 0.0) goto out; if (assign_sigs(NULL, &xnewmap, nnew, 0) < 0.0) goto out; /* Prime the priority queue with the root. */ if (!pqueue(nnew, &xnewmap, &pq)) goto out; /* * Match-make while we have nodes in the priority queue. * This is guaranteed to be finite. * See "Phase 3", sec 5.2. */ while ((p = TAILQ_FIRST(&pq)) != NULL) { TAILQ_REMOVE(&pq, p, entries); n = p->node; free(p); xnew = &xnewmap.nodes[n->id]; assert(xnew->match == NULL); assert(xnew->optmatch == NULL); assert(xnew->opt == 0); /* * Look for candidates: if we have a matching signature, * test for optimality. * Highest optimality gets to be matched. * See "Phase 3", sec. 5.2. */ for (i = 0; i < xoldmap.maxid + 1; i++) { xold = &xoldmap.nodes[i]; if (xold->node == NULL) continue; if (xold->match != NULL) continue; if (strcmp(xnew->sig, xold->sig)) continue; assert(xold->match == NULL); candidate(xnew, &xnewmap, xold, &xoldmap); } /* * No match: enqueue children ("Phase 3" cont.). * Ignore opaque nodes. */ if (xnew->optmatch == NULL) { if (is_opaque(n)) continue; TAILQ_FOREACH(nn, &n->children, entries) if (!pqueue(nn, &xnewmap, &pq)) goto out; continue; } /* * Match found and is optimal. * Now use the bottom-up and top-down (doesn't matter * which order) algorithms. * See "Phase 3", sec. 5.2. 
*/ assert(xnew->match == NULL); assert(xoldmap.nodes[xnew->optmatch->id].match == NULL); match_down(xnew, &xnewmap, &xoldmap.nodes[xnew->optmatch->id], &xoldmap); match_up(xnew, &xnewmap, &xoldmap.nodes[xnew->optmatch->id], &xoldmap); } /* * If our trees are *totally* different, we may end up in the * situation where our root nodes are never matched. This will * violate an invariant in node_merge() where the entry nodes * are assumed to be matched. */ if (xnewmap.nodes[nnew->id].match == NULL) { assert(nnew->type == LOWDOWN_ROOT); assert(nold->type == LOWDOWN_ROOT); xnew = &xnewmap.nodes[nnew->id]; xold = &xoldmap.nodes[nold->id]; assert(xold->match == NULL); xnew->match = xold->node; xold->match = xnew->node; } /* * Following the above, make sure that our LOWDOWN_DOC_HEADER * nodes are also matched, because they are fixed in the tree. */ n = TAILQ_FIRST(&nnew->children); nn = TAILQ_FIRST(&nold->children); if (n != NULL && nn != NULL && n->type == LOWDOWN_DOC_HEADER && nn->type == LOWDOWN_DOC_HEADER) { xnew = &xnewmap.nodes[n->id]; xold = &xoldmap.nodes[nn->id]; if (xnew->match == NULL) { xnew->match = xold->node; xold->match = xnew->node; } } /* * All nodes have been processed. * Now we need to optimise, so run a "Phase 4", sec. 5.2. * Our optimisation is nothing like the paper's. */ node_optimise_topdown(nnew, &xnewmap, &xoldmap); node_optimise_bottomup(nnew, &xnewmap, &xoldmap); /* * The tree is optimal. * Now we need to compute the delta and merge the trees. * See "Phase 5", sec. 5.2. */ memset(&parms, 0, sizeof(struct merger)); parms.xoldmap = &xoldmap; parms.xnewmap = &xnewmap; comp = node_merge(nold, nnew, &parms); *maxn = xnewmap.maxid > xoldmap.maxid ? xnewmap.maxid + 1 : xoldmap.maxid + 1; out: assert(comp != NULL); while ((p = TAILQ_FIRST(&pq)) != NULL) { TAILQ_REMOVE(&pq, p, entries); free(p); } free(xoldmap.nodes); free(xnewmap.nodes); return comp; }