html_escape.c (7771B)
/* $Id$ */
/*
* Copyright (c) 2008, Natacha Porté
* Copyright (c) 2011, Vicent Martí
* Copyright (c) 2014, Xavier Mendez, Devin Torres and the Hoedown authors
* Copyright (c) 2016--2017, 2020 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include "config.h"
#if HAVE_SYS_QUEUE
# include <sys/queue.h>
#endif
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "lowdown.h"
#include "extern.h"
/*
* The following characters will not be escaped:
*
* -_.+!*'(),%#@?=;:/,+&$~ alphanum
*
* Note that this character set is the addition of:
*
* - The characters which are safe to be in an URL
* - The characters which are *not* safe to be in an URL because they
* are RESERVED characters.
*
* We assume (lazily) that any RESERVED char that appears inside an URL
* is actually meant to have its native function (i.e. as an URL
* component/separator) and hence needs no escaping.
*
* There are two exceptions: the characters & (amp) and ' (single quote)
* do not appear in the table. They are meant to appear in the URL as
* components, yet they require special HTML-entity escaping to generate
* valid HTML markup.
*
* All other characters will be escaped to %XX.
*
* The table is indexed by byte value (one row per 16 values).
* A non-zero entry means hesc_href() copies the byte through unchanged;
* a zero entry means hesc_href() must escape it.
*/
static const int href_tbl[UINT8_MAX + 1] = {
/* 0x00-0x1F: control characters, all escaped. */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x20-0x2F: space, double quote, & and ' are zero. */
0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x30-0x3F: digits and : ; = ? pass; < and > are zero. */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
/* 0x40-0x4F: @ and A-O pass. */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x50-0x5F: P-Z and _ pass; [ \ ] ^ are zero. */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
/* 0x60-0x6F: backtick is zero; a-o pass. */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0x70-0x7F: p-z and ~ pass; { | } and DEL are zero. */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
/* 0x80-0xFF: all bytes with the high bit set are escaped. */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
* For each 8-bit character, if non-zero, the HTML entity we need to
* substitute for safe output. According to the OWASP rules:
* & --> &amp;
* < --> &lt;
* > --> &gt; optional
* " --> &quot; optional
* ' --> &#x27; optional: &apos; is not recommended
* / --> &#x2F; optional: end an HTML entity
*
* The table is indexed by byte value (one row per 16 values).
* Each non-zero entry is a code that hesc_html() uses as an index into
* esc_name[] or esc_num[]. hesc_html() compares the code against a
* threshold: codes at or below it are copied through as-is, codes
* above it are replaced.
* The double quote (0x22) is zero in this table, so it is never
* substituted through it.
*/
static const int esc_tbl[UINT8_MAX + 1] = {
/* 0x00-0x1F: no substitutions. */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x20-0x2F: & is 6, ' is 2, / is 3. */
0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 3,
/* 0x30-0x3F: < is 5, > is 4. */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 4, 0,
/* 0x40-0xFF: no substitutions. */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
/*
* Maximum value of optional entity substitute.
* Above this (>ESC_TBL_OWASP_MAX) is mandatory.
* hesc_html() uses this threshold when neither "literal" nor "secure"
* is set.
*/
#define ESC_TBL_OWASP_MAX 3
/*
* For literal contexts, maximum value of optional entity substitute.
* Above this is mandatory.
* hesc_html() uses this threshold when "literal" is set and "secure"
* is not.
* NOTE(review): this currently equals ESC_TBL_OWASP_MAX, so the
* "literal" flag does not change which characters hesc_html()
* substitutes -- confirm that this is intended.
*/
#define ESC_TBL_LITERAL_MAX 3
/*
 * Named entities (mostly), indexed by the code stored in esc_tbl.
 * The apostrophe and the slash have no recommended named form, so they
 * use numeric references here as well.
 * Every non-empty entry must be a real entity: emitting the raw
 * character would leave the output unescaped.
 */
static const char *esc_name[] = {
	"",		/* 0: no substitution */
	"",		/* 1: unused, no esc_tbl entry has this code */
	"&#39;",	/* 2: ' */
	"&#47;",	/* 3: / */
	"&gt;",		/* 4: > */
	"&lt;",		/* 5: < */
	"&amp;",	/* 6: & */
};
/*
 * Numeric entities, indexed by the code stored in esc_tbl.
 * Each non-empty entry is the decimal character reference for the
 * character noted beside it; emitting the raw character instead would
 * leave the output unescaped.
 */
static const char *esc_num[] = {
	"",		/* 0: no substitution */
	"",		/* 1: unused, no esc_tbl entry has this code */
	"&#39;",	/* 2: ' */
	"&#47;",	/* 3: / */
	"&#62;",	/* 4: > */
	"&#60;",	/* 5: < */
	"&#38;",	/* 6: & */
};
/*
 * Escape general HTML attributes.
 * This is modelled after the main Markdown parser.
 * Copies "size" bytes of "data" into "ob", replacing each double quote
 * with &quot; and each ampersand with &amp;.
 * All other bytes are copied unchanged.
 * Does nothing if "size" is zero.
 * Return zero if writing to the buffer fails, non-zero otherwise.
 */
int
hesc_attr(struct lowdown_buf *ob, const char *data, size_t size)
{
	size_t	 i, mark;
	int	 rc;

	if (size == 0)
		return 1;

	for (i = 0; i < size; i++) {
		/* Scan the run of bytes that need no escaping. */
		mark = i;
		while (i < size && data[i] != '"' && data[i] != '&')
			i++;

		/* Nothing to escape in the whole input: copy it in one go. */
		if (mark == 0 && i >= size)
			return hbuf_put(ob, data, size);

		if (i > mark &&
		    !hbuf_put(ob, data + mark, i - mark))
			return 0;

		if (i >= size)
			break;

		/* The scan above stops only at a double quote or an ampersand. */
		if (data[i] == '"')
			rc = HBUF_PUTSL(ob, "&quot;");
		else
			rc = HBUF_PUTSL(ob, "&amp;");

		if (!rc)
			return 0;
	}

	return 1;
}
/*
 * Escape (part of) a URL inside HTML.
 * Bytes with a non-zero href_tbl entry are copied unchanged.
 * The ampersand and the single quote are written as HTML entities.
 * Every other byte is written as %XX with upper-case hex digits.
 * Does nothing if "size" is zero.
 * Return zero on failure (memory), non-zero otherwise.
 */
int
hesc_href(struct lowdown_buf *ob, const char *data, size_t size)
{
	static const char hex_chars[] = "0123456789ABCDEF";
	size_t		 i, mark;
	char		 hex_str[3];
	unsigned char	 ch;
	int		 rc;

	if (size == 0)
		return 1;

	hex_str[0] = '%';

	for (i = 0; i < size; i++) {
		/* Scan the run of bytes that pass through unchanged. */
		mark = i;
		while (i < size && href_tbl[(unsigned char)data[i]])
			i++;

		/*
		 * Optimization for cases where there's nothing to
		 * escape.
		 */
		if (mark == 0 && i >= size)
			return hbuf_put(ob, data, size);

		if (i > mark &&
		    !hbuf_put(ob, data + mark, i - mark))
			return 0;

		/* Escaping... */
		if (i >= size)
			break;

		switch (data[i]) {
		case '&':
			/*
			 * Amp appears all the time in URLs, but needs
			 * HTML-entity escaping to be inside an href.
			 */
			rc = HBUF_PUTSL(ob, "&amp;");
			break;
		case '\'':
			/*
			 * The single quote is a valid URL character
			 * according to the standard; it needs HTML
			 * entity escaping too.
			 */
			rc = HBUF_PUTSL(ob, "&#x27;");
			break;
		default:
			/*
			 * Every other character goes with a %XX
			 * escaping.  Work on an unsigned copy so the
			 * shift never operates on a negative value.
			 */
			ch = (unsigned char)data[i];
			hex_str[1] = hex_chars[ch >> 4];
			hex_str[2] = hex_chars[ch & 0xF];
			rc = hbuf_put(ob, hex_str, 3);
			break;
		}

		if (!rc)
			return 0;
	}

	return 1;
}
/*
 * Escape HTML text from "data" (of length "size") into "ob".
 * Each byte is looked up in esc_tbl: a zero code means the byte is
 * copied as-is; a non-zero code is compared against a threshold.
 * If "secure", the threshold is zero, so every coded byte is replaced.
 * Otherwise the threshold is ESC_TBL_LITERAL_MAX if "literal" and
 * ESC_TBL_OWASP_MAX if not; codes at or below it are copied as-is.
 * If "num", replacements come from esc_num, else from esc_name.
 * Does nothing if "size" is zero.
 * Return zero on failure (memory), non-zero otherwise.
 */
int
hesc_html(struct lowdown_buf *ob, const char *data,
	size_t size, int secure, int literal, int num)
{
	const char *const	*ents = num ? esc_num : esc_name;
	size_t			 pos = 0, start;
	int			 code, threshold;

	if (size == 0)
		return 1;

	if (secure)
		threshold = 0;
	else if (literal)
		threshold = ESC_TBL_LITERAL_MAX;
	else
		threshold = ESC_TBL_OWASP_MAX;

	for (;;) {
		/* Skip over the run of bytes that carry no code. */
		start = pos;
		while (pos < size &&
		    esc_tbl[(unsigned char)data[pos]] == 0)
			pos++;

		/* Whole input is plain: emit it in a single write. */
		if (start == 0 && pos >= size)
			return hbuf_put(ob, data, size);

		if (pos > start &&
		    !hbuf_put(ob, data + start, pos - start))
			return 0;

		if (pos >= size)
			return 1;

		code = esc_tbl[(unsigned char)data[pos]];
		if (code <= threshold) {
			if (!hbuf_putc(ob, data[pos]))
				return 0;
		} else if (!hbuf_puts(ob, ents[code]))
			return 0;

		pos++;
	}
}