autolink.c (6721B)
/* $Id$ */
/*
 * Copyright (c) 2008, Natacha Porté
 * Copyright (c) 2011, Vicent Martí
 * Copyright (c) 2014, Xavier Mendez, Devin Torres and the Hoedown authors
 * Copyright (c) 2016--2017, 2021 Kristaps Dzonsons
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include "config.h"

#if HAVE_SYS_QUEUE
# include <sys/queue.h>
#endif

#include <ctype.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h> /* strncasecmp(3) */

#include "lowdown.h"
#include "extern.h"

#define VALID_URIS_SZ	6

/*
 * List of URI prefixes that are considered "valid".
 */
static const char *const valid_uris[VALID_URIS_SZ] = {
	"http://",
	"https://",
	"/",
	"#",
	"ftp://",
	"mailto:"
};

/*
 * Verify that a URL has a safe protocol.
 * The first "size" bytes of "data" must begin (case-insensitively) with
 * one of the valid_uris prefixes, and the byte directly following the
 * prefix must be alphanumeric.
 * Returns non-zero if so, zero otherwise.
 */
static int
halink_is_safe(const char *data, size_t size)
{
	size_t	 i, len;

	for (i = 0; i < VALID_URIS_SZ; ++i) {
		len = strlen(valid_uris[i]);

		/* "size > len" guarantees that data[len] is in bounds. */

		if (size > len &&
		    strncasecmp(data, valid_uris[i], len) == 0 &&
		    isalnum((unsigned char)data[len]))
			return 1;
	}

	return 0;
}

/*
 * Find the end of a hyperlink.
 * Given a candidate link occupying data[0 .. link_end), trim it by:
 *
 *  1. cutting it at the first '<', if any;
 *  2. repeatedly dropping trailing "?!.,:" characters and trailing
 *     HTML entities of the form "&letters;" (a ';' that does not
 *     close such an entity is dropped by itself);
 *  3. dropping ONE trailing closing quote/bracket if the number of
 *     matching opening and closing characters in the candidate differ.
 *
 * The "max_rewind" and "size" arguments are accepted but not used.
 * Returns the position of the end, which is zero if nothing remains.
 */
static size_t
autolink_delim(char *data, size_t link_end, size_t max_rewind, size_t size)
{
	char	 cclose, copen = 0;
	size_t	 closing, opening, i, new_end;

	(void)max_rewind;
	(void)size;

	for (i = 0; i < link_end; ++i)
		if (data[i] == '<') {
			link_end = i;
			break;
		}

	while (link_end > 0) {
		/*
		 * Note that strchr(3) also matches the terminating NUL
		 * of the set, so a trailing NUL byte is trimmed as
		 * well.
		 */

		if (strchr("?!.,:", data[link_end - 1]) != NULL) {
			link_end--;
			continue;
		}

		if (data[link_end - 1] != ';')
			break;

		/*
		 * A ';' at offset zero cannot terminate an entity.
		 * Drop it here so that "link_end - 2" below cannot
		 * wrap around and index far outside of the buffer.
		 */

		if (link_end < 2) {
			link_end--;
			continue;
		}

		/*
		 * Walk backward over the letters preceding the ';'.
		 * If at least one letter was skipped and we stopped on
		 * an '&', the whole "&name;" entity is trimmed;
		 * otherwise only the ';' is.
		 */

		new_end = link_end - 2;
		while (new_end > 0 &&
		    isalpha((unsigned char)data[new_end]))
			new_end--;

		if (new_end < link_end - 2 && data[new_end] == '&')
			link_end = new_end;
		else
			link_end--;
	}

	if (link_end == 0)
		return 0;

	cclose = data[link_end - 1];

	switch (cclose) {
	case '"':
		copen = '"';
		break;
	case '\'':
		copen = '\'';
		break;
	case ')':
		copen = '(';
		break;
	case ']':
		copen = '[';
		break;
	case '}':
		copen = '{';
		break;
	default:
		break;
	}

	if (copen != 0) {
		closing = opening = 0;

		/*
		 * Count the opening and closing characters within the
		 * candidate.  If they are balanced, the final character
		 * is taken to be part of the URL; if not, it is assumed
		 * to close something outside of the URL and is removed.
		 * Only a single character is ever removed.
		 *
		 * For quotes, the opening and closing characters are
		 * the same, so only "opening" is ever incremented and a
		 * trailing quote is always removed.
		 *
		 * Examples (candidate => result):
		 *
		 *  www.pokemon.com/Pikachu_(Electric)
		 *   => www.pokemon.com/Pikachu_(Electric)
		 *
		 *  www.pokemon.com/Pikachu_(Electric))
		 *   => www.pokemon.com/Pikachu_(Electric)
		 */

		for (i = 0; i < link_end; i++) {
			if (data[i] == copen)
				opening++;
			else if (data[i] == cclose)
				closing++;
		}

		if (closing != opening)
			link_end--;
	}

	return link_end;
}

/*
 * To make sure that a domain is well-formed.
 * The first byte must be alphanumeric.  Following bytes are scanned
 * while they are alphanumeric, '-', or one of the separators '.' and
 * ':' (strchr(3) makes a NUL byte count as a separator, too).  The
 * final byte of the buffer, data[size - 1], is never examined.
 * Returns zero on failure (empty input, bad first byte, or no
 * separator seen), else the offset at which scanning stopped.
 * XXX: this function needs to be replaced.
 */
static size_t
check_domain(char *data, size_t size)
{
	size_t	 i, np = 0;

	/* Reject empty input: avoids reading data[0] and "size - 1". */

	if (size == 0 || !isalnum((unsigned char)data[0]))
		return 0;

	for (i = 1; i + 1 < size; ++i) {
		if (strchr(".:", data[i]) != NULL)
			np++;
		else if (!isalnum((unsigned char)data[i]) &&
		    data[i] != '-')
			break;
	}

	/* A valid domain needs to have at least a dot. */

	return np ? i : 0;
}

/*
 * Search for the next www link in data.
 * "data" points at the candidate and has "size" readable bytes; if
 * "max_rewind" is non-zero, data[-1] must also be readable and must be
 * punctuation or white-space for the candidate to be considered.
 * The candidate must start with "www." and pass check_domain(); it
 * then extends to the next white-space and is trimmed by
 * autolink_delim().
 * On success, the link bytes are passed to hbuf_put() on "link" and
 * "rewind_p" is set to zero.
 * Returns zero if there is no link, -1 if hbuf_put() fails (returns
 * zero), else the number of bytes of "data" making up the link.
 */
ssize_t
halink_www(size_t *rewind_p, struct lowdown_buf *link,
	char *data, size_t max_rewind, size_t size)
{
	size_t	 link_end;

	if (max_rewind > 0 &&
	    !ispunct((unsigned char)data[-1]) &&
	    !isspace((unsigned char)data[-1]))
		return 0;

	if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
		return 0;

	link_end = check_domain(data, size);

	if (link_end == 0)
		return 0;

	while (link_end < size &&
	    !isspace((unsigned char)data[link_end]))
		link_end++;

	link_end = autolink_delim(data, link_end, max_rewind, size);

	if (link_end == 0)
		return 0;

	if (!hbuf_put(link, data, link_end))
		return -1;

	*rewind_p = 0;
	return (ssize_t)link_end;
}

/*
 * Search for the next email in data.
 * Scans backward from data[-1] over at most "max_rewind" bytes that
 * are alphanumeric or one of ".+-_" (the local part), then forward
 * from data[0] over at most "size" bytes that are alphanumeric, '@',
 * '.', '-', or '_'.  A '.' in the very last byte of the buffer ends
 * the forward scan.
 * NOTE(review): callers presumably invoke this with "data" pointing at
 * the '@' -- confirm.
 * The candidate is accepted only if at least one byte was rewound, the
 * forward part is at least two bytes, contains exactly one '@' and at
 * least one '.', and ends with a letter.
 * On success, the link (including the rewound bytes) is passed to
 * hbuf_put() on "link" and "rewind_p" is set to the number of rewound
 * bytes.
 * Returns zero if there is no link, -1 if hbuf_put() fails (returns
 * zero), else the number of bytes of the link at and after "data" (so
 * not counting the rewound bytes).
 */
ssize_t
halink_email(size_t *rewind_p, struct lowdown_buf *link,
	char *data, size_t max_rewind, size_t size)
{
	size_t	 link_end, rewind;
	int	 nb = 0, np = 0;
	char	 c;

	for (rewind = 0; rewind < max_rewind; ++rewind) {
		c = data[-1 - rewind];
		if (isalnum((unsigned char)c))
			continue;
		if (strchr(".+-_", c) != NULL)
			continue;
		break;
	}

	if (rewind == 0)
		return 0;

	for (link_end = 0; link_end < size; ++link_end) {
		c = data[link_end];

		/*
		 * The cast matters: passing a negative plain char to
		 * the ctype functions is undefined behaviour.
		 */

		if (isalnum((unsigned char)c))
			continue;

		if (c == '@')
			nb++;
		else if (c == '.' && link_end < size - 1)
			np++;
		else if (c != '-' && c != '_')
			break;
	}

	if (link_end < 2 || nb != 1 || np == 0 ||
	    !isalpha((unsigned char)data[link_end - 1]))
		return 0;

	link_end = autolink_delim(data, link_end, max_rewind, size);

	if (link_end == 0)
		return 0;

	if (!hbuf_put(link, data - rewind, link_end + rewind))
		return -1;

	*rewind_p = rewind;
	return (ssize_t)link_end;
}

/*
 * Search for the next URL in data.
 * "data" has "size" readable bytes and must have "//" at data[1] and
 * data[2].
 * NOTE(review): data[0] is presumably the ':' of "://" -- confirm
 * against the caller.
 * Scans backward from data[-1] over at most "max_rewind" letters (the
 * scheme), requires the rewound text to pass halink_is_safe(), and
 * requires the bytes following the three-byte "://" to pass
 * check_domain().  The link then extends to the next white-space and
 * is trimmed by autolink_delim().
 * On success, the link (including the rewound scheme) is passed to
 * hbuf_put() on "link" and "rewind_p" is set to the number of rewound
 * bytes.
 * Returns zero if there is no link, -1 if hbuf_put() fails (returns
 * zero), else the number of bytes of the link at and after "data" (so
 * not counting the rewound bytes).
 */
ssize_t
halink_url(size_t *rewind_p, struct lowdown_buf *link,
	char *data, size_t max_rewind, size_t size)
{
	size_t	 link_end, rewind = 0, domain_len;

	if (size < 4 || data[1] != '/' || data[2] != '/')
		return 0;

	while (rewind < max_rewind &&
	    isalpha((unsigned char)data[-1 - rewind]))
		rewind++;

	if (!halink_is_safe(data - rewind, size + rewind))
		return 0;

	link_end = strlen("://");

	domain_len = check_domain(data + link_end, size - link_end);

	if (domain_len == 0)
		return 0;

	link_end += domain_len;

	while (link_end < size &&
	    !isspace((unsigned char)data[link_end]))
		link_end++;

	link_end = autolink_delim(data, link_end, max_rewind, size);

	if (link_end == 0)
		return 0;

	if (!hbuf_put(link, data - rewind, link_end + rewind))
		return -1;

	*rewind_p = rewind;
	return (ssize_t)link_end;
}