first commit
This commit is contained in:
22
server/internal/sanitize/.gitignore
vendored
Normal file
22
server/internal/sanitize/.gitignore
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects)
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
|
||||
# Folders
|
||||
_obj
|
||||
_test
|
||||
|
||||
# Architecture specific extensions/prefixes
|
||||
*.[568vq]
|
||||
[568vq].out
|
||||
|
||||
*.cgo1.go
|
||||
*.cgo2.c
|
||||
_cgo_defun.c
|
||||
_cgo_gotypes.go
|
||||
_cgo_export.*
|
||||
|
||||
_testmain.go
|
||||
|
||||
*.exe
|
||||
27
server/internal/sanitize/LICENSE
Normal file
27
server/internal/sanitize/LICENSE
Normal file
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2017 Mechanism Design. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
62
server/internal/sanitize/README.md
Normal file
62
server/internal/sanitize/README.md
Normal file
@@ -0,0 +1,62 @@
|
||||
sanitize [](https://godoc.org/github.com/kennygrant/sanitize) [](https://goreportcard.com/report/github.com/kennygrant/sanitize) [](https://circleci.com/gh/kennygrant/sanitize)
|
||||
========
|
||||
|
||||
Package sanitize provides functions to sanitize html and paths with go (golang).
|
||||
|
||||
FUNCTIONS
|
||||
|
||||
|
||||
```go
|
||||
sanitize.Accents(s string) string
|
||||
```
|
||||
|
||||
Accents replaces a set of accented characters with ascii equivalents.
|
||||
|
||||
```go
|
||||
sanitize.BaseName(s string) string
|
||||
```
|
||||
|
||||
BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -. Unlike Name no attempt is made to normalise text as a path.
|
||||
|
||||
```go
|
||||
sanitize.HTML(s string) string
|
||||
```
|
||||
|
||||
HTML strips html tags with a very simple parser, replace common entities, and escape < and > in the result. The result is intended to be used as plain text.
|
||||
|
||||
```go
|
||||
sanitize.HTMLAllowing(s string, args...[]string) (string, error)
|
||||
```
|
||||
|
||||
HTMLAllowing parses html and allow certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing default sets are used.
|
||||
|
||||
```go
|
||||
sanitize.Name(s string) string
|
||||
```
|
||||
|
||||
Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
|
||||
|
||||
```go
|
||||
sanitize.Path(s string) string
|
||||
```
|
||||
|
||||
Path makes a string safe to use as an url path.
|
||||
|
||||
|
||||
Changes
|
||||
-------
|
||||
|
||||
Version 1.2
|
||||
|
||||
Adjusted HTML function to avoid linter warning
|
||||
Added more tests from https://githubengineering.com/githubs-post-csp-journey/
|
||||
Changed name of license file
|
||||
Added badges and change log to readme
|
||||
|
||||
Version 1.1
|
||||
Fixed typo in comments.
|
||||
Merge pull request from Povilas Balzaravicius Pawka
|
||||
- replace br tags with newline even when they contain a space
|
||||
|
||||
Version 1.0
|
||||
First release
|
||||
388
server/internal/sanitize/sanitize.go
Normal file
388
server/internal/sanitize/sanitize.go
Normal file
@@ -0,0 +1,388 @@
|
||||
// Package sanitize provides functions for sanitizing text.
|
||||
package sanitize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"html"
|
||||
"html/template"
|
||||
"io"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
parser "golang.org/x/net/html"
|
||||
)
|
||||
|
||||
var (
	// ignoreTags lists tags whose entire content (text and nested markup)
	// is suppressed by HTMLAllowing; the tag itself is never emitted.
	ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}

	// defaultTags is the set of tags HTMLAllowing permits when the caller
	// does not pass an explicit allowed-tags list.
	defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}

	// defaultAttributes is the set of attributes HTMLAllowing permits when
	// the caller does not pass an explicit allowed-attributes list.
	defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)
|
||||
|
||||
// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {

	// args[0] overrides the default allowed tag list, args[1] the default
	// allowed attribute list.
	allowedTags := defaultTags
	if len(args) > 0 {
		allowedTags = args[0]
	}
	allowedAttributes := defaultAttributes
	if len(args) > 1 {
		allowedAttributes = args[1]
	}

	// Parse the html
	tokenizer := parser.NewTokenizer(strings.NewReader(s))

	// buffer accumulates the sanitized output; ignore holds the name of the
	// blocklisted tag we are currently inside ("" when not ignoring).
	buffer := bytes.NewBufferString("")
	ignore := ""

	for {
		tokenType := tokenizer.Next()
		token := tokenizer.Token()

		switch tokenType {

		case parser.ErrorToken:
			// io.EOF is the normal end of input; anything else is a real
			// tokenizer error and aborts the whole sanitization.
			err := tokenizer.Err()
			if err == io.EOF {
				return buffer.String(), nil
			}
			return "", err

		case parser.StartTagToken:

			// Emit allowed tags with their attributes scrubbed; start
			// ignoring content when a tag from ignoreTags opens. Other
			// tags are dropped but their text content still flows through.
			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
				buffer.WriteString(token.String())
			} else if includes(ignoreTags, token.Data) {
				ignore = token.Data
			}

		case parser.SelfClosingTagToken:

			// As for a start tag, except that a self-closing ignored tag
			// ends the ignore span immediately (nothing to wait for).
			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
				buffer.WriteString(token.String())
			} else if token.Data == ignore {
				ignore = ""
			}

		case parser.EndTagToken:
			// Allowed end tags are written without attributes; the closing
			// tag matching the current ignore span ends that span.
			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = []parser.Attribute{}
				buffer.WriteString(token.String())
			} else if token.Data == ignore {
				ignore = ""
			}

		case parser.TextToken:
			// We allow text content through, unless ignoring this entire tag and its contents (including other tags)
			if ignore == "" {
				buffer.WriteString(token.String())
			}
		case parser.CommentToken:
			// We ignore comments by default
		case parser.DoctypeToken:
			// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
		default:
			// We ignore unknown token types by default

		}

	}

}
|
||||
|
||||
// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {

	// Shortcut strings with no tags in them
	if !strings.ContainsAny(s, "<>") {
		output = s
	} else {

		// First remove line breaks etc as these have no meaning outside html tags (except pre)
		// this means pre sections will lose formatting... but will result in less unintentional paras.
		s = strings.Replace(s, "\n", "", -1)

		// Then replace line breaks with newlines, to preserve that formatting
		s = strings.Replace(s, "</p>", "\n", -1)
		s = strings.Replace(s, "<br>", "\n", -1)
		s = strings.Replace(s, "</br>", "\n", -1)
		s = strings.Replace(s, "<br/>", "\n", -1)
		s = strings.Replace(s, "<br />", "\n", -1)

		// Walk through the string removing all tags: anything between an
		// unescaped '<' and the next '>' is discarded.
		b := bytes.NewBufferString("")
		inTag := false
		for _, r := range s {
			switch r {
			case '<':
				inTag = true
			case '>':
				inTag = false
			default:
				if !inTag {
					b.WriteRune(r)
				}
			}
		}
		output = b.String()
	}

	// Remove a few common harmless entities, to arrive at something more like plain text
	// NOTE(review): these literals were garbled by entity decoding in the
	// vendored copy (e.g. `"""`); restored to the upstream entity strings.
	output = strings.Replace(output, "&#8216;", "'", -1)
	output = strings.Replace(output, "&#8217;", "'", -1)
	output = strings.Replace(output, "&#8220;", "\"", -1)
	output = strings.Replace(output, "&#8221;", "\"", -1)
	output = strings.Replace(output, "&nbsp;", " ", -1)
	output = strings.Replace(output, "&quot;", "\"", -1)
	output = strings.Replace(output, "&apos;", "'", -1)

	// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
	output = html.UnescapeString(output)

	// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
	output = template.HTMLEscapeString(output)

	// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
	output = strings.Replace(output, "&#34;", "\"", -1)
	output = strings.Replace(output, "&#39;", "'", -1)
	output = strings.Replace(output, "&amp; ", "& ", -1)     // NB space after
	output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after

	return output
}
|
||||
|
||||
// We are very restrictive as this is intended for ascii url slugs
|
||||
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
|
||||
|
||||
// Path makes a string safe to use as a URL path,
|
||||
// removing accents and replacing separators with -.
|
||||
// The path may still start at / and is not intended
|
||||
// for use as a file system path without prefix.
|
||||
func Path(s string) string {
|
||||
// Start with lowercase string
|
||||
filePath := strings.ToLower(s)
|
||||
filePath = strings.Replace(filePath, "..", "", -1)
|
||||
filePath = path.Clean(filePath)
|
||||
|
||||
// Remove illegal characters for paths, flattening accents
|
||||
// and replacing some common separators with -
|
||||
filePath = cleanString(filePath, illegalPath)
|
||||
|
||||
// NB this may be of length 0, caller must check
|
||||
return filePath
|
||||
}
|
||||
|
||||
// illegalName matches characters outside [:alnum:], - and . which are
// considered illegal in file names.
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)

// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
func Name(s string) string {
	// Reduce to the cleaned path basename.
	// NOTE(review): unlike Path, no ToLower is applied here; the previous
	// "Start with lowercase string" comment was copied from Path in error.
	fileName := s
	fileName = path.Clean(path.Base(fileName))

	// Remove illegal characters for names, replacing some common separators with -
	fileName = cleanString(fileName, illegalName)

	// NB this may be of length 0, caller must check
	return fileName
}
|
||||
|
||||
// Replace these separators with -
|
||||
var baseNameSeparators = regexp.MustCompile(`[./]`)
|
||||
|
||||
// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
|
||||
// No attempt is made to normalise a path or normalise case.
|
||||
func BaseName(s string) string {
|
||||
|
||||
// Replace certain joining characters with a dash
|
||||
baseName := baseNameSeparators.ReplaceAllString(s, "-")
|
||||
|
||||
// Remove illegal characters for names, replacing some common separators with -
|
||||
baseName = cleanString(baseName, illegalName)
|
||||
|
||||
// NB this may be of length 0, caller must check
|
||||
return baseName
|
||||
}
|
||||
|
||||
// A very limited list of transliterations to catch common european names translated to urls.
// This set could be expanded with at least caps and many more characters.
// NOTE(review): some pairs are asymmetric ('Ä'->"A" but 'ä'->"ae",
// 'Õ'->"O" but 'Ö'->"OE") - confirm whether that is intentional before
// relying on case-consistent output.
var transliterations = map[rune]string{
	// Uppercase letters.
	'À': "A",
	'Á': "A",
	'Â': "A",
	'Ã': "A",
	'Ä': "A",
	'Å': "AA",
	'Æ': "AE",
	'Ç': "C",
	'È': "E",
	'É': "E",
	'Ê': "E",
	'Ë': "E",
	'Ì': "I",
	'Í': "I",
	'Î': "I",
	'Ï': "I",
	'Ð': "D",
	'Ł': "L",
	'Ñ': "N",
	'Ò': "O",
	'Ó': "O",
	'Ô': "O",
	'Õ': "O",
	'Ö': "OE",
	'Ø': "OE",
	'Œ': "OE",
	'Ù': "U",
	'Ú': "U",
	'Ü': "UE",
	'Û': "U",
	'Ý': "Y",
	'Þ': "TH",
	'ẞ': "SS",
	// Lowercase letters.
	'à': "a",
	'á': "a",
	'â': "a",
	'ã': "a",
	'ä': "ae",
	'å': "aa",
	'æ': "ae",
	'ç': "c",
	'è': "e",
	'é': "e",
	'ê': "e",
	'ë': "e",
	'ì': "i",
	'í': "i",
	'î': "i",
	'ï': "i",
	'ð': "d",
	'ł': "l",
	'ñ': "n",
	'ń': "n",
	'ò': "o",
	'ó': "o",
	'ô': "o",
	'õ': "o",
	'ō': "o",
	'ö': "oe",
	'ø': "oe",
	'œ': "oe",
	'ś': "s",
	'ù': "u",
	'ú': "u",
	'û': "u",
	'ū': "u",
	'ü': "ue",
	'ý': "y",
	'ÿ': "y",
	'ż': "z",
	'þ': "th",
	'ß': "ss",
}
|
||||
|
||||
// Accents replaces a set of accented characters with ascii equivalents.
|
||||
func Accents(s string) string {
|
||||
// Replace some common accent characters
|
||||
b := bytes.NewBufferString("")
|
||||
for _, c := range s {
|
||||
// Check transliterations first
|
||||
if val, ok := transliterations[c]; ok {
|
||||
b.WriteString(val)
|
||||
} else {
|
||||
b.WriteRune(c)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
var (
	// If the attribute contains data: or javascript: anywhere, ignore it
	// we don't allow this in attributes as it is so frequently used for xss
	// NB we allow spaces in the value, and lowercase.
	// The \s* between letters also catches whitespace-obfuscated forms
	// such as "j a v a s c r i p t :".
	illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)

	// We are far more restrictive with href attributes.
	// Matches values starting with / or # (not followed by another / or \,
	// blocking protocol-relative //host tricks), or containing mailto:,
	// http:// or https://.
	// NOTE(review): \A anchors only the first alternative; the scheme
	// alternatives match anywhere in the value - confirm intended.
	legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
)
|
||||
|
||||
// cleanAttributes returns an array of attributes after removing malicious ones.
func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
	if len(a) == 0 {
		return a
	}

	var cleaned []parser.Attribute
	for _, attr := range a {
		// Attributes whose key is not in the allowed list are dropped entirely.
		if includes(allowed, attr.Key) {

			// Compare against the lowercased value so mixed-case payloads
			// like "JavaScript:" are still caught; the emitted value keeps
			// its original case.
			val := strings.ToLower(attr.Val)

			// Check for illegal attribute values
			if illegalAttr.FindString(val) != "" {
				attr.Val = ""
			}

			// Check for legal href values - / mailto:// http:// or https://
			if attr.Key == "href" {
				if legalHrefAttr.FindString(val) == "" {
					attr.Val = ""
				}
			}

			// If we still have an attribute, append it to the array
			// NOTE(review): this also drops attributes whose value was
			// empty to begin with (e.g. boolean attributes) - confirm.
			if attr.Val != "" {
				cleaned = append(cleaned, attr)
			}
		}
	}
	return cleaned
}
|
||||
|
||||
// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
var (
	// separators is the set of characters cleanString turns into dashes.
	separators = regexp.MustCompile(`[!&_="#|+?:]`)

	// dashes matches runs of one or more dashes, used by cleanString to
	// collapse them into a single dash.
	dashes = regexp.MustCompile(`[\-]+`)
)
|
||||
|
||||
// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {

	// Remove any trailing space to avoid ending on -
	s = strings.Trim(s, " ")

	// Flatten accents first so that if we remove non-ascii we still get a legible name
	s = Accents(s)

	// Replace certain joining characters with a dash
	s = separators.ReplaceAllString(s, "-")

	// Remove all other unrecognised characters - NB we do allow any printable characters
	// NOTE(review): the removal below is disabled, so the r parameter is
	// currently unused and characters matched by illegalName/illegalPath
	// are NOT stripped - confirm this is intentional before callers rely
	// on Name/Path/BaseName removing them.
	//s = r.ReplaceAllString(s, "")

	// Remove any multiple dashes caused by replacements above
	s = dashes.ReplaceAllString(s, "-")

	return s
}
|
||||
|
||||
// includes checks for inclusion of a string in a []string.
func includes(a []string, s string) bool {
	// Linear scan; the lists involved (tags, attributes) are tiny.
	for i := range a {
		if a[i] == s {
			return true
		}
	}
	return false
}
|
||||
Reference in New Issue
Block a user