first commit

This commit is contained in:
Akhil Gupta
2021-05-29 15:20:50 +05:30
commit d25c30a7b2
194 changed files with 49873 additions and 0 deletions

22
server/internal/sanitize/.gitignore vendored Normal file
View File

@@ -0,0 +1,22 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe

View File

@@ -0,0 +1,27 @@
Copyright (c) 2017 Mechanism Design. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,62 @@
sanitize [![GoDoc](https://godoc.org/github.com/kennygrant/sanitize?status.svg)](https://godoc.org/github.com/kennygrant/sanitize) [![Go Report Card](https://goreportcard.com/badge/github.com/kennygrant/sanitize)](https://goreportcard.com/report/github.com/kennygrant/sanitize) [![CircleCI](https://circleci.com/gh/kennygrant/sanitize.svg?style=svg)](https://circleci.com/gh/kennygrant/sanitize)
========
Package sanitize provides functions to sanitize html and paths with go (golang).
FUNCTIONS
```go
sanitize.Accents(s string) string
```
Accents replaces a set of accented characters with ascii equivalents.
```go
sanitize.BaseName(s string) string
```
BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -. Unlike Name no attempt is made to normalise text as a path.
```go
sanitize.HTML(s string) string
```
HTML strips html tags with a very simple parser, replace common entities, and escape < and > in the result. The result is intended to be used as plain text.
```go
sanitize.HTMLAllowing(s string, args...[]string) (string, error)
```
HTMLAllowing parses html and allow certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing default sets are used.
```go
sanitize.Name(s string) string
```
Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
```go
sanitize.Path(s string) string
```
Path makes a string safe to use as an url path.
Changes
-------
Version 1.2
Adjusted HTML function to avoid linter warning
Added more tests from https://githubengineering.com/githubs-post-csp-journey/
Chnaged name of license file
Added badges and change log to readme
Version 1.1
Fixed type in comments.
Merge pull request from Povilas Balzaravicius Pawka
- replace br tags with newline even when they contain a space
Version 1.0
First release

View File

@@ -0,0 +1,388 @@
// Package sanitize provides functions for sanitizing text.
package sanitize
import (
"bytes"
"html"
"html/template"
"io"
"path"
"regexp"
"strings"
parser "golang.org/x/net/html"
)
var (
ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)
// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {
allowedTags := defaultTags
if len(args) > 0 {
allowedTags = args[0]
}
allowedAttributes := defaultAttributes
if len(args) > 1 {
allowedAttributes = args[1]
}
// Parse the html
tokenizer := parser.NewTokenizer(strings.NewReader(s))
buffer := bytes.NewBufferString("")
ignore := ""
for {
tokenType := tokenizer.Next()
token := tokenizer.Token()
switch tokenType {
case parser.ErrorToken:
err := tokenizer.Err()
if err == io.EOF {
return buffer.String(), nil
}
return "", err
case parser.StartTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = cleanAttributes(token.Attr, allowedAttributes)
buffer.WriteString(token.String())
} else if includes(ignoreTags, token.Data) {
ignore = token.Data
}
case parser.SelfClosingTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = cleanAttributes(token.Attr, allowedAttributes)
buffer.WriteString(token.String())
} else if token.Data == ignore {
ignore = ""
}
case parser.EndTagToken:
if len(ignore) == 0 && includes(allowedTags, token.Data) {
token.Attr = []parser.Attribute{}
buffer.WriteString(token.String())
} else if token.Data == ignore {
ignore = ""
}
case parser.TextToken:
// We allow text content through, unless ignoring this entire tag and its contents (including other tags)
if ignore == "" {
buffer.WriteString(token.String())
}
case parser.CommentToken:
// We ignore comments by default
case parser.DoctypeToken:
// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
default:
// We ignore unknown token types by default
}
}
}
// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {
// Shortcut strings with no tags in them
if !strings.ContainsAny(s, "<>") {
output = s
} else {
// First remove line breaks etc as these have no meaning outside html tags (except pre)
// this means pre sections will lose formatting... but will result in less unintentional paras.
s = strings.Replace(s, "\n", "", -1)
// Then replace line breaks with newlines, to preserve that formatting
s = strings.Replace(s, "</p>", "\n", -1)
s = strings.Replace(s, "<br>", "\n", -1)
s = strings.Replace(s, "</br>", "\n", -1)
s = strings.Replace(s, "<br/>", "\n", -1)
s = strings.Replace(s, "<br />", "\n", -1)
// Walk through the string removing all tags
b := bytes.NewBufferString("")
inTag := false
for _, r := range s {
switch r {
case '<':
inTag = true
case '>':
inTag = false
default:
if !inTag {
b.WriteRune(r)
}
}
}
output = b.String()
}
// Remove a few common harmless entities, to arrive at something more like plain text
output = strings.Replace(output, "&#8216;", "'", -1)
output = strings.Replace(output, "&#8217;", "'", -1)
output = strings.Replace(output, "&#8220;", "\"", -1)
output = strings.Replace(output, "&#8221;", "\"", -1)
output = strings.Replace(output, "&nbsp;", " ", -1)
output = strings.Replace(output, "&quot;", "\"", -1)
output = strings.Replace(output, "&apos;", "'", -1)
// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
output = html.UnescapeString(output)
// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
output = template.HTMLEscapeString(output)
// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
output = strings.Replace(output, "&#34;", "\"", -1)
output = strings.Replace(output, "&#39;", "'", -1)
output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after
output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
return output
}
// We are very restrictive as this is intended for ascii url slugs
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
// Path makes a string safe to use as a URL path,
// removing accents and replacing separators with -.
// The path may still start at / and is not intended
// for use as a file system path without prefix.
func Path(s string) string {
// Start with lowercase string
filePath := strings.ToLower(s)
filePath = strings.Replace(filePath, "..", "", -1)
filePath = path.Clean(filePath)
// Remove illegal characters for paths, flattening accents
// and replacing some common separators with -
filePath = cleanString(filePath, illegalPath)
// NB this may be of length 0, caller must check
return filePath
}
// Remove all other unrecognised characters apart from
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
func Name(s string) string {
// Start with lowercase string
fileName := s
fileName = path.Clean(path.Base(fileName))
// Remove illegal characters for names, replacing some common separators with -
fileName = cleanString(fileName, illegalName)
// NB this may be of length 0, caller must check
return fileName
}
// Replace these separators with -
var baseNameSeparators = regexp.MustCompile(`[./]`)
// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
// No attempt is made to normalise a path or normalise case.
func BaseName(s string) string {
// Replace certain joining characters with a dash
baseName := baseNameSeparators.ReplaceAllString(s, "-")
// Remove illegal characters for names, replacing some common separators with -
baseName = cleanString(baseName, illegalName)
// NB this may be of length 0, caller must check
return baseName
}
// A very limited list of transliterations to catch common european names translated to urls.
// This set could be expanded with at least caps and many more characters.
var transliterations = map[rune]string{
'À': "A",
'Á': "A",
'Â': "A",
'Ã': "A",
'Ä': "A",
'Å': "AA",
'Æ': "AE",
'Ç': "C",
'È': "E",
'É': "E",
'Ê': "E",
'Ë': "E",
'Ì': "I",
'Í': "I",
'Î': "I",
'Ï': "I",
'Ð': "D",
'Ł': "L",
'Ñ': "N",
'Ò': "O",
'Ó': "O",
'Ô': "O",
'Õ': "O",
'Ö': "OE",
'Ø': "OE",
'Œ': "OE",
'Ù': "U",
'Ú': "U",
'Ü': "UE",
'Û': "U",
'Ý': "Y",
'Þ': "TH",
'ẞ': "SS",
'à': "a",
'á': "a",
'â': "a",
'ã': "a",
'ä': "ae",
'å': "aa",
'æ': "ae",
'ç': "c",
'è': "e",
'é': "e",
'ê': "e",
'ë': "e",
'ì': "i",
'í': "i",
'î': "i",
'ï': "i",
'ð': "d",
'ł': "l",
'ñ': "n",
'ń': "n",
'ò': "o",
'ó': "o",
'ô': "o",
'õ': "o",
'ō': "o",
'ö': "oe",
'ø': "oe",
'œ': "oe",
'ś': "s",
'ù': "u",
'ú': "u",
'û': "u",
'ū': "u",
'ü': "ue",
'ý': "y",
'ÿ': "y",
'ż': "z",
'þ': "th",
'ß': "ss",
}
// Accents replaces a set of accented characters with ascii equivalents.
func Accents(s string) string {
// Replace some common accent characters
b := bytes.NewBufferString("")
for _, c := range s {
// Check transliterations first
if val, ok := transliterations[c]; ok {
b.WriteString(val)
} else {
b.WriteRune(c)
}
}
return b.String()
}
var (
// If the attribute contains data: or javascript: anywhere, ignore it
// we don't allow this in attributes as it is so frequently used for xss
// NB we allow spaces in the value, and lowercase.
illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
// We are far more restrictive with href attributes.
legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
)
// cleanAttributes returns an array of attributes after removing malicious ones.
func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
if len(a) == 0 {
return a
}
var cleaned []parser.Attribute
for _, attr := range a {
if includes(allowed, attr.Key) {
val := strings.ToLower(attr.Val)
// Check for illegal attribute values
if illegalAttr.FindString(val) != "" {
attr.Val = ""
}
// Check for legal href values - / mailto:// http:// or https://
if attr.Key == "href" {
if legalHrefAttr.FindString(val) == "" {
attr.Val = ""
}
}
// If we still have an attribute, append it to the array
if attr.Val != "" {
cleaned = append(cleaned, attr)
}
}
}
return cleaned
}
// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
var (
separators = regexp.MustCompile(`[!&_="#|+?:]`)
dashes = regexp.MustCompile(`[\-]+`)
)
// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {
// Remove any trailing space to avoid ending on -
s = strings.Trim(s, " ")
// Flatten accents first so that if we remove non-ascii we still get a legible name
s = Accents(s)
// Replace certain joining characters with a dash
s = separators.ReplaceAllString(s, "-")
// Remove all other unrecognised characters - NB we do allow any printable characters
//s = r.ReplaceAllString(s, "")
// Remove any multiple dashes caused by replacements above
s = dashes.ReplaceAllString(s, "-")
return s
}
// includes checks for inclusion of a string in a []string.
func includes(a []string, s string) bool {
for _, as := range a {
if as == s {
return true
}
}
return false
}