2017-04-13 05:52:24 +03:00
// Copyright 2017 The Gitea Authors. All rights reserved.
// Copyright 2017 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
2017-09-16 20:17:57 +03:00
package markup
2017-04-13 05:52:24 +03:00
import (
2019-12-31 04:53:28 +03:00
"bytes"
"io"
2017-04-13 05:52:24 +03:00
"regexp"
"sync"
"code.gitea.io/gitea/modules/setting"
"github.com/microcosm-cc/bluemonday"
)
// Sanitizer wraps a *bluemonday.Policy. Both fields are unexported, so code
// outside this package cannot reach or alter the policy directly.
type Sanitizer struct {
	// policy is the bluemonday policy applied by the Sanitize* helpers.
	policy *bluemonday.Policy
	// init guards the one-time policy construction done by NewSanitizer.
	init sync.Once
}
// sanitizer is the package-level Sanitizer instance shared by all helpers in this file.
var sanitizer = new(Sanitizer)
// NewSanitizer initializes sanitizer with allowed attributes based on settings.
// Multiple calls to this function will only create one instance of Sanitizer during
// entire application lifecycle.
func NewSanitizer ( ) {
sanitizer . init . Do ( func ( ) {
2019-10-15 04:31:09 +03:00
ReplaceSanitizer ( )
} )
}
2017-04-13 05:52:24 +03:00
2019-10-15 04:31:09 +03:00
// ReplaceSanitizer replaces the current sanitizer to account for changes in settings
func ReplaceSanitizer ( ) {
sanitizer . policy = bluemonday . UGCPolicy ( )
2020-07-01 00:34:03 +03:00
// For Chroma markdown plugin
2020-08-04 22:56:37 +03:00
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ^is-loading$ ` ) ) . OnElements ( "pre" )
2020-07-01 00:34:03 +03:00
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ^(chroma )?language-[\w-]+$ ` ) ) . OnElements ( "code" )
2017-04-13 05:52:24 +03:00
2019-10-15 04:31:09 +03:00
// Checkboxes
sanitizer . policy . AllowAttrs ( "type" ) . Matching ( regexp . MustCompile ( ` ^checkbox$ ` ) ) . OnElements ( "input" )
2020-12-13 04:05:50 +03:00
sanitizer . policy . AllowAttrs ( "checked" , "disabled" ) . OnElements ( "input" )
2019-10-14 01:29:10 +03:00
2019-10-15 04:31:09 +03:00
// Custom URL-Schemes
2021-04-06 00:38:31 +03:00
if len ( setting . Markdown . CustomURLSchemes ) > 0 {
sanitizer . policy . AllowURLSchemes ( setting . Markdown . CustomURLSchemes ... )
}
2019-10-15 04:31:09 +03:00
// Allow keyword markup
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ^ ` + keywordClass + ` $ ` ) ) . OnElements ( "span" )
2019-12-03 22:02:41 +03:00
2020-01-20 07:39:21 +03:00
// Allow classes for anchors
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ref-issue ` ) ) . OnElements ( "a" )
2020-03-08 22:17:03 +03:00
// Allow classes for task lists
2020-05-11 02:14:49 +03:00
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` task-list-item ` ) ) . OnElements ( "li" )
2020-03-08 22:17:03 +03:00
2020-04-24 16:22:36 +03:00
// Allow icons
2020-04-26 08:09:08 +03:00
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ^icon(\s+[\p { L}\p { N}_-]+)+$ ` ) ) . OnElements ( "i" )
// Allow unlabelled labels
sanitizer . policy . AllowNoAttrs ( ) . OnElements ( "label" )
2020-04-24 16:22:36 +03:00
2020-04-28 21:05:39 +03:00
// Allow classes for emojis
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` emoji ` ) ) . OnElements ( "img" )
2020-12-13 04:05:50 +03:00
// Allow icons, emojis, and chroma syntax on span
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` ^((icon(\s+[\p { L}\p { N}_-]+)+)|(emoji))$|^([a-z][a-z0-9] { 0,2})$ ` ) ) . OnElements ( "span" )
2020-05-03 23:17:24 +03:00
2021-03-29 23:44:28 +03:00
// Allow data tables
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` data-table ` ) ) . OnElements ( "table" )
sanitizer . policy . AllowAttrs ( "class" ) . Matching ( regexp . MustCompile ( ` line-num ` ) ) . OnElements ( "th" , "td" )
2020-02-28 23:05:12 +03:00
// Allow generally safe attributes
generalSafeAttrs := [ ] string { "abbr" , "accept" , "accept-charset" ,
"accesskey" , "action" , "align" , "alt" ,
"aria-describedby" , "aria-hidden" , "aria-label" , "aria-labelledby" ,
"axis" , "border" , "cellpadding" , "cellspacing" , "char" ,
"charoff" , "charset" , "checked" ,
"clear" , "cols" , "colspan" , "color" ,
"compact" , "coords" , "datetime" , "dir" ,
"disabled" , "enctype" , "for" , "frame" ,
"headers" , "height" , "hreflang" ,
"hspace" , "ismap" , "label" , "lang" ,
"maxlength" , "media" , "method" ,
"multiple" , "name" , "nohref" , "noshade" ,
"nowrap" , "open" , "prompt" , "readonly" , "rel" , "rev" ,
"rows" , "rowspan" , "rules" , "scope" ,
"selected" , "shape" , "size" , "span" ,
"start" , "summary" , "tabindex" , "target" ,
"title" , "type" , "usemap" , "valign" , "value" ,
"vspace" , "width" , "itemprop" ,
}
generalSafeElements := [ ] string {
"h1" , "h2" , "h3" , "h4" , "h5" , "h6" , "h7" , "h8" , "br" , "b" , "i" , "strong" , "em" , "a" , "pre" , "code" , "img" , "tt" ,
"div" , "ins" , "del" , "sup" , "sub" , "p" , "ol" , "ul" , "table" , "thead" , "tbody" , "tfoot" , "blockquote" ,
"dl" , "dt" , "dd" , "kbd" , "q" , "samp" , "var" , "hr" , "ruby" , "rt" , "rp" , "li" , "tr" , "td" , "th" , "s" , "strike" , "summary" ,
"details" , "caption" , "figure" , "figcaption" ,
"abbr" , "bdo" , "cite" , "dfn" , "mark" , "small" , "span" , "time" , "wbr" ,
}
sanitizer . policy . AllowAttrs ( generalSafeAttrs ... ) . OnElements ( generalSafeElements ... )
sanitizer . policy . AllowAttrs ( "itemscope" , "itemtype" ) . OnElements ( "div" )
// FIXME: Need to handle longdesc in img but there is no easy way to do it
2019-12-07 22:49:04 +03:00
// Custom keyword markup
for _ , rule := range setting . ExternalSanitizerRules {
if rule . Regexp != nil {
sanitizer . policy . AllowAttrs ( rule . AllowAttr ) . Matching ( rule . Regexp ) . OnElements ( rule . Element )
} else {
sanitizer . policy . AllowAttrs ( rule . AllowAttr ) . OnElements ( rule . Element )
}
}
2017-04-13 05:52:24 +03:00
}
// Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist.
func Sanitize ( s string ) string {
2017-04-19 14:16:36 +03:00
NewSanitizer ( )
2017-04-13 05:52:24 +03:00
return sanitizer . policy . Sanitize ( s )
}
2019-12-31 04:53:28 +03:00
// SanitizeReader reads HTML from r, applies the package sanitizer policy and
// returns the sanitized output in a buffer.
// The policy is initialized first via NewSanitizer.
func SanitizeReader(r io.Reader) *bytes.Buffer {
	NewSanitizer()
	policy := sanitizer.policy
	return policy.SanitizeReader(r)
}
2017-04-13 05:52:24 +03:00
// SanitizeBytes takes a []byte slice that contains a HTML fragment or document and applies policy whitelist.
func SanitizeBytes ( b [ ] byte ) [ ] byte {
if len ( b ) == 0 {
// nothing to sanitize
return b
}
2017-04-19 14:16:36 +03:00
NewSanitizer ( )
2017-04-13 05:52:24 +03:00
return sanitizer . policy . SanitizeBytes ( b )
}