2017-09-16 20:17:57 +03:00
// Copyright 2017 The Gitea Authors. All rights reserved.
2022-11-27 21:20:29 +03:00
// SPDX-License-Identifier: MIT
2017-09-16 20:17:57 +03:00
package markup
import (
"bytes"
2024-11-26 19:46:02 +03:00
"fmt"
2021-04-20 01:25:08 +03:00
"io"
2017-09-16 20:17:57 +03:00
"regexp"
"strings"
2021-07-15 23:33:56 +03:00
"sync"
2017-09-16 20:17:57 +03:00
2019-12-31 04:53:28 +03:00
"code.gitea.io/gitea/modules/markup/common"
2017-09-16 20:17:57 +03:00
"golang.org/x/net/html"
2018-02-27 10:09:18 +03:00
"golang.org/x/net/html/atom"
2019-03-27 14:15:23 +03:00
"mvdan.cc/xurls/v2"
2017-09-16 20:17:57 +03:00
)
// Issue name styles. Each constant holds the string identifier of one style.
const (
	// IssueNameStyleNumeric is the identifier of the "numeric" style.
	IssueNameStyleNumeric = "numeric"
	// IssueNameStyleAlphanumeric is the identifier of the "alphanumeric" style.
	IssueNameStyleAlphanumeric = "alphanumeric"
	// IssueNameStyleRegexp is the identifier of the "regexp" style.
	IssueNameStyleRegexp = "regexp"
)
2024-11-16 11:41:44 +03:00
// globalVarsType bundles the compiled patterns and replacers shared by the
// HTML post-processing code, so they are built once and reused.
type globalVarsType struct {
	hashCurrentPattern      *regexp.Regexp // bare commit SHA in text
	shortLinkPattern        *regexp.Regexp // [[name|link|arg=test]] short links
	anyHashPattern          *regexp.Regexp // URL containing a full commit SHA
	comparePattern          *regexp.Regexp // compare URL (COMMIT1...COMMIT2)
	fullURLPattern          *regexp.Regexp // string starting with "scheme:"
	emailRegex              *regexp.Regexp // email address in text
	blackfridayExtRegex     *regexp.Regexp // IDs like fn:user-content-footnote
	emojiShortCodeRegex     *regexp.Regexp // emoji alias like :smile:
	issueFullPattern        *regexp.Regexp // full issue / pull request URL
	filesChangedFullPattern *regexp.Regexp // pull request "files" URL
	codePreviewPattern      *regexp.Regexp // src/commit URL with a #L line anchor

	tagCleaner *regexp.Regexp    // tag openings that postProcess rewrites before parsing
	nulCleaner *strings.Replacer // removes NUL bytes
}

// globalVars returns the shared, lazily initialised *globalVarsType. The
// initialiser runs at most once (sync.OnceValue); every call returns the same
// pointer.
var globalVars = sync.OnceValue(func() *globalVarsType {
	v := &globalVarsType{}
	// NOTE: All below regex matching do not perform any extra validation.
	// Thus a link is produced even if the linked entity does not exist.
	// While fast, this is also incorrect and leads to false positives.
	// TODO: fix invalid linking issue

	// valid chars in encoded path and parameter: [-+~_%.a-zA-Z0-9/]

	// hashCurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae
	// Although SHA1 hashes are 40 chars long, SHA256 are 64, the regex matches the hash from 7 to 64 chars in length
	// so that abbreviated hash links can be used as well. This matches git and GitHub usability.
	v.hashCurrentPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-f]{7,64})(?:\s|$|\)|\]|[.,:](\s|$))`)

	// shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax
	v.shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`)

	// anyHashPattern splits url containing SHA into parts
	v.anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~%./\w]+)?(\?[-+~%.\w&=]+)?(#[-+~%.\w]+)?`)

	// comparePattern matches "http://domain/org/repo/compare/COMMIT1...COMMIT2#hash"
	v.comparePattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{7,64})(\.\.\.?)([0-9a-f]{7,64})?(#[-+~_%.a-zA-Z0-9]+)?`)

	// fullURLPattern matches full URL like "mailto:...", "https://..." and "ssh+git://..."
	v.fullURLPattern = regexp.MustCompile(`^[a-z][-+\w]+:`)

	// emailRegex is definitely not perfect with edge cases,
	// it is still accepted by the CommonMark specification, as well as the HTML5 spec:
	//   http://spec.commonmark.org/0.28/#email-address
	//   https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
	v.emailRegex = regexp.MustCompile("(?:\\s|^|\\(|\\[)([a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9]{2,}(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)(?:\\s|$|\\)|\\]|;|,|\\?|!|\\.(\\s|$))")

	// blackfridayExtRegex is for blackfriday extensions create IDs like fn:user-content-footnote
	v.blackfridayExtRegex = regexp.MustCompile(`[^:]*:user-content-`)

	// emojiShortCodeRegex find emoji by alias like :smile:
	v.emojiShortCodeRegex = regexp.MustCompile(`:[-+\w]+:`)

	// example: https://domain/org/repo/pulls/27#hash
	v.issueFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/(?:issues|pulls)/((?:\w{1,10}-)?[1-9][0-9]*)([\?|#](\S+)?)?\b`)

	// example: https://domain/org/repo/pulls/27/files#hash
	v.filesChangedFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/pulls/((?:\w{1,10}-)?[1-9][0-9]*)/files([\?|#](\S+)?)?\b`)

	// codePreviewPattern matches "http://domain/.../{owner}/{repo}/src/commit/{commit}/{filepath}#L10-L20"
	v.codePreviewPattern = regexp.MustCompile(`https?://\S+/([^\s/]+)/([^\s/]+)/src/commit/([0-9a-f]{7,64})(/\S+)#(L\d+(-L\d+)?)`)

	// tagCleaner matches "<" followed by a slash-containing tag name or by
	// (/)html / (/)head in any letter case; group 1 is the text after "<".
	v.tagCleaner = regexp.MustCompile(`<((?:/?\w+/\w+)|(?:/[\w ]+/)|(/?[hH][tT][mM][lL]\b)|(/?[hH][eE][aA][dD]\b))`)
	v.nulCleaner = strings.NewReplacer("\000", "")
	return v
})
2019-10-14 01:29:10 +03:00
2024-02-21 13:08:08 +03:00
// IsFullURLBytes reports whether link fits valid format.
func IsFullURLBytes ( link [ ] byte ) bool {
2024-11-16 11:41:44 +03:00
return globalVars ( ) . fullURLPattern . Match ( link )
2017-09-16 20:17:57 +03:00
}
2024-02-21 13:08:08 +03:00
func IsFullURLString ( link string ) bool {
2024-11-16 11:41:44 +03:00
return globalVars ( ) . fullURLPattern . MatchString ( link )
2018-02-27 10:09:18 +03:00
}
2024-06-21 21:23:54 +03:00
// IsNonEmptyRelativePath reports whether link is a non-empty string that is
// not a full URL and does not begin with '/', '?' or '#'.
func IsNonEmptyRelativePath(link string) bool {
	if link == "" || IsFullURLString(link) {
		return false
	}
	switch link[0] {
	case '/', '?', '#':
		return false
	}
	return true
}
2019-10-15 04:31:09 +03:00
// CustomLinkURLSchemes allows for additional schemes to be detected when parsing links within text
func CustomLinkURLSchemes ( schemes [ ] string ) {
schemes = append ( schemes , "http" , "https" )
withAuth := make ( [ ] string , 0 , len ( schemes ) )
validScheme := regexp . MustCompile ( ` ^[a-z]+$ ` )
for _ , s := range schemes {
if ! validScheme . MatchString ( s ) {
continue
}
without := false
for _ , sna := range xurls . SchemesNoAuthority {
if s == sna {
without = true
break
}
}
if without {
s += ":"
} else {
s += "://"
}
withAuth = append ( withAuth , s )
}
2024-11-18 08:25:42 +03:00
common . GlobalVars ( ) . LinkRegex , _ = xurls . StrictMatchingScheme ( strings . Join ( withAuth , "|" ) )
2019-10-15 04:31:09 +03:00
}
2021-04-20 01:25:08 +03:00
// processor is one post-processing step. It is called with the render context
// and an HTML node, and may modify that node or add siblings next to it.
type processor func(ctx *RenderContext, node *html.Node)
2018-02-27 10:09:18 +03:00
2024-11-26 19:46:02 +03:00
// PostProcessDefault does the final required transformations to the passed raw HTML
2018-02-27 10:09:18 +03:00
// data, and ensures its validity. Transformations include: replacing links and
// emails with HTML links, parsing shortlinks in the format of [[Link]], like
// MediaWiki, linking issues in the format #ID, and mentions in the format
// @user, and others.
2024-11-26 19:46:02 +03:00
func PostProcessDefault ( ctx * RenderContext , input io . Reader , output io . Writer ) error {
procs := [ ] processor {
fullIssuePatternProcessor ,
comparePatternProcessor ,
codePreviewPatternProcessor ,
fullHashPatternProcessor ,
shortLinkProcessor ,
linkProcessor ,
mentionProcessor ,
issueIndexPatternProcessor ,
commitCrossReferencePatternProcessor ,
hashCurrentPatternProcessor ,
emailAddressProcessor ,
emojiProcessor ,
emojiShortCodeProcessor ,
}
return postProcess ( ctx , procs , input , output )
2018-02-27 10:09:18 +03:00
}
2024-12-04 04:39:33 +03:00
// PostProcessCommitMessage will use the same logic as PostProcess, but will disable
2024-11-26 19:46:02 +03:00
// the shortLinkProcessor.
2024-12-04 04:39:33 +03:00
func PostProcessCommitMessage ( ctx * RenderContext , content string ) ( string , error ) {
2024-11-26 19:46:02 +03:00
procs := [ ] processor {
fullIssuePatternProcessor ,
comparePatternProcessor ,
fullHashPatternProcessor ,
linkProcessor ,
mentionProcessor ,
issueIndexPatternProcessor ,
commitCrossReferencePatternProcessor ,
hashCurrentPatternProcessor ,
emailAddressProcessor ,
emojiProcessor ,
emojiShortCodeProcessor ,
}
return postProcessString ( ctx , procs , content )
2020-04-28 21:05:39 +03:00
}
var emojiProcessors = [ ] processor {
emojiShortCodeProcessor ,
emojiProcessor ,
2019-09-10 12:03:30 +03:00
}
2024-12-04 04:39:33 +03:00
// PostProcessCommitMessageSubject will use the same logic as PostProcess and
// PostProcessCommitMessage, but will disable the shortLinkProcessor and
2019-09-10 12:03:30 +03:00
// emailAddressProcessor, will add a defaultLinkProcessor if defaultLink is set,
// which changes every text node into a link to the passed default link.
2024-12-04 04:39:33 +03:00
func PostProcessCommitMessageSubject ( ctx * RenderContext , defaultLink , content string ) ( string , error ) {
2024-11-26 19:46:02 +03:00
procs := [ ] processor {
fullIssuePatternProcessor ,
comparePatternProcessor ,
fullHashPatternProcessor ,
linkProcessor ,
mentionProcessor ,
issueIndexPatternProcessor ,
commitCrossReferencePatternProcessor ,
hashCurrentPatternProcessor ,
emojiShortCodeProcessor ,
emojiProcessor ,
}
2024-11-16 11:41:44 +03:00
procs = append ( procs , func ( ctx * RenderContext , node * html . Node ) {
ch := & html . Node { Parent : node , Type : html . TextNode , Data : node . Data }
node . Type = html . ElementNode
node . Data = "a"
node . DataAtom = atom . A
node . Attr = [ ] html . Attribute { { Key : "href" , Val : defaultLink } , { Key : "class" , Val : "muted" } }
node . FirstChild , node . LastChild = ch , ch
} )
2024-11-26 19:46:02 +03:00
return postProcessString ( ctx , procs , content )
2019-09-10 12:03:30 +03:00
}
2024-12-04 04:39:33 +03:00
// PostProcessIssueTitle to process title on individual issue/pull page
func PostProcessIssueTitle ( ctx * RenderContext , title string ) ( string , error ) {
2024-11-26 19:46:02 +03:00
return postProcessString ( ctx , [ ] processor {
2024-12-04 04:39:33 +03:00
issueIndexPatternProcessor ,
commitCrossReferencePatternProcessor ,
hashCurrentPatternProcessor ,
2021-04-20 01:25:08 +03:00
emojiShortCodeProcessor ,
emojiProcessor ,
} , title )
}
2024-12-04 04:39:33 +03:00
// PostProcessDescriptionHTML will use similar logic as PostProcess, but will
2019-03-12 05:23:34 +03:00
// use a single special linkProcessor.
2024-12-04 04:39:33 +03:00
func PostProcessDescriptionHTML ( ctx * RenderContext , content string ) ( string , error ) {
2024-11-26 19:46:02 +03:00
return postProcessString ( ctx , [ ] processor {
2021-04-20 01:25:08 +03:00
descriptionLinkProcessor ,
emojiShortCodeProcessor ,
emojiProcessor ,
} , content )
2019-03-12 05:23:34 +03:00
}
2024-12-04 04:39:33 +03:00
// PostProcessEmoji for when we want to just process emoji and shortcodes
2021-07-08 14:38:13 +03:00
// in various places it isn't already run through the normal markdown processor
2024-12-04 04:39:33 +03:00
func PostProcessEmoji ( ctx * RenderContext , content string ) ( string , error ) {
2024-11-26 19:46:02 +03:00
return postProcessString ( ctx , emojiProcessors , content )
2020-04-28 21:05:39 +03:00
}
2024-12-04 04:39:33 +03:00
// postProcessString runs postProcess over content and returns the rendered
// output as a string. On error the returned string is empty.
func postProcessString(ctx *RenderContext, procs []processor, content string) (string, error) {
	var out strings.Builder
	err := postProcess(ctx, procs, strings.NewReader(content), &out)
	if err != nil {
		return "", err
	}
	return out.String(), nil
}
2021-04-20 01:25:08 +03:00
func postProcess ( ctx * RenderContext , procs [ ] processor , input io . Reader , output io . Writer ) error {
2024-12-04 04:39:33 +03:00
if ! ctx . usedByRender && ctx . RenderHelper != nil {
defer ctx . RenderHelper . CleanUp ( )
}
2021-04-20 01:25:08 +03:00
// FIXME: don't read all content to memory
2021-09-22 08:38:34 +03:00
rawHTML , err := io . ReadAll ( input )
2021-04-20 01:25:08 +03:00
if err != nil {
return err
2018-02-27 10:09:18 +03:00
}
// parse the HTML
2022-12-12 06:03:54 +03:00
node , err := html . Parse ( io . MultiReader (
// prepend "<html><body>"
strings . NewReader ( "<html><body>" ) ,
// Strip out nuls - they're always invalid
2024-11-16 11:41:44 +03:00
bytes . NewReader ( globalVars ( ) . tagCleaner . ReplaceAll ( [ ] byte ( globalVars ( ) . nulCleaner . Replace ( string ( rawHTML ) ) ) , [ ] byte ( "<$1" ) ) ) ,
2022-12-12 06:03:54 +03:00
// close the tags
strings . NewReader ( "</body></html>" ) ,
) )
2018-02-27 10:09:18 +03:00
if err != nil {
2024-11-26 19:46:02 +03:00
return fmt . Errorf ( "markup.postProcess: invalid HTML: %w" , err )
2018-02-27 10:09:18 +03:00
}
2021-06-22 01:12:22 +03:00
if node . Type == html . DocumentNode {
node = node . FirstChild
2018-02-27 10:09:18 +03:00
}
2024-01-15 11:49:24 +03:00
visitNode ( ctx , procs , node )
2021-01-20 18:10:50 +03:00
2021-06-22 01:12:22 +03:00
newNodes := make ( [ ] * html . Node , 0 , 5 )
if node . Data == "html" {
node = node . FirstChild
for node != nil && node . Data != "body" {
node = node . NextSibling
2021-01-20 18:10:50 +03:00
}
2021-06-22 01:12:22 +03:00
}
if node != nil {
2021-01-20 18:10:50 +03:00
if node . Data == "body" {
child := node . FirstChild
for child != nil {
newNodes = append ( newNodes , child )
child = child . NextSibling
}
} else {
newNodes = append ( newNodes , node )
}
}
2018-02-27 10:09:18 +03:00
// Render everything to buf.
2021-04-20 01:25:08 +03:00
for _ , node := range newNodes {
2022-03-17 21:04:36 +03:00
if err := html . Render ( output , node ) ; err != nil {
2024-11-26 19:46:02 +03:00
return fmt . Errorf ( "markup.postProcess: html.Render: %w" , err )
2017-09-16 20:17:57 +03:00
}
2018-02-27 10:09:18 +03:00
}
2021-04-20 01:25:08 +03:00
return nil
2018-02-27 10:09:18 +03:00
}
2024-11-18 08:25:42 +03:00
// isEmojiNode reports whether node is a <span> element with a "class" or
// "data-attr-class" attribute whose value contains "emoji".
func isEmojiNode(node *html.Node) bool {
	if node.Type != html.ElementNode || node.Data != atom.Span.String() {
		return false
	}
	for i := range node.Attr {
		a := &node.Attr[i]
		switch a.Key {
		case "class", "data-attr-class":
			if strings.Contains(a.Val, "emoji") {
				return true
			}
		}
	}
	return false
}
2024-06-04 15:19:41 +03:00
func visitNode ( ctx * RenderContext , procs [ ] processor , node * html . Node ) * html . Node {
2023-01-31 08:21:29 +03:00
// Add user-content- to IDs and "#" links if they don't already have them
2019-12-24 01:38:50 +03:00
for idx , attr := range node . Attr {
2023-01-31 08:21:29 +03:00
val := strings . TrimPrefix ( attr . Val , "#" )
2024-11-16 11:41:44 +03:00
notHasPrefix := ! ( strings . HasPrefix ( val , "user-content-" ) || globalVars ( ) . blackfridayExtRegex . MatchString ( val ) )
2023-01-31 08:21:29 +03:00
if attr . Key == "id" && notHasPrefix {
2019-12-24 01:38:50 +03:00
node . Attr [ idx ] . Val = "user-content-" + attr . Val
}
2020-04-28 21:05:39 +03:00
2023-01-31 08:21:29 +03:00
if attr . Key == "href" && strings . HasPrefix ( attr . Val , "#" ) && notHasPrefix {
node . Attr [ idx ] . Val = "#user-content-" + val
}
2019-12-24 01:38:50 +03:00
}
2020-04-28 21:05:39 +03:00
2018-02-27 10:09:18 +03:00
switch node . Type {
case html . TextNode :
2024-11-18 08:25:42 +03:00
for _ , proc := range procs {
proc ( ctx , node ) // it might add siblings
}
2018-02-27 10:09:18 +03:00
case html . ElementNode :
2024-11-18 08:25:42 +03:00
if isEmojiNode ( node ) {
// TextNode emoji will be converted to `<span class="emoji">`, then the next iteration will visit the "span"
// if we don't stop it, it will go into the TextNode again and create an infinite recursion
2024-06-21 21:23:54 +03:00
return node . NextSibling
2024-11-18 08:25:42 +03:00
} else if node . Data == "code" || node . Data == "pre" {
return node . NextSibling // ignore code and pre nodes
2024-06-21 21:23:54 +03:00
} else if node . Data == "img" {
return visitNodeImg ( ctx , node )
} else if node . Data == "video" {
return visitNodeVideo ( ctx , node )
2020-02-28 07:16:05 +03:00
} else if node . Data == "a" {
2024-11-18 08:25:42 +03:00
procs = emojiProcessors // Restrict text in links to emojis
2017-09-16 20:17:57 +03:00
}
2024-06-04 15:19:41 +03:00
for n := node . FirstChild ; n != nil ; {
n = visitNode ( ctx , procs , n )
2017-09-16 20:17:57 +03:00
}
2024-06-18 01:56:45 +03:00
default :
2018-02-27 10:09:18 +03:00
}
2024-06-04 15:19:41 +03:00
return node . NextSibling
2018-02-27 10:09:18 +03:00
}
2019-10-14 01:29:10 +03:00
// createKeyword() renders a highlighted version of an action keyword
2024-11-18 08:25:42 +03:00
func createKeyword ( ctx * RenderContext , content string ) * html . Node {
// CSS class for action keywords (e.g. "closes: #1")
const keywordClass = "issue-keyword"
2019-10-14 01:29:10 +03:00
span := & html . Node {
Type : html . ElementNode ,
Data : atom . Span . String ( ) ,
Attr : [ ] html . Attribute { } ,
}
2024-11-18 08:25:42 +03:00
span . Attr = append ( span . Attr , ctx . RenderInternal . NodeSafeAttr ( "class" , keywordClass ) )
2019-10-14 01:29:10 +03:00
text := & html . Node {
Type : html . TextNode ,
Data : content ,
}
span . AppendChild ( text )
return span
}
2024-11-18 08:25:42 +03:00
func createLink ( ctx * RenderContext , href , content , class string ) * html . Node {
2019-04-09 06:18:48 +03:00
a := & html . Node {
Type : html . ElementNode ,
Data : atom . A . String ( ) ,
2024-11-14 08:02:11 +03:00
Attr : [ ] html . Attribute { { Key : "href" , Val : href } } ,
}
2024-11-24 11:18:57 +03:00
if ! RenderBehaviorForTesting . DisableAdditionalAttributes {
2024-11-14 08:02:11 +03:00
a . Attr = append ( a . Attr , html . Attribute { Key : "data-markdown-generated-content" } )
2019-04-09 06:18:48 +03:00
}
2019-09-10 12:03:30 +03:00
if class != "" {
2024-11-18 08:25:42 +03:00
a . Attr = append ( a . Attr , ctx . RenderInternal . NodeSafeAttr ( "class" , class ) )
2019-09-10 12:03:30 +03:00
}
2019-04-09 06:18:48 +03:00
text := & html . Node {
2018-02-27 10:09:18 +03:00
Type : html . TextNode ,
Data : content ,
}
2019-04-09 06:18:48 +03:00
a . AppendChild ( text )
return a
}
2019-10-14 01:29:10 +03:00
// replaceContent takes text node, and in its content it replaces a section of
// it with the specified newNode.
2018-02-27 10:09:18 +03:00
func replaceContent ( node * html . Node , i , j int , newNode * html . Node ) {
2019-10-14 01:29:10 +03:00
replaceContentList ( node , i , j , [ ] * html . Node { newNode } )
}
// replaceContentList takes text node, and in its content it replaces a section of
// it with the specified newNodes. An example to visualize how this can work can
// be found here: https://play.golang.org/p/5zP8NnHZ03s
func replaceContentList ( node * html . Node , i , j int , newNodes [ ] * html . Node ) {
2018-02-27 10:09:18 +03:00
// get the data before and after the match
before := node . Data [ : i ]
after := node . Data [ j : ]
// Replace in the current node the text, so that it is only what it is
// supposed to have.
node . Data = before
// Get the current next sibling, before which we place the replaced data,
// and after that we place the new text node.
nextSibling := node . NextSibling
2019-10-14 01:29:10 +03:00
for _ , n := range newNodes {
node . Parent . InsertBefore ( n , nextSibling )
}
2018-02-27 10:09:18 +03:00
if after != "" {
node . Parent . InsertBefore ( & html . Node {
Type : html . TextNode ,
Data : after ,
} , nextSibling )
2017-09-16 20:17:57 +03:00
}
}