// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package charset
import (
"fmt"
"regexp"
"strings"
"unicode"
"unicode/utf8"
"code.gitea.io/gitea/modules/translation"
"golang.org/x/net/html"
)
// defaultWordRegexp is the VS Code default word pattern. It matches either a
// number-like token (optional minus sign, optional digits, a dot, one digit
// and any trailing word characters) or a run of characters that are none of
// the listed ASCII punctuation marks, whitespace or control bytes \x00-\x1f.
//
// The pattern must not contain any padding whitespace: inside a raw string
// literal every space is a literal character that the regexp would then
// require in the input.
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer ( locale translation . Locale , next HTMLStreamer , allowed ... rune ) HTMLStreamer {
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 15:51:36 +03:00
allowedM := make ( map [ rune ] bool , len ( allowed ) )
for _ , v := range allowed {
allowedM [ v ] = true
}
2022-08-13 21:32:34 +03:00
return & escapeStreamer {
escaped : & EscapeStatus { } ,
PassthroughHTMLStreamer : * NewPassthroughStreamer ( next ) ,
locale : locale ,
ambiguousTables : AmbiguousTablesForLocale ( locale ) ,
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 15:51:36 +03:00
allowed : allowedM ,
2022-08-13 21:32:34 +03:00
}
}
type escapeStreamer struct {
PassthroughHTMLStreamer
escaped * EscapeStatus
locale translation . Locale
ambiguousTables [ ] * AmbiguousTable
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 15:51:36 +03:00
allowed map [ rune ] bool
2022-08-13 21:32:34 +03:00
}
// EscapeStatus returns the status record that this streamer updates whenever
// it escapes a broken, ambiguous or invisible rune. The returned pointer is
// the streamer's own record, not a copy.
func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
	status := e.escaped
	return status
}
// Text tells the next streamer there is a text
func ( e * escapeStreamer ) Text ( data string ) error {
sb := & strings . Builder { }
2023-04-22 21:53:00 +03:00
var until int
var next int
pos := 0
2022-08-13 21:32:34 +03:00
if len ( data ) > len ( UTF8BOM ) && data [ : len ( UTF8BOM ) ] == string ( UTF8BOM ) {
_ , _ = sb . WriteString ( data [ : len ( UTF8BOM ) ] )
pos = len ( UTF8BOM )
}
2022-08-24 14:50:13 +03:00
dataBytes := [ ] byte ( data )
2022-08-13 21:32:34 +03:00
for pos < len ( data ) {
nextIdxs := defaultWordRegexp . FindStringIndex ( data [ pos : ] )
if nextIdxs == nil {
until = len ( data )
next = until
} else {
until , next = nextIdxs [ 0 ] + pos , nextIdxs [ 1 ] + pos
}
2023-12-17 17:38:54 +03:00
// from pos until we know that the runes are not \r\t\n or even ' '
2022-08-13 21:32:34 +03:00
runes := make ( [ ] rune , 0 , next - until )
positions := make ( [ ] int , 0 , next - until + 1 )
for pos < until {
2022-08-24 14:50:13 +03:00
r , sz := utf8 . DecodeRune ( dataBytes [ pos : ] )
2022-08-13 21:32:34 +03:00
positions = positions [ : 0 ]
positions = append ( positions , pos , pos + sz )
types , confusables , _ := e . runeTypes ( r )
2022-08-24 14:50:13 +03:00
if err := e . handleRunes ( dataBytes , [ ] rune { r } , positions , types , confusables , sb ) ; err != nil {
2022-08-13 21:32:34 +03:00
return err
}
pos += sz
}
for i := pos ; i < next ; {
2022-08-24 14:50:13 +03:00
r , sz := utf8 . DecodeRune ( dataBytes [ i : ] )
2022-08-13 21:32:34 +03:00
runes = append ( runes , r )
positions = append ( positions , i )
i += sz
}
positions = append ( positions , next )
types , confusables , runeCounts := e . runeTypes ( runes ... )
if runeCounts . needsEscape ( ) {
2022-08-24 14:50:13 +03:00
if err := e . handleRunes ( dataBytes , runes , positions , types , confusables , sb ) ; err != nil {
2022-08-13 21:32:34 +03:00
return err
}
} else {
2022-08-24 14:50:13 +03:00
_ , _ = sb . Write ( dataBytes [ pos : next ] )
2022-08-13 21:32:34 +03:00
}
pos = next
}
if sb . Len ( ) > 0 {
if err := e . PassthroughHTMLStreamer . Text ( sb . String ( ) ) ; err != nil {
return err
}
}
return nil
}
2022-08-24 14:50:13 +03:00
func ( e * escapeStreamer ) handleRunes ( data [ ] byte , runes [ ] rune , positions [ ] int , types [ ] runeType , confusables [ ] rune , sb * strings . Builder ) error {
2022-08-13 21:32:34 +03:00
for i , r := range runes {
switch types [ i ] {
case brokenRuneType :
if sb . Len ( ) > 0 {
if err := e . PassthroughHTMLStreamer . Text ( sb . String ( ) ) ; err != nil {
return err
}
sb . Reset ( )
}
end := positions [ i + 1 ]
start := positions [ i ]
2022-08-24 14:50:13 +03:00
if err := e . brokenRune ( data [ start : end ] ) ; err != nil {
2022-08-13 21:32:34 +03:00
return err
}
case ambiguousRuneType :
if sb . Len ( ) > 0 {
if err := e . PassthroughHTMLStreamer . Text ( sb . String ( ) ) ; err != nil {
return err
}
sb . Reset ( )
}
if err := e . ambiguousRune ( r , confusables [ 0 ] ) ; err != nil {
return err
}
confusables = confusables [ 1 : ]
case invisibleRuneType :
if sb . Len ( ) > 0 {
if err := e . PassthroughHTMLStreamer . Text ( sb . String ( ) ) ; err != nil {
return err
}
sb . Reset ( )
}
if err := e . invisibleRune ( r ) ; err != nil {
return err
}
default :
_ , _ = sb . WriteRune ( r )
}
}
return nil
}
// brokenRune marks the status as escaped with bad runes and writes the raw
// bytes bs as upper-case hex between angle brackets, wrapped in
// <span class="broken-code-point">.
func (e *escapeStreamer) brokenRune(bs []byte) error {
	e.escaped.Escaped = true
	e.escaped.HasBadRunes = true

	out := &e.PassthroughHTMLStreamer
	classAttr := html.Attribute{Key: "class", Val: "broken-code-point"}
	if err := out.StartTag("span", classAttr); err != nil {
		return err
	}
	hexText := fmt.Sprintf("<%X>", bs)
	if err := out.Text(hexText); err != nil {
		return err
	}
	return out.EndTag("span")
}
func ( e * escapeStreamer ) ambiguousRune ( r , c rune ) error {
e . escaped . Escaped = true
e . escaped . HasAmbiguous = true
if err := e . PassthroughHTMLStreamer . StartTag ( "span" , html . Attribute {
Key : "class" ,
2023-03-24 13:35:38 +03:00
Val : "ambiguous-code-point" ,
2022-08-13 21:32:34 +03:00
} , html . Attribute {
2023-03-24 13:35:38 +03:00
Key : "data-tooltip-content" ,
2022-08-13 21:32:34 +03:00
Val : e . locale . Tr ( "repo.ambiguous_character" , r , c ) ,
} ) ; err != nil {
return err
}
if err := e . PassthroughHTMLStreamer . StartTag ( "span" , html . Attribute {
Key : "class" ,
Val : "char" ,
} ) ; err != nil {
return err
}
if err := e . PassthroughHTMLStreamer . Text ( string ( r ) ) ; err != nil {
return err
}
if err := e . PassthroughHTMLStreamer . EndTag ( "span" ) ; err != nil {
return err
}
return e . PassthroughHTMLStreamer . EndTag ( "span" )
}
// invisibleRune marks the status as escaped with invisible runes and writes r
// as
//
//	<span class="escaped-code-point" data-escaped="[U+XXXX]"><span class="char">r</span></span>
//
// where XXXX is the code point in upper-case hex, padded to at least four
// digits.
func (e *escapeStreamer) invisibleRune(r rune) error {
	e.escaped.Escaped = true
	e.escaped.HasInvisible = true

	out := &e.PassthroughHTMLStreamer
	outerClass := html.Attribute{Key: "class", Val: "escaped-code-point"}
	codePoint := html.Attribute{Key: "data-escaped", Val: fmt.Sprintf("[U+%04X]", r)}
	if err := out.StartTag("span", outerClass, codePoint); err != nil {
		return err
	}

	innerClass := html.Attribute{Key: "class", Val: "char"}
	if err := out.StartTag("span", innerClass); err != nil {
		return err
	}
	if err := out.Text(string(r)); err != nil {
		return err
	}
	if err := out.EndTag("span"); err != nil {
		return err
	}
	return out.EndTag("span")
}
// runeCountType tallies how many runes of each classification were seen in
// one call to runeTypes.
type runeCountType struct {
	numBasicRunes                int
	numNonConfusingNonBasicRunes int
	numAmbiguousRunes            int
	numInvisibleRunes            int
	numBrokenRunes               int
}

// needsEscape reports whether a run of runes with these counts has to be
// escaped. Any broken rune forces escaping. A run that has no basic runes but
// at least one non-confusing non-basic rune is never escaped. Otherwise the
// run is escaped when it contains an ambiguous or an invisible rune.
func (counts runeCountType) needsEscape() bool {
	switch {
	case counts.numBrokenRunes > 0:
		return true
	case counts.numBasicRunes == 0 && counts.numNonConfusingNonBasicRunes > 0:
		return false
	default:
		return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
	}
}
// runeType is the classification runeTypes assigns to a single rune.
type runeType int

const (
	// basicASCIIRuneType is the zero value, so it is what an untouched entry
	// of a freshly made []runeType holds.
	basicASCIIRuneType runeType = iota // <- This is technically deadcode but its self-documenting so it should stay
	// brokenRuneType marks a rune that decoded to utf8.RuneError.
	brokenRuneType
	// nonBasicASCIIRuneType marks a harmless rune outside 0x20..0x7e.
	nonBasicASCIIRuneType
	// ambiguousRuneType marks a rune that isAmbiguous reported as confusable.
	ambiguousRuneType
	// invisibleRuneType marks a rune in InvisibleRanges or a control character.
	invisibleRuneType
)
func ( e * escapeStreamer ) runeTypes ( runes ... rune ) ( types [ ] runeType , confusables [ ] rune , runeCounts runeCountType ) {
types = make ( [ ] runeType , len ( runes ) )
for i , r := range runes {
var confusable rune
switch {
case r == utf8 . RuneError :
types [ i ] = brokenRuneType
runeCounts . numBrokenRunes ++
case r == ' ' || r == '\t' || r == '\n' :
runeCounts . numBasicRunes ++
Fix isAllowed of escapeStreamer (#22814)
The use of `sort.Search` is wrong: The slice should be sorted, and
`return >= 0` doen't mean it exists, see the
[manual](https://pkg.go.dev/sort#Search).
Could be fixed like this if we really need it:
```diff
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 823b63513..fcf1ffbc1 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -20,6 +20,9 @@ import (
var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
+ sort.Slice(allowed, func(i, j int) bool {
+ return allowed[i] < allowed[j]
+ })
return &escapeStreamer{
escaped: &EscapeStatus{},
PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
@@ -284,14 +287,8 @@ func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables
}
func (e *escapeStreamer) isAllowed(r rune) bool {
- if len(e.allowed) == 0 {
- return false
- }
- if len(e.allowed) == 1 {
- return e.allowed[0] == r
- }
-
- return sort.Search(len(e.allowed), func(i int) bool {
+ i := sort.Search(len(e.allowed), func(i int) bool {
return e.allowed[i] >= r
- }) >= 0
+ })
+ return i < len(e.allowed) && e.allowed[i] == r
}
```
But I don't think so, a map is better to do it.
2023-02-09 15:51:36 +03:00
case e . allowed [ r ] :
2022-08-13 21:32:34 +03:00
if r > 0x7e || r < 0x20 {
types [ i ] = nonBasicASCIIRuneType
runeCounts . numNonConfusingNonBasicRunes ++
} else {
runeCounts . numBasicRunes ++
}
case unicode . Is ( InvisibleRanges , r ) :
types [ i ] = invisibleRuneType
runeCounts . numInvisibleRunes ++
case unicode . IsControl ( r ) :
types [ i ] = invisibleRuneType
runeCounts . numInvisibleRunes ++
case isAmbiguous ( r , & confusable , e . ambiguousTables ... ) :
confusables = append ( confusables , confusable )
types [ i ] = ambiguousRuneType
runeCounts . numAmbiguousRunes ++
case r > 0x7e || r < 0x20 :
types [ i ] = nonBasicASCIIRuneType
runeCounts . numNonConfusingNonBasicRunes ++
default :
runeCounts . numBasicRunes ++
}
}
return types , confusables , runeCounts
}