2019-08-15 15:07:28 +03:00
// Copyright 2014 The Gogs Authors. All rights reserved.
2022-11-27 21:20:29 +03:00
// SPDX-License-Identifier: MIT
2019-08-15 15:07:28 +03:00
package charset
import (
"bytes"
"fmt"
2021-04-20 01:25:08 +03:00
"io"
2020-06-03 01:20:19 +03:00
"strings"
2019-08-15 15:07:28 +03:00
"unicode/utf8"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
2021-10-25 00:12:43 +03:00
"code.gitea.io/gitea/modules/util"
2019-08-15 15:07:28 +03:00
2019-12-14 03:15:48 +03:00
"github.com/gogs/chardet"
2019-08-15 15:07:28 +03:00
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
// UTF8BOM holds the three bytes (EF BB BF) of the UTF-8 byte-order mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
2024-01-27 21:02:51 +03:00
// ConvertOpts holds the options accepted by the UTF-8 conversion helpers in
// this package.
type ConvertOpts struct {
// KeepBOM, when true, makes MaybeRemoveBOM return its input unchanged
// instead of stripping a leading UTF-8 byte-order mark.
KeepBOM bool
}
Fix various typos (#20338)
* Fix various typos
Found via `codespell -q 3 -S ./options/locale,./options/license,./public/vendor -L actived,allways,attachements,ba,befores,commiter,pullrequest,pullrequests,readby,splitted,te,unknwon`
Co-authored-by: zeripath <art27@cantab.net>
2022-07-13 00:32:37 +03:00
// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
2024-01-27 21:02:51 +03:00
func ToUTF8WithFallbackReader ( rd io . Reader , opts ConvertOpts ) io . Reader {
2022-01-20 20:46:10 +03:00
buf := make ( [ ] byte , 2048 )
2021-10-25 00:12:43 +03:00
n , err := util . ReadAtMost ( rd , buf )
2021-04-20 01:25:08 +03:00
if err != nil {
2024-01-27 21:02:51 +03:00
return io . MultiReader ( bytes . NewReader ( MaybeRemoveBOM ( buf [ : n ] , opts ) ) , rd )
2021-04-20 01:25:08 +03:00
}
charsetLabel , err := DetectEncoding ( buf [ : n ] )
if err != nil || charsetLabel == "UTF-8" {
2024-01-27 21:02:51 +03:00
return io . MultiReader ( bytes . NewReader ( MaybeRemoveBOM ( buf [ : n ] , opts ) ) , rd )
2021-04-20 01:25:08 +03:00
}
encoding , _ := charset . Lookup ( charsetLabel )
if encoding == nil {
return io . MultiReader ( bytes . NewReader ( buf [ : n ] ) , rd )
}
return transform . NewReader (
io . MultiReader (
2024-01-27 21:02:51 +03:00
bytes . NewReader ( MaybeRemoveBOM ( buf [ : n ] , opts ) ) ,
2021-04-20 01:25:08 +03:00
rd ,
) ,
encoding . NewDecoder ( ) ,
)
}
2024-01-27 21:02:51 +03:00
// ToUTF8 converts content to UTF8 encoding
func ToUTF8 ( content [ ] byte , opts ConvertOpts ) ( string , error ) {
2019-08-15 15:07:28 +03:00
charsetLabel , err := DetectEncoding ( content )
if err != nil {
return "" , err
} else if charsetLabel == "UTF-8" {
2024-01-27 21:02:51 +03:00
return string ( MaybeRemoveBOM ( content , opts ) ) , nil
2019-08-15 15:07:28 +03:00
}
encoding , _ := charset . Lookup ( charsetLabel )
if encoding == nil {
return string ( content ) , fmt . Errorf ( "Unknown encoding: %s" , charsetLabel )
}
// If there is an error, we concatenate the nicely decoded part and the
2019-09-21 20:01:34 +03:00
// original left over. This way we won't lose much data.
2019-08-15 15:07:28 +03:00
result , n , err := transform . Bytes ( encoding . NewDecoder ( ) , content )
if err != nil {
result = append ( result , content [ n : ] ... )
}
2024-01-27 21:02:51 +03:00
result = MaybeRemoveBOM ( result , opts )
2019-08-15 15:07:28 +03:00
return string ( result ) , err
}
Fix various typos (#20338)
* Fix various typos
Found via `codespell -q 3 -S ./options/locale,./options/license,./public/vendor -L actived,allways,attachements,ba,befores,commiter,pullrequest,pullrequests,readby,splitted,te,unknwon`
Co-authored-by: zeripath <art27@cantab.net>
2022-07-13 00:32:37 +03:00
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
2024-01-27 21:02:51 +03:00
func ToUTF8WithFallback ( content [ ] byte , opts ConvertOpts ) [ ] byte {
bs , _ := io . ReadAll ( ToUTF8WithFallbackReader ( bytes . NewReader ( content ) , opts ) )
2021-04-20 01:25:08 +03:00
return bs
2019-08-15 15:07:28 +03:00
}
// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
2024-01-27 21:02:51 +03:00
func ToUTF8DropErrors ( content [ ] byte , opts ConvertOpts ) [ ] byte {
2019-08-15 15:07:28 +03:00
charsetLabel , err := DetectEncoding ( content )
if err != nil || charsetLabel == "UTF-8" {
2024-01-27 21:02:51 +03:00
return MaybeRemoveBOM ( content , opts )
2019-08-15 15:07:28 +03:00
}
encoding , _ := charset . Lookup ( charsetLabel )
if encoding == nil {
return content
}
// We ignore any non-decodable parts from the file.
// Some parts might be lost
var decoded [ ] byte
decoder := encoding . NewDecoder ( )
idx := 0
for {
result , n , err := transform . Bytes ( decoder , content [ idx : ] )
decoded = append ( decoded , result ... )
if err == nil {
break
}
decoded = append ( decoded , ' ' )
idx = idx + n + 1
if idx >= len ( content ) {
break
}
}
2024-01-27 21:02:51 +03:00
return MaybeRemoveBOM ( decoded , opts )
2019-08-15 15:07:28 +03:00
}
2024-01-27 21:02:51 +03:00
// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
func MaybeRemoveBOM ( content [ ] byte , opts ConvertOpts ) [ ] byte {
if opts . KeepBOM {
return content
}
2019-08-15 15:07:28 +03:00
if len ( content ) > 2 && bytes . Equal ( content [ 0 : 3 ] , UTF8BOM ) {
return content [ 3 : ]
}
return content
}
// DetectEncoding detect the encoding of content
func DetectEncoding ( content [ ] byte ) ( string , error ) {
2022-05-21 16:06:24 +03:00
// First we check if the content represents valid utf8 content excepting a truncated character at the end.
// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
// instead we walk backwards from the end to trim off a the incomplete character
toValidate := content
end := len ( toValidate ) - 1
if end < 0 {
// no-op
} else if toValidate [ end ] >> 5 == 0b110 {
// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>
toValidate = toValidate [ : end ]
} else if end > 0 && toValidate [ end ] >> 6 == 0b10 && toValidate [ end - 1 ] >> 4 == 0b1110 {
// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>
toValidate = toValidate [ : end - 1 ]
} else if end > 1 && toValidate [ end ] >> 6 == 0b10 && toValidate [ end - 1 ] >> 6 == 0b10 && toValidate [ end - 2 ] >> 3 == 0b11110 {
// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
toValidate = toValidate [ : end - 2 ]
}
if utf8 . Valid ( toValidate ) {
2019-08-15 15:07:28 +03:00
log . Debug ( "Detected encoding: utf-8 (fast)" )
return "UTF-8" , nil
}
textDetector := chardet . NewTextDetector ( )
var detectContent [ ] byte
if len ( content ) < 1024 {
// Check if original content is valid
if _ , err := textDetector . DetectBest ( content ) ; err != nil {
return "" , err
}
times := 1024 / len ( content )
detectContent = make ( [ ] byte , 0 , times * len ( content ) )
for i := 0 ; i < times ; i ++ {
detectContent = append ( detectContent , content ... )
}
} else {
detectContent = content
}
2020-06-03 01:20:19 +03:00
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
results , err := textDetector . DetectAll ( detectContent )
2019-08-15 15:07:28 +03:00
if err != nil {
2020-06-03 01:20:19 +03:00
if err == chardet . NotDetectedError && len ( setting . Repository . AnsiCharset ) > 0 {
log . Debug ( "Using default AnsiCharset: %s" , setting . Repository . AnsiCharset )
return setting . Repository . AnsiCharset , nil
}
2019-08-15 15:07:28 +03:00
return "" , err
}
2020-06-03 01:20:19 +03:00
topConfidence := results [ 0 ] . Confidence
topResult := results [ 0 ]
priority , has := setting . Repository . DetectedCharsetScore [ strings . ToLower ( strings . TrimSpace ( topResult . Charset ) ) ]
for _ , result := range results {
// As results are sorted in confidence order - if we have a different confidence
// we know it's less than the current confidence and can break out of the loop early
if result . Confidence != topConfidence {
break
}
Fix various typos (#20338)
* Fix various typos
Found via `codespell -q 3 -S ./options/locale,./options/license,./public/vendor -L actived,allways,attachements,ba,befores,commiter,pullrequest,pullrequests,readby,splitted,te,unknwon`
Co-authored-by: zeripath <art27@cantab.net>
2022-07-13 00:32:37 +03:00
// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guess
2020-06-03 01:20:19 +03:00
resultPriority , resultHas := setting . Repository . DetectedCharsetScore [ strings . ToLower ( strings . TrimSpace ( result . Charset ) ) ]
if resultHas && ( ! has || resultPriority < priority ) {
topResult = result
priority = resultPriority
has = true
}
}
2019-08-15 15:07:28 +03:00
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
2020-06-03 01:20:19 +03:00
if topResult . Charset != "UTF-8" && len ( setting . Repository . AnsiCharset ) > 0 {
2019-08-15 15:07:28 +03:00
log . Debug ( "Using default AnsiCharset: %s" , setting . Repository . AnsiCharset )
return setting . Repository . AnsiCharset , err
}
2020-06-03 01:20:19 +03:00
log . Debug ( "Detected encoding: %s" , topResult . Charset )
return topResult . Charset , err
2019-08-15 15:07:28 +03:00
}