2021-06-05 15:32:19 +03:00
// Copyright 2021 The Gitea Authors. All rights reserved.
2022-11-27 21:20:29 +03:00
// SPDX-License-Identifier: MIT
2021-06-05 15:32:19 +03:00
package typesniffer
import (
2023-03-07 15:11:24 +03:00
"bytes"
2021-06-05 15:32:19 +03:00
"fmt"
"io"
"net/http"
"regexp"
"strings"
2021-10-25 00:12:43 +03:00
"code.gitea.io/gitea/modules/util"
2021-06-05 15:32:19 +03:00
)
// sniffLen is the maximum number of leading bytes (1 KiB) used to determine a content type.
const sniffLen = 1 << 10
2022-05-28 18:10:14 +03:00
// MIME type names shared by the content type sniffer and its callers.
const (
	// ApplicationOctetStream is the MIME type of binary files.
	ApplicationOctetStream = "application/octet-stream"
	// SvgMimeType is the MIME type of SVG images.
	SvgMimeType = "image/svg+xml"
)
2021-06-05 15:32:19 +03:00
2022-01-20 20:46:10 +03:00
// Patterns used to recognize SVG documents. They are compiled once at package
// initialization. The patterns must not carry any padding inside the raw
// string: a literal space in front of `\A` can never match, which would
// disable SVG detection entirely.
var (
	// svgComment matches a single <!-- ... --> comment; (?s) lets it span lines.
	svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
	// svgTagRegex matches input that starts (after optional whitespace and any
	// number of "<!DOCTYPE svg ...>" declarations) with an <svg tag.
	svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	// svgTagInXMLRegex is like svgTagRegex but requires a leading <?xml ... ?>
	// declaration at the very start of the input.
	svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
)
2021-06-05 15:32:19 +03:00
2021-07-08 14:38:13 +03:00
// SniffedType contains information about a blob's type.
2021-06-05 15:32:19 +03:00
type SniffedType struct {
	// contentType holds the detected content type string; it may carry
	// parameters after a ';' (GetMimeType strips them).
	contentType string
}
// IsText detects if the content format is plain text, i.e. whether the
// content type contains "text/".
func (ct SniffedType) IsText() bool {
	const marker = "text/"
	return strings.Contains(ct.contentType, marker)
}
// IsImage detects if the data is an image format, i.e. whether the content
// type contains "image/".
func (ct SniffedType) IsImage() bool {
	const marker = "image/"
	return strings.Contains(ct.contentType, marker)
}
// IsSvgImage detects if the data is an SVG image, i.e. whether the content
// type contains SvgMimeType.
func (ct SniffedType) IsSvgImage() bool {
	mime := ct.contentType
	return strings.Contains(mime, SvgMimeType)
}
// IsPDF detects if the data is a PDF document, i.e. whether the content type
// contains "application/pdf".
func (ct SniffedType) IsPDF() bool {
	const marker = "application/pdf"
	return strings.Contains(ct.contentType, marker)
}
// IsVideo detects if the data is a video format, i.e. whether the content
// type contains "video/".
func (ct SniffedType) IsVideo() bool {
	const marker = "video/"
	return strings.Contains(ct.contentType, marker)
}
// IsAudio detects if the data is an audio format, i.e. whether the content
// type contains "audio/".
func (ct SniffedType) IsAudio() bool {
	const marker = "audio/"
	return strings.Contains(ct.contentType, marker)
}
// IsRepresentableAsText returns true if the file content can be represented
// as plain text: it is either a text type or an SVG image.
func (ct SniffedType) IsRepresentableAsText() bool {
	if ct.IsText() {
		return true
	}
	return ct.IsSvgImage()
}
2023-08-15 05:31:25 +03:00
// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
2022-07-29 18:26:55 +03:00
func ( ct SniffedType ) IsBrowsableBinaryType ( ) bool {
return ct . IsImage ( ) || ct . IsSvgImage ( ) || ct . IsPDF ( ) || ct . IsVideo ( ) || ct . IsAudio ( )
}
// GetMimeType returns the MIME type, i.e. the content type up to (and not
// including) the first ';'. The whole content type is returned when it has
// no ';'.
func (ct SniffedType) GetMimeType() string {
	if end := strings.IndexByte(ct.contentType, ';'); end >= 0 {
		return ct.contentType[:end]
	}
	return ct.contentType
}
2021-06-05 15:32:19 +03:00
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
func DetectContentType ( data [ ] byte ) SniffedType {
if len ( data ) == 0 {
return SniffedType { "text/unknown" }
}
ct := http . DetectContentType ( data )
if len ( data ) > sniffLen {
data = data [ : sniffLen ]
}
2023-03-07 15:11:24 +03:00
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
detectByHTML := strings . Contains ( ct , "text/plain" ) || strings . Contains ( ct , "text/html" )
detectByXML := strings . Contains ( ct , "text/xml" )
if detectByHTML || detectByXML {
dataProcessed := svgComment . ReplaceAll ( data , nil )
dataProcessed = bytes . TrimSpace ( dataProcessed )
if detectByHTML && svgTagRegex . Match ( dataProcessed ) ||
detectByXML && svgTagInXMLRegex . Match ( dataProcessed ) {
ct = SvgMimeType
}
2021-06-05 15:32:19 +03:00
}
2023-03-08 06:40:41 +03:00
if strings . HasPrefix ( ct , "audio/" ) && bytes . HasPrefix ( data , [ ] byte ( "ID3" ) ) {
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
ct2 := http . DetectContentType ( data [ 3 : ] )
if strings . HasPrefix ( ct2 , "text/" ) {
ct = ct2
}
}
2023-08-15 05:31:25 +03:00
if ct == "application/ogg" {
dataHead := data
if len ( dataHead ) > 256 {
dataHead = dataHead [ : 256 ] // only need to do a quick check for the file header
}
if bytes . Contains ( dataHead , [ ] byte ( "theora" ) ) || bytes . Contains ( dataHead , [ ] byte ( "dirac" ) ) {
ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
} else {
ct = "audio/ogg" // for most cases, it is used as an audio container
}
}
2021-06-05 15:32:19 +03:00
return SniffedType { ct }
}
// DetectContentTypeFromReader guesses the content type contained in the reader.
func DetectContentTypeFromReader ( r io . Reader ) ( SniffedType , error ) {
buf := make ( [ ] byte , sniffLen )
2021-10-25 00:12:43 +03:00
n , err := util . ReadAtMost ( r , buf )
if err != nil {
2021-06-05 15:32:19 +03:00
return SniffedType { } , fmt . Errorf ( "DetectContentTypeFromReader io error: %w" , err )
}
buf = buf [ : n ]
return DetectContentType ( buf ) , nil
}