2021-06-05 14:32:19 +02:00
// Copyright 2021 The Gitea Authors. All rights reserved.
2022-11-27 13:20:29 -05:00
// SPDX-License-Identifier: MIT
2021-06-05 14:32:19 +02:00
package typesniffer
import (
2023-03-07 20:11:24 +08:00
"bytes"
2021-06-05 14:32:19 +02:00
"fmt"
"io"
"net/http"
"regexp"
"strings"
2021-10-24 23:12:43 +02:00
"code.gitea.io/gitea/modules/util"
2021-06-05 14:32:19 +02:00
)
// sniffLen is the maximum number of leading bytes used to determine the
// content type: DetectContentType truncates its input to this length before
// running the SVG and ID3 checks, and DetectContentTypeFromReader allocates a
// read buffer of this size.
const sniffLen = 1024
2022-05-28 18:10:14 +03:00
// MIME type constants exported by this package.
const (
// SvgMimeType is the MIME type of SVG images. DetectContentType assigns it
// when the sniffed data matches one of the SVG root-tag patterns.
SvgMimeType = "image/svg+xml"
// ApplicationOctetStream is the MIME type of binary files.
ApplicationOctetStream = "application/octet-stream"
)
2021-06-05 14:32:19 +02:00
2022-01-20 18:46:10 +01:00
// Patterns used by DetectContentType to recognise SVG documents, which
// http.DetectContentType does not support.
var (
	// svgComment matches a single "<!-- ... -->" comment, non-greedily and with
	// "." matching newlines, so comments can be stripped before the root tag
	// is searched for.
	svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)

	// svgTagRegex matches input that starts (after optional whitespace and any
	// number of "<!DOCTYPE svg ...>" declarations) with an "<svg" tag.
	// Matching is case-insensitive and anchored at the start of the text, so
	// the pattern must not begin with any literal character before `\A`.
	svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)

	// svgTagInXMLRegex is the same as svgTagRegex but requires the text to
	// begin with an "<?xml ... ?>" declaration.
	svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
)
2021-06-05 14:32:19 +02:00
2021-07-08 07:38:13 -04:00
// SniffedType contains information about a blob's type.
type SniffedType struct {
	// contentType is the detected content type as set by DetectContentType.
	// It may carry parameters after a ";" (GetMimeType strips them).
	contentType string
}
// IsText detects whether the content format is plain text, i.e. whether the
// sniffed content type contains "text/".
func (ct SniffedType) IsText() bool {
	const marker = "text/"
	return strings.Contains(ct.contentType, marker)
}
// IsImage detects whether the data is an image format, i.e. whether the
// sniffed content type contains "image/".
func (ct SniffedType) IsImage() bool {
	const marker = "image/"
	return strings.Contains(ct.contentType, marker)
}
// IsSvgImage detects whether the data is an SVG image, i.e. whether the
// sniffed content type contains SvgMimeType.
func (ct SniffedType) IsSvgImage() bool {
	sniffed := ct.contentType
	return strings.Contains(sniffed, SvgMimeType)
}
// IsPDF detects whether the data is a PDF document, i.e. whether the sniffed
// content type contains "application/pdf".
func (ct SniffedType) IsPDF() bool {
	const marker = "application/pdf"
	return strings.Contains(ct.contentType, marker)
}
// IsVideo detects whether the data is a video format, i.e. whether the
// sniffed content type contains "video/".
func (ct SniffedType) IsVideo() bool {
	const marker = "video/"
	return strings.Contains(ct.contentType, marker)
}
// IsAudio detects whether the data is an audio format, i.e. whether the
// sniffed content type contains "audio/".
func (ct SniffedType) IsAudio() bool {
	const marker = "audio/"
	return strings.Contains(ct.contentType, marker)
}
// IsRepresentableAsText reports whether the file content can be shown as
// plain text: it is either a text type or an SVG image. Empty content also
// qualifies, because DetectContentType reports it as "text/unknown".
func (ct SniffedType) IsRepresentableAsText() bool {
	if ct.IsText() {
		return true
	}
	return ct.IsSvgImage()
}
2022-07-29 17:26:55 +02:00
// IsBrowsableBinaryType reports whether a non-text type can be displayed in a
// browser: an image (including SVG), a PDF, a video or an audio file.
func (ct SniffedType) IsBrowsableBinaryType() bool {
	switch {
	case ct.IsImage(), ct.IsSvgImage(), ct.IsPDF(), ct.IsVideo(), ct.IsAudio():
		return true
	}
	return false
}
// GetMimeType returns the MIME type without parameters: everything in the
// sniffed content type before the first ";", or the whole value when there is
// no ";".
func (ct SniffedType) GetMimeType() string {
	full := ct.contentType
	if sep := strings.IndexByte(full, ';'); sep >= 0 {
		return full[:sep]
	}
	return full
}
2021-06-05 14:32:19 +02:00
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
func DetectContentType ( data [ ] byte ) SniffedType {
if len ( data ) == 0 {
return SniffedType { "text/unknown" }
}
ct := http . DetectContentType ( data )
if len ( data ) > sniffLen {
data = data [ : sniffLen ]
}
2023-03-07 20:11:24 +08:00
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
detectByHTML := strings . Contains ( ct , "text/plain" ) || strings . Contains ( ct , "text/html" )
detectByXML := strings . Contains ( ct , "text/xml" )
if detectByHTML || detectByXML {
dataProcessed := svgComment . ReplaceAll ( data , nil )
dataProcessed = bytes . TrimSpace ( dataProcessed )
if detectByHTML && svgTagRegex . Match ( dataProcessed ) ||
detectByXML && svgTagInXMLRegex . Match ( dataProcessed ) {
ct = SvgMimeType
}
2021-06-05 14:32:19 +02:00
}
2023-03-08 11:40:41 +08:00
if strings . HasPrefix ( ct , "audio/" ) && bytes . HasPrefix ( data , [ ] byte ( "ID3" ) ) {
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
// So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
ct2 := http . DetectContentType ( data [ 3 : ] )
if strings . HasPrefix ( ct2 , "text/" ) {
ct = ct2
}
}
2021-06-05 14:32:19 +02:00
return SniffedType { ct }
}
// DetectContentTypeFromReader guesses the content type contained in the reader.
func DetectContentTypeFromReader ( r io . Reader ) ( SniffedType , error ) {
buf := make ( [ ] byte , sniffLen )
2021-10-24 23:12:43 +02:00
n , err := util . ReadAtMost ( r , buf )
if err != nil {
2021-06-05 14:32:19 +02:00
return SniffedType { } , fmt . Errorf ( "DetectContentTypeFromReader io error: %w" , err )
}
buf = buf [ : n ]
return DetectContentType ( buf ) , nil
}