// SniffContentSize is the maximum number of leading bytes that
// DetectContentType inspects for its own (non net/http) checks.
const SniffContentSize = 1024

// MIME type constants used by the sniffer.
const (
	MimeTypeImageSvg  = "image/svg+xml"
	MimeTypeImageAvif = "image/avif"

	MimeTypeApplicationOctetStream = "application/octet-stream"
)

// globalVars lazily compiles, exactly once, the regular expressions used for
// SVG detection.
//
// NOTE(review): in the text under review every "<...>" span inside these
// patterns had been stripped (the literals were empty or unterminated). The
// patterns below are reconstructed from the surviving fragments; confirm them
// against version control.
var globalVars = sync.OnceValue(func() (ret struct {
	svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp
},
) {
	// An XML/HTML comment, possibly spanning several lines.
	ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
	// Optional "<!DOCTYPE svg ...>" declarations followed by an <svg> tag.
	ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	// Same as above, but preceded by a mandatory "<?xml ...?>" prolog.
	ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	return ret
})

// SniffedType holds the content type detected for a piece of data.
//
// NOTE(review): the original declaration (and any methods defined on this
// type) was not present in the text under review; only the single string
// field implied by the composite literals in DetectContentType is declared
// here. Restore the methods from version control.
type SniffedType struct {
	contentType string
}

// detectFileTypeBox parses an ISO base media "ftyp" box at the start of data.
// It returns the major brand followed by all compatible brands, and
// found=true when data starts with a well-formed ftyp box that fits entirely
// inside data.
//
// NOTE(review): the statements before the box-size bounds check were missing
// from the text under review and are reconstructed; confirm.
func detectFileTypeBox(data []byte) (brands []string, found bool) {
	// 4 bytes box size + 4 bytes box type + 4 bytes major brand.
	if len(data) < 12 {
		return nil, false
	}
	boxSize := int(binary.BigEndian.Uint32(data[:4]))
	if boxSize < 12 || boxSize > len(data) {
		return nil, false
	}
	tag := string(data[4:8])
	if tag != "ftyp" {
		return nil, false
	}
	brands = append(brands, string(data[8:12]))
	// Bytes 12..15 hold the minor version; compatible brands start at 16.
	for i := 16; i+4 <= boxSize; i += 4 {
		brands = append(brands, string(data[i:i+4]))
	}
	return brands, true
}

// DetectContentType extends http.DetectContentType with more content types.
// Defaults to text/plain if input is empty.
//
// On top of the result of http.DetectContentType it:
//   - recognises SVG images in content sniffed as text/plain, text/html or text/xml,
//   - re-checks "ID3"-prefixed content that was sniffed as audio but is really text,
//   - recognises AVIF images through the "ftyp" box brands,
//   - refines "application/ogg" into "video/ogg" or "audio/ogg".
//
// Only the first SniffContentSize bytes of data are used for these extra checks.
func DetectContentType(data []byte) SniffedType {
	if len(data) == 0 {
		return SniffedType{"text/plain"}
	}

	ct := http.DetectContentType(data)

	if len(data) > SniffContentSize {
		data = data[:SniffContentSize]
	}

	vars := globalVars()
	// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
	detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
	detectByXML := strings.Contains(ct, "text/xml")
	if detectByHTML || detectByXML {
		// Drop comments and surrounding whitespace so the regexes can anchor at the start.
		dataProcessed := vars.svgComment.ReplaceAll(data, nil)
		dataProcessed = bytes.TrimSpace(dataProcessed)
		if (detectByHTML && vars.svgTagRegex.Match(dataProcessed)) ||
			(detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed)) {
			ct = MimeTypeImageSvg
		}
	}

	if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
		// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
		// So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content.
		// This works especially because audio files contain many unprintable/invalid characters like `0x00`
		ct2 := http.DetectContentType(data[3:])
		if strings.HasPrefix(ct2, "text/") {
			ct = ct2
		}
	}

	fileTypeBrands, found := detectFileTypeBox(data)
	if found && slices.Contains(fileTypeBrands, "avif") {
		ct = MimeTypeImageAvif
	}

	if ct == "application/ogg" {
		dataHead := data
		if len(dataHead) > 256 {
			dataHead = dataHead[:256] // only need to do a quick check for the file header
		}
		if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
			ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
		} else {
			ct = "audio/ogg" // for most cases, it is used as an audio container
		}
	}
	return SniffedType{ct}
}