// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

//go:build gogit

package git
import (
	"bytes"
	"io"
	"strings"

	"code.gitea.io/gitea/modules/analyze"

	"github.com/go-enry/go-enry/v2"
	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/object"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func ( repo * Repository ) GetLanguageStats ( commitID string ) ( map [ string ] int64 , error ) {
r , err := git . PlainOpen ( repo . Path )
if err != nil {
return nil , err
}
rev , err := r . ResolveRevision ( plumbing . Revision ( commitID ) )
if err != nil {
return nil , err
}
commit , err := r . CommitObject ( * rev )
if err != nil {
return nil , err
}
tree , err := commit . Tree ( )
if err != nil {
return nil , err
}
2022-06-16 18:47:44 +03:00
checker , deferable := repo . CheckAttributeReader ( commitID )
defer deferable ( )
2021-09-09 23:13:36 +03:00
2022-10-29 10:04:21 +03:00
// sizes contains the current calculated size of all files by language
2020-12-17 17:00:47 +03:00
sizes := make ( map [ string ] int64 )
2022-10-29 10:04:21 +03:00
// by default we will only count the sizes of programming languages or markup languages
// unless they are explicitly set using linguist-language
includedLanguage := map [ string ] bool { }
// or if there's only one language in the repository
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64 ( 0 )
2020-12-17 17:00:47 +03:00
err = tree . Files ( ) . ForEach ( func ( f * object . File ) error {
2021-09-09 23:13:36 +03:00
if f . Size == 0 {
return nil
}
notVendored := false
notGenerated := false
if checker != nil {
attrs , err := checker . CheckPath ( f . Name )
if err == nil {
if vendored , has := attrs [ "linguist-vendored" ] ; has {
if vendored == "set" || vendored == "true" {
return nil
}
notVendored = vendored == "false"
}
if generated , has := attrs [ "linguist-generated" ] ; has {
if generated == "set" || generated == "true" {
return nil
}
notGenerated = generated == "false"
}
if language , has := attrs [ "linguist-language" ] ; has && language != "unspecified" && language != "" {
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
2021-09-20 22:46:51 +03:00
if len ( group ) != 0 {
2021-09-09 23:13:36 +03:00
language = group
}
2022-10-29 10:04:21 +03:00
// this language will always be added to the size
2021-09-09 23:13:36 +03:00
sizes [ language ] += f . Size
return nil
2021-11-17 23:37:00 +03:00
} else if language , has := attrs [ "gitlab-language" ] ; has && language != "unspecified" && language != "" {
// strip off a ? if present
if idx := strings . IndexByte ( language , '?' ) ; idx >= 0 {
language = language [ : idx ]
}
if len ( language ) != 0 {
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
if len ( group ) != 0 {
language = group
}
2022-10-29 10:04:21 +03:00
// this language will always be added to the size
2021-11-17 23:37:00 +03:00
sizes [ language ] += f . Size
return nil
}
2021-09-09 23:13:36 +03:00
}
}
}
if ( ! notVendored && analyze . IsVendor ( f . Name ) ) || enry . IsDotFile ( f . Name ) ||
2020-12-17 17:00:47 +03:00
enry . IsDocumentation ( f . Name ) || enry . IsConfiguration ( f . Name ) {
return nil
}
// If content can not be read or file is too big just do detection by filename
var content [ ] byte
if f . Size <= bigFileSize {
content , _ = readFile ( f , fileSizeLimit )
}
2021-09-09 23:13:36 +03:00
if ! notGenerated && enry . IsGenerated ( f . Name , content ) {
2020-12-17 17:00:47 +03:00
return nil
}
// TODO: Use .gitattributes file for linguist overrides
language := analyze . GetCodeLanguage ( f . Name , content )
if language == enry . OtherLanguage || language == "" {
return nil
}
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
if group != "" {
language = group
}
2022-10-29 10:04:21 +03:00
included , checked := includedLanguage [ language ]
if ! checked {
langtype := enry . GetLanguageType ( language )
included = langtype == enry . Programming || langtype == enry . Markup
includedLanguage [ language ] = included
}
if included {
sizes [ language ] += f . Size
} else if len ( sizes ) == 0 && ( firstExcludedLanguage == "" || firstExcludedLanguage == language ) {
firstExcludedLanguage = language
firstExcludedLanguageSize += f . Size
}
2020-12-17 17:00:47 +03:00
return nil
} )
if err != nil {
return nil , err
}
2022-10-29 10:04:21 +03:00
// If there are no included languages add the first excluded language
if len ( sizes ) == 0 && firstExcludedLanguage != "" {
sizes [ firstExcludedLanguage ] = firstExcludedLanguageSize
2020-12-17 17:00:47 +03:00
}
return sizes , nil
}
func readFile ( f * object . File , limit int64 ) ( [ ] byte , error ) {
r , err := f . Reader ( )
if err != nil {
return nil , err
}
defer r . Close ( )
if limit <= 0 {
2021-09-22 08:38:34 +03:00
return io . ReadAll ( r )
2020-12-17 17:00:47 +03:00
}
size := f . Size
if limit > 0 && size > limit {
size = limit
}
buf := bytes . NewBuffer ( nil )
buf . Grow ( int ( size ) )
_ , err = io . Copy ( buf , io . LimitReader ( r , limit ) )
return buf . Bytes ( ) , err
}