// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

//go:build !gogit

package git
import (
"bytes"
"io"
"code.gitea.io/gitea/modules/analyze"
2021-06-25 18:54:08 +02:00
"code.gitea.io/gitea/modules/log"
2024-02-23 18:24:27 +01:00
"code.gitea.io/gitea/modules/optional"
2020-12-17 14:00:47 +00:00
"github.com/go-enry/go-enry/v2"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func ( repo * Repository ) GetLanguageStats ( commitID string ) ( map [ string ] int64 , error ) {
2021-02-17 19:32:47 +00:00
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
// so let's create a batch stdin and stdout
2024-08-21 01:04:57 +08:00
batchStdinWriter , batchReader , cancel , err := repo . CatFileBatch ( repo . Ctx )
if err != nil {
return nil , err
}
2021-03-04 02:57:01 +00:00
defer cancel ( )
2021-02-17 19:32:47 +00:00
writeID := func ( id string ) error {
2021-06-16 23:16:47 +01:00
_ , err := batchStdinWriter . Write ( [ ] byte ( id + "\n" ) )
2021-02-17 19:32:47 +00:00
return err
}
if err := writeID ( commitID ) ; err != nil {
return nil , err
}
shaBytes , typ , size , err := ReadBatchLine ( batchReader )
if typ != "commit" {
2021-06-25 18:54:08 +02:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2021-02-17 19:32:47 +00:00
return nil , ErrNotExist { commitID , "" }
}
2023-12-19 15:20:47 +08:00
sha , err := NewIDFromString ( string ( shaBytes ) )
2020-12-17 14:00:47 +00:00
if err != nil {
2021-06-25 18:54:08 +02:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2021-02-17 19:32:47 +00:00
return nil , ErrNotExist { commitID , "" }
}
commit , err := CommitFromReader ( repo , sha , io . LimitReader ( batchReader , size ) )
if err != nil {
2021-06-25 18:54:08 +02:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2020-12-17 14:00:47 +00:00
return nil , err
}
2021-06-20 23:00:46 +01:00
if _ , err = batchReader . Discard ( 1 ) ; err != nil {
return nil , err
}
2020-12-17 14:00:47 +00:00
tree := commit . Tree
2022-10-08 01:20:53 +08:00
entries , err := tree . ListEntriesRecursiveWithSize ( )
2020-12-17 14:00:47 +00:00
if err != nil {
return nil , err
}
2022-06-16 23:47:44 +08:00
checker , deferable := repo . CheckAttributeReader ( commitID )
defer deferable ( )
2021-09-09 21:13:36 +01:00
2021-02-17 19:32:47 +00:00
contentBuf := bytes . Buffer { }
var content [ ] byte
2022-10-29 09:04:21 +02:00
// sizes contains the current calculated size of all files by language
2020-12-17 14:00:47 +00:00
sizes := make ( map [ string ] int64 )
2022-10-29 09:04:21 +02:00
// by default we will only count the sizes of programming languages or markup languages
// unless they are explicitly set using linguist-language
includedLanguage := map [ string ] bool { }
// or if there's only one language in the repository
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64 ( 0 )
2020-12-17 14:00:47 +00:00
for _ , f := range entries {
2021-11-30 20:06:32 +00:00
select {
case <- repo . Ctx . Done ( ) :
return sizes , repo . Ctx . Err ( )
default :
}
2021-02-17 19:32:47 +00:00
contentBuf . Reset ( )
content = contentBuf . Bytes ( )
2021-09-09 21:13:36 +01:00
if f . Size ( ) == 0 {
continue
}
2024-02-23 18:24:27 +01:00
isVendored := optional . None [ bool ] ( )
isGenerated := optional . None [ bool ] ( )
isDocumentation := optional . None [ bool ] ( )
isDetectable := optional . None [ bool ] ( )
2021-09-09 21:13:36 +01:00
if checker != nil {
attrs , err := checker . CheckPath ( f . Name ( ) )
if err == nil {
2024-02-24 19:46:49 +01:00
isVendored = AttributeToBool ( attrs , AttributeLinguistVendored )
2024-02-23 18:24:27 +01:00
if isVendored . ValueOrDefault ( false ) {
continue
}
2024-02-24 19:46:49 +01:00
isGenerated = AttributeToBool ( attrs , AttributeLinguistGenerated )
2024-02-23 18:24:27 +01:00
if isGenerated . ValueOrDefault ( false ) {
continue
2021-09-09 21:13:36 +01:00
}
2024-02-23 18:24:27 +01:00
2024-02-24 19:46:49 +01:00
isDocumentation = AttributeToBool ( attrs , AttributeLinguistDocumentation )
2024-02-23 18:24:27 +01:00
if isDocumentation . ValueOrDefault ( false ) {
continue
}
2024-02-24 19:46:49 +01:00
isDetectable = AttributeToBool ( attrs , AttributeLinguistDetectable )
2024-02-23 18:24:27 +01:00
if ! isDetectable . ValueOrDefault ( true ) {
continue
}
2024-02-24 19:46:49 +01:00
hasLanguage := TryReadLanguageAttribute ( attrs )
2024-02-23 18:24:27 +01:00
if hasLanguage . Value ( ) != "" {
language := hasLanguage . Value ( )
2021-09-09 21:13:36 +01:00
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
2021-09-21 03:46:51 +08:00
if len ( group ) != 0 {
2021-09-09 21:13:36 +01:00
language = group
}
2022-10-29 09:04:21 +02:00
// this language will always be added to the size
2021-09-09 21:13:36 +01:00
sizes [ language ] += f . Size ( )
continue
}
}
}
2024-02-23 18:24:27 +01:00
if ( ! isVendored . Has ( ) && analyze . IsVendor ( f . Name ( ) ) ) ||
enry . IsDotFile ( f . Name ( ) ) ||
( ! isDocumentation . Has ( ) && enry . IsDocumentation ( f . Name ( ) ) ) ||
enry . IsConfiguration ( f . Name ( ) ) {
2020-12-17 14:00:47 +00:00
continue
}
// If content can not be read or file is too big just do detection by filename
2021-02-17 19:32:47 +00:00
2020-12-17 14:00:47 +00:00
if f . Size ( ) <= bigFileSize {
2021-02-17 19:32:47 +00:00
if err := writeID ( f . ID . String ( ) ) ; err != nil {
return nil , err
}
_ , _ , size , err := ReadBatchLine ( batchReader )
if err != nil {
2021-06-25 18:54:08 +02:00
log . Debug ( "Error reading blob: %s Err: %v" , f . ID . String ( ) , err )
2021-02-17 19:32:47 +00:00
return nil , err
}
sizeToRead := size
2021-06-16 23:16:47 +01:00
discard := int64 ( 1 )
2021-02-17 19:32:47 +00:00
if size > fileSizeLimit {
sizeToRead = fileSizeLimit
2021-06-16 23:16:47 +01:00
discard = size - fileSizeLimit + 1
2021-02-17 19:32:47 +00:00
}
_ , err = contentBuf . ReadFrom ( io . LimitReader ( batchReader , sizeToRead ) )
if err != nil {
return nil , err
}
content = contentBuf . Bytes ( )
2024-02-22 04:48:19 +01:00
if err := DiscardFull ( batchReader , discard ) ; err != nil {
2021-02-17 19:32:47 +00:00
return nil , err
}
2020-12-17 14:00:47 +00:00
}
2024-02-23 18:24:27 +01:00
if ! isGenerated . Has ( ) && enry . IsGenerated ( f . Name ( ) , content ) {
2020-12-17 14:00:47 +00:00
continue
}
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze . GetCodeLanguage ( f . Name ( ) , content )
2023-05-25 03:37:36 +08:00
if language == "" {
2020-12-17 14:00:47 +00:00
continue
}
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
if group != "" {
language = group
}
2022-10-29 09:04:21 +02:00
included , checked := includedLanguage [ language ]
if ! checked {
2023-05-25 03:37:36 +08:00
langType := enry . GetLanguageType ( language )
included = langType == enry . Programming || langType == enry . Markup
2022-10-29 09:04:21 +02:00
includedLanguage [ language ] = included
}
2024-02-23 18:24:27 +01:00
if included || isDetectable . ValueOrDefault ( false ) {
2022-10-29 09:04:21 +02:00
sizes [ language ] += f . Size ( )
} else if len ( sizes ) == 0 && ( firstExcludedLanguage == "" || firstExcludedLanguage == language ) {
firstExcludedLanguage = language
firstExcludedLanguageSize += f . Size ( )
}
2020-12-17 14:00:47 +00:00
}
2022-10-29 09:04:21 +02:00
// If there are no included languages add the first excluded language
if len ( sizes ) == 0 && firstExcludedLanguage != "" {
sizes [ firstExcludedLanguage ] = firstExcludedLanguageSize
2020-12-17 14:00:47 +00:00
}
2023-05-25 03:37:36 +08:00
return mergeLanguageStats ( sizes ) , nil
2020-12-17 14:00:47 +00:00
}