2020-12-17 17:00:47 +03:00
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
2021-08-24 19:47:09 +03:00
//go:build !gogit
2020-12-17 17:00:47 +03:00
// +build !gogit
package git
import (
2021-02-17 22:32:47 +03:00
"bufio"
2020-12-17 17:00:47 +03:00
"bytes"
2021-09-09 23:13:36 +03:00
"context"
2020-12-17 17:00:47 +03:00
"io"
2021-02-17 22:32:47 +03:00
"math"
2021-09-22 08:38:34 +03:00
"os"
2020-12-17 17:00:47 +03:00
"code.gitea.io/gitea/modules/analyze"
2021-06-25 19:54:08 +03:00
"code.gitea.io/gitea/modules/log"
2021-09-20 22:46:51 +03:00
"code.gitea.io/gitea/modules/util"
2020-12-17 17:00:47 +03:00
"github.com/go-enry/go-enry/v2"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func ( repo * Repository ) GetLanguageStats ( commitID string ) ( map [ string ] int64 , error ) {
2021-02-17 22:32:47 +03:00
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
// so let's create a batch stdin and stdout
2021-05-10 04:27:03 +03:00
batchStdinWriter , batchReader , cancel := repo . CatFileBatch ( )
2021-03-04 05:57:01 +03:00
defer cancel ( )
2021-02-17 22:32:47 +03:00
writeID := func ( id string ) error {
2021-06-17 01:16:47 +03:00
_ , err := batchStdinWriter . Write ( [ ] byte ( id + "\n" ) )
2021-02-17 22:32:47 +03:00
return err
}
if err := writeID ( commitID ) ; err != nil {
return nil , err
}
shaBytes , typ , size , err := ReadBatchLine ( batchReader )
if typ != "commit" {
2021-06-25 19:54:08 +03:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2021-02-17 22:32:47 +03:00
return nil , ErrNotExist { commitID , "" }
}
sha , err := NewIDFromString ( string ( shaBytes ) )
2020-12-17 17:00:47 +03:00
if err != nil {
2021-06-25 19:54:08 +03:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2021-02-17 22:32:47 +03:00
return nil , ErrNotExist { commitID , "" }
}
commit , err := CommitFromReader ( repo , sha , io . LimitReader ( batchReader , size ) )
if err != nil {
2021-06-25 19:54:08 +03:00
log . Debug ( "Unable to get commit for: %s. Err: %v" , commitID , err )
2020-12-17 17:00:47 +03:00
return nil , err
}
2021-06-21 01:00:46 +03:00
if _ , err = batchReader . Discard ( 1 ) ; err != nil {
return nil , err
}
2020-12-17 17:00:47 +03:00
tree := commit . Tree
entries , err := tree . ListEntriesRecursive ( )
if err != nil {
return nil , err
}
2021-09-09 23:13:36 +03:00
var checker * CheckAttributeReader
if CheckGitVersionAtLeast ( "1.7.8" ) == nil {
indexFilename , deleteTemporaryFile , err := repo . ReadTreeToTemporaryIndex ( commitID )
if err == nil {
defer deleteTemporaryFile ( )
2021-09-22 08:38:34 +03:00
tmpWorkTree , err := os . MkdirTemp ( "" , "empty-work-dir" )
2021-09-20 22:46:51 +03:00
if err == nil {
defer func ( ) {
_ = util . RemoveAll ( tmpWorkTree )
2021-09-09 23:13:36 +03:00
} ( )
2021-09-20 22:46:51 +03:00
checker = & CheckAttributeReader {
Attributes : [ ] string { "linguist-vendored" , "linguist-generated" , "linguist-language" } ,
Repo : repo ,
IndexFile : indexFilename ,
WorkTree : tmpWorkTree ,
}
ctx , cancel := context . WithCancel ( DefaultContext )
if err := checker . Init ( ctx ) ; err != nil {
log . Error ( "Unable to open checker for %s. Error: %v" , commitID , err )
} else {
go func ( ) {
err = checker . Run ( )
if err != nil {
log . Error ( "Unable to open checker for %s. Error: %v" , commitID , err )
cancel ( )
}
} ( )
}
defer cancel ( )
2021-09-09 23:13:36 +03:00
}
}
}
2021-02-17 22:32:47 +03:00
contentBuf := bytes . Buffer { }
var content [ ] byte
2020-12-17 17:00:47 +03:00
sizes := make ( map [ string ] int64 )
for _ , f := range entries {
2021-02-17 22:32:47 +03:00
contentBuf . Reset ( )
content = contentBuf . Bytes ( )
2021-09-09 23:13:36 +03:00
if f . Size ( ) == 0 {
continue
}
notVendored := false
notGenerated := false
if checker != nil {
attrs , err := checker . CheckPath ( f . Name ( ) )
if err == nil {
if vendored , has := attrs [ "linguist-vendored" ] ; has {
if vendored == "set" || vendored == "true" {
continue
}
notVendored = vendored == "false"
}
if generated , has := attrs [ "linguist-generated" ] ; has {
if generated == "set" || generated == "true" {
continue
}
notGenerated = generated == "false"
}
if language , has := attrs [ "linguist-language" ] ; has && language != "unspecified" && language != "" {
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
2021-09-20 22:46:51 +03:00
if len ( group ) != 0 {
2021-09-09 23:13:36 +03:00
language = group
}
sizes [ language ] += f . Size ( )
continue
}
}
}
if ( ! notVendored && analyze . IsVendor ( f . Name ( ) ) ) || enry . IsDotFile ( f . Name ( ) ) ||
2020-12-17 17:00:47 +03:00
enry . IsDocumentation ( f . Name ( ) ) || enry . IsConfiguration ( f . Name ( ) ) {
continue
}
// If content can not be read or file is too big just do detection by filename
2021-02-17 22:32:47 +03:00
2020-12-17 17:00:47 +03:00
if f . Size ( ) <= bigFileSize {
2021-02-17 22:32:47 +03:00
if err := writeID ( f . ID . String ( ) ) ; err != nil {
return nil , err
}
_ , _ , size , err := ReadBatchLine ( batchReader )
if err != nil {
2021-06-25 19:54:08 +03:00
log . Debug ( "Error reading blob: %s Err: %v" , f . ID . String ( ) , err )
2021-02-17 22:32:47 +03:00
return nil , err
}
sizeToRead := size
2021-06-17 01:16:47 +03:00
discard := int64 ( 1 )
2021-02-17 22:32:47 +03:00
if size > fileSizeLimit {
sizeToRead = fileSizeLimit
2021-06-17 01:16:47 +03:00
discard = size - fileSizeLimit + 1
2021-02-17 22:32:47 +03:00
}
_ , err = contentBuf . ReadFrom ( io . LimitReader ( batchReader , sizeToRead ) )
if err != nil {
return nil , err
}
content = contentBuf . Bytes ( )
err = discardFull ( batchReader , discard )
if err != nil {
return nil , err
}
2020-12-17 17:00:47 +03:00
}
2021-09-09 23:13:36 +03:00
if ! notGenerated && enry . IsGenerated ( f . Name ( ) , content ) {
2020-12-17 17:00:47 +03:00
continue
}
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze . GetCodeLanguage ( f . Name ( ) , content )
if language == enry . OtherLanguage || language == "" {
continue
}
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry . GetLanguageGroup ( language )
if group != "" {
language = group
}
sizes [ language ] += f . Size ( )
continue
}
// filter special languages unless they are the only language
if len ( sizes ) > 1 {
for language := range sizes {
langtype := enry . GetLanguageType ( language )
if langtype != enry . Programming && langtype != enry . Markup {
delete ( sizes , language )
}
}
}
return sizes , nil
}
2021-02-17 22:32:47 +03:00
func discardFull ( rd * bufio . Reader , discard int64 ) error {
if discard > math . MaxInt32 {
n , err := rd . Discard ( math . MaxInt32 )
discard -= int64 ( n )
if err != nil {
return err
}
2020-12-17 17:00:47 +03:00
}
2021-02-17 22:32:47 +03:00
for discard > 0 {
n , err := rd . Discard ( int ( discard ) )
discard -= int64 ( n )
if err != nil {
return err
}
2020-12-17 17:00:47 +03:00
}
2021-02-17 22:32:47 +03:00
return nil
2020-12-17 17:00:47 +03:00
}