Merge different languages for language stats (#24900) (#24921)

Backport #24900. Fix #24896. If users set languages with different letter case via `linguist-language`, the `stats` map could contain entries such as `java: 100, Java: 200`. Language stats are stored case-insensitively in the database under a unique key, so names that differ only by case must be merged into one unique name: `Java: 300`.
2023-05-25 13:12:21 +08:00 · 2023-05-25 13:12:21 +08:00 · a83d597989
parent c5dee8823c
commit a83d597989
4 changed files with 58 additions and 5 deletions
--- a/modules/git/repo_language_stats.go
+++ b/modules/git/repo_language_stats.go
@ -3,7 +3,46 @@

 package git

+import (
+	"strings"
+	"unicode"
+)
+
 const (
 	fileSizeLimit int64 = 16 * 1024   // 16 KiB
 	bigFileSize   int64 = 1024 * 1024 // 1 MiB
 )
+
+// mergeLanguageStats merges language names that differ only by letter case into a single entry and sums their sizes. The spelling with the most upper-case letters is used as the key of the returned map; the input map is not modified.
+func mergeLanguageStats(stats map[string]int64) map[string]int64 {
+	names := map[string]struct { // keyed by the lower-cased name; holds the preferred spelling seen so far
+		uniqueName string
+		upperCount int
+	}{}
+
+	countUpper := func(s string) (count int) { // number of runes in s for which unicode.IsUpper reports true
+		for _, r := range s {
+			if unicode.IsUpper(r) {
+				count++
+			}
+		}
+		return count
+	}
+
+	for name := range stats {
+		cnt := countUpper(name)
+		lower := strings.ToLower(name)
+		if cnt >= names[lower].upperCount { // a missing entry reads as upperCount 0, so the first spelling always registers; NOTE(review): with ">=" a tie is won by whichever spelling the map range visits last, and Go does not specify that order
+			names[lower] = struct {
+				uniqueName string
+				upperCount int
+			}{uniqueName: name, upperCount: cnt}
+		}
+	}
+
+	res := make(map[string]int64, len(names))
+	for name, num := range stats {
+		res[names[strings.ToLower(name)].uniqueName] += num // accumulate every spelling's size under the chosen spelling
+	}
+	return res
+}
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 	}

-	return sizes, nil
+	return mergeLanguageStats(sizes), nil
 }

 func readFile(f *object.File, limit int64) ([]byte, error) {
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
 		// - eg. do the all the detection tests using filename first before reading content.
 		language := analyze.GetCodeLanguage(f.Name(), content)
-		if language == enry.OtherLanguage || language == "" {
+		if language == "" {
 			continue
 		}

@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err

 		included, checked := includedLanguage[language]
 		if !checked {
-			langtype := enry.GetLanguageType(language)
-			included = langtype == enry.Programming || langtype == enry.Markup
+			langType := enry.GetLanguageType(language)
+			included = langType == enry.Programming || langType == enry.Markup
 			includedLanguage[language] = included
 		}
 		if included {
@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 	}

-	return sizes, nil
+	return mergeLanguageStats(sizes), nil
 }

 func discardFull(rd *bufio.Reader, discard int64) error {
--- a/modules/git/repo_language_stats_test.go
+++ b/modules/git/repo_language_stats_test.go
@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
 		"Java":   112,
 	}, stats)
 }
+
+func TestMergeLanguageStats(t *testing.T) { // checks that spellings differing only by case collapse into one summed entry
+	assert.EqualValues(t, map[string]int64{
+		"PHP":    1,
+		"python": 10,  // the only spelling of this name, so it is kept unchanged
+		"JAVA":   700, // 100 + 200 + 400, keyed by the spelling with the most upper-case letters
+	}, mergeLanguageStats(map[string]int64{
+		"PHP":    1,
+		"python": 10,
+		"Java":   100,
+		"java":   200,
+		"JAVA":   400,
+	}))
+}