From 1c158f9b866148246bad89f5bc6d4876c752681f Mon Sep 17 00:00:00 2001
From: Julie Tibshirani <julietibs@apache.org>
Date: Sun, 11 Feb 2024 18:11:24 -0800
Subject: [PATCH] Don't truncate file before detecting language (#740)

Currently, we truncate a file's contents to 2048 bytes before passing it to
`go-enry`. I ran into a few cases where this truncation causes us to
misclassify files.

This PR removes the truncation. It should still be fine in terms of
performance, since `go-enry` is quite fast in general: ~1ms in my local
testing, even for large files. And we only run language detection if we plan to
index the file, which means we skip binary files and large files.
---
 indexbuilder.go | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/indexbuilder.go b/indexbuilder.go
index 67773c6d7..6a7b5ea1a 100644
--- a/indexbuilder.go
+++ b/indexbuilder.go
@@ -397,12 +397,7 @@ func (b *IndexBuilder) addSymbols(symbols []*Symbol) {
 
 func DetermineLanguageIfUnknown(doc *Document) {
 	if doc.Language == "" {
-		c := doc.Content
-		// classifier is faster on small files without losing much accuracy
-		if len(c) > 2048 {
-			c = c[:2048]
-		}
-		doc.Language = enry.GetLanguage(doc.Name, c)
+		doc.Language = enry.GetLanguage(doc.Name, doc.Content)
 	}
 }