From 9c1a26ba49f1edfa9a745aa75359a53dd1c0c490 Mon Sep 17 00:00:00 2001
From: Djoerd Hiemstra
Date: Mon, 23 Jan 2023 11:08:01 +0100
Subject: [PATCH] counts unique terms if not available in Lucene

---
 .../java/io/anserini/index/IndexReaderUtils.java | 28 ++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
index 6211e6432..07f071c1b 100755
--- a/src/main/java/io/anserini/index/IndexReaderUtils.java
+++ b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -773,6 +773,32 @@ public static String convertLuceneDocidToDocid(IndexReader reader, int docid) {
     }
   }
 
+  /**
+   * Internal helper: counts the number of unique terms in the contents field.
+   *
+   * @param reader index reader
+   * @return number of unique terms, or 0 if no terms are available for the field
+   * @throws IOException if reading the terms from the index fails
+   */
+  private static long getUniqueTerms(IndexReader reader) throws IOException {
+    Terms terms = MultiTerms.getTerms(reader, Constants.CONTENTS);
+    if (terms == null) {
+      // No terms for this field (e.g., an empty index): report zero instead of throwing an NPE.
+      return 0;
+    }
+
+    long uniqueTerms = terms.size();
+    if (uniqueTerms == -1) {
+      // terms.size() may not be available in Lucene; fall back to enumerating the terms.
+      uniqueTerms = 0;
+      TermsEnum it = terms.iterator();
+      while (it.next() != null) {
+        uniqueTerms++;
+      }
+    }
+    return uniqueTerms;
+  }
+
   /**
    * Returns index statistics.
    *
@@ -786,7 +812,7 @@ public static Map getIndexStats(IndexReader reader) {
 
       indexStats.put("documents", reader.numDocs());
       indexStats.put("non_empty_documents", reader.getDocCount(Constants.CONTENTS));
-      indexStats.put("unique_terms", terms.size());
+      indexStats.put("unique_terms", getUniqueTerms(reader));
       indexStats.put("total_terms", reader.getSumTotalTermFreq(Constants.CONTENTS));
     } catch (IOException e) {
       // Eat any exceptions and just return null.