Skip to content

Commit

Permalink
Add test case for IndexHnswDenseVectors (#2254)
Browse files Browse the repository at this point in the history
+ Add custom log appender to intercept and verify logging output
+ Clean up IndexInvertedDenseVectors
+ Fix code coverage (wasn't running properly before)
+ Other light clean-up
  • Loading branch information
lintool authored Nov 10, 2023
1 parent c6c5583 commit 34b5437
Show file tree
Hide file tree
Showing 11 changed files with 286 additions and 79 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ jobs:
run: mvn -B package --file pom.xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
# https://github.com/codecov/codecov-action
with:
token: ${{ secrets.CODECOV_TOKEN }}
3 changes: 2 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@
<artifactId>maven-surefire-plugin</artifactId>
<version>3.1.0</version>
<configuration>
<argLine>-Xmx8192m</argLine>
<!-- Need to add @{argLine} for JaCoCo to work; see https://stackoverflow.com/questions/71568474/getting-skipping-jacoco-execution-due-to-missing-execution-data-file-with-jaco -->
<argLine>@{argLine} -Xmx8192m</argLine>
</configuration>
</plugin>
<plugin>
Expand Down
14 changes: 7 additions & 7 deletions src/main/java/io/anserini/eval/RelevanceJudgments.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
public class RelevanceJudgments {
final private Map<String, Map<String, Integer>> qrels;
final private static String CACHE_DIR = Paths.get(System.getProperty("user.home"), "/.cache/anserini/topics-and-qrels").toString();
final private static String CLOUD_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";
final private static String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";

public static RelevanceJudgments fromQrels(Qrels qrels) throws IOException {
return new RelevanceJudgments("src/main/resources/" + qrels.path);
Expand All @@ -46,7 +46,7 @@ public RelevanceJudgments(String file) throws IOException {
try {
qrelsPath = getQrelsPath(qrelsPath);
} catch (IOException e) {
System.out.println("Qrels file not found at " + qrelsPath.toString());
System.out.println("Qrels file not found at " + qrelsPath);
}

try (BufferedReader br = new BufferedReader(new FileReader(qrelsPath.toString()))) {
Expand All @@ -66,7 +66,7 @@ public RelevanceJudgments(String file) throws IOException {
}
}
} catch (IOException e) {
throw new IOException("Could not read qrels file");
throw new IOException("Could not read qrels file!");
}
}

Expand Down Expand Up @@ -136,7 +136,7 @@ public static String getQrelsResource(Path qrelsPath) throws IOException {
try {
resultPath = getQrelsPath(qrelsPath);
} catch (Exception e) {
throw new IOException("Could not get qrels file from cloud or local file system");
throw new IOException("Could not get qrels file either from server or local file system!");
}

InputStream inputStream = Files.newInputStream(resultPath);
Expand Down Expand Up @@ -184,13 +184,13 @@ public static Path getNewQrelAbsPath(Path qrelsPath) {
* @throws IOException if qrels file is not found
*/
public static Path downloadQrels(Path qrelsPath) throws IOException {
String qrelsURL = CLOUD_PATH + qrelsPath.getFileName().toString().toString();
System.out.println("Downloading qrels from cloud " + qrelsURL.toString());
String qrelsURL = SERVER_PATH + qrelsPath.getFileName().toString();
System.out.println("Downloading qrels from " + qrelsURL);
File qrelsFile = new File(getCacheDir(), qrelsPath.getFileName().toString());
try {
FileUtils.copyURLToFile(new URL(qrelsURL), qrelsFile);
} catch (Exception e) {
System.out.println("Error downloading topics from cloud " + qrelsURL.toString());
System.out.println("Error downloading topics from " + qrelsURL);
throw e;
}
return qrelsFile.toPath();
Expand Down
12 changes: 5 additions & 7 deletions src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
Original file line number Diff line number Diff line change
Expand Up @@ -546,15 +546,13 @@ public Counters run() throws IOException {
}

writer.commit();
System.out.println(String.format("%s docs indexed", counters.indexed.get()));
long space = FileUtils.sizeOfDirectory(indexPath.toFile()) / (1024L * 1024L);
System.out.println(String.format("Index size: %dMB", space));
writer.close();

final long durationMillis =
TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
System.out.println(String.format("Total time: %s",
DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
LOG.info(String.format("Indexing Complete! %,d documents indexed", counters.indexed.get()));

final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
LOG.info(String.format("Total %,d documents indexed in %s", counters.indexed.get(),
DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
}

return counters;
Expand Down
42 changes: 0 additions & 42 deletions src/main/java/io/anserini/index/SimpleIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -280,46 +280,4 @@ public void close(boolean optimize) {
}
}
}

/**
 * Command-line convenience entry point: indexes every document of a {@link JsonCollection}
 * read from {@code args.input} with a {@link SimpleIndexer} configured from the same arguments.
 *
 * <p>If the arguments cannot be parsed, the error, the usage text and an example invocation are
 * written to {@code System.err} and the method returns without indexing anything.
 *
 * @param argv raw command-line arguments, parsed into {@code Args}
 * @throws Exception if reading the collection, adding a document or closing a resource fails
 */
// Main here exists only as a convenience. Once we're happy with the APIs, there should *not* be
// a separate code entry point; one less thing to debug, one less thing to go wrong.
public static void main(String[] argv) throws Exception {
  Args args = new Args();
  CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

  try {
    parser.parseArgument(argv);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: " + SimpleIndexer.class.getSimpleName() +
        parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }

  final long start = System.nanoTime();
  JsonCollection collection = new JsonCollection(Paths.get(args.input));

  int cnt = 0;
  SimpleIndexer indexer = new SimpleIndexer(args);

  LOG.info("input: " + args.input);
  // args.index is the index location, not the collection, so label it accordingly.
  LOG.info("index: " + args.index);

  for (FileSegment<JsonCollection.Document> segment : collection) {
    try {
      for (JsonCollection.Document doc : segment) {
        indexer.addRawDocument(doc.raw());
        cnt++;
        // Progress report every 100,000 documents.
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " docs indexed");
        }
      }
    } finally {
      // Release the segment even when adding a document fails part-way through.
      segment.close();
    }
  }

  indexer.close();
  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info(String.format("Total %,d documents indexed in %s", cnt,
      DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
}
}
8 changes: 4 additions & 4 deletions src/main/java/io/anserini/search/topicreader/TopicReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public abstract class TopicReader<K> {
protected final int BUFFER_SIZE = 1 << 16; // 64K
protected Path topicFile;
final private static String CACHE_DIR = Paths.get(System.getProperty("user.home"), "/.cache/anserini/topics-and-qrels").toString();
final private static String CLOUD_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";
final private static String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";


static private final Map<String, Class<? extends TopicReader>> TOPIC_FILE_TO_TYPE = new HashMap<>();
Expand Down Expand Up @@ -232,13 +232,13 @@ private static String getCacheDir() {
* @throws IOException if error encountered downloading topics
*/
public static Path getTopicsFromCloud(Path topicPath) throws IOException{
String topicURL = CLOUD_PATH + topicPath.getFileName().toString();
System.out.println("Downloading topics from cloud " + topicURL.toString());
String topicURL = SERVER_PATH + topicPath.getFileName().toString();
System.out.println("Downloading topics from " + topicURL);
File topicFile = new File(getCacheDir(), topicPath.getFileName().toString());
try{
FileUtils.copyURLToFile(new URL(topicURL), topicFile);
}catch (Exception e){
System.out.println("Error downloading topics from cloud " + topicURL.toString());
System.out.println("Error downloading topics from " + topicURL);
throw e;
}
return topicFile.toPath();
Expand Down
37 changes: 37 additions & 0 deletions src/test/java/io/anserini/CustomAppender.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini;

import org.apache.logging.log4j.core.LogEvent;
import org.apache.logging.log4j.core.appender.AbstractAppender;

/**
 * Minimal log4j appender for tests: it remembers the formatted text of the most recent log
 * event it received, so a test can assert on what the code under test logged.
 *
 * <p>Only the latest message is kept; every earlier message is overwritten.
 */
public class CustomAppender extends AbstractAppender {
  // volatile because append() runs on whichever thread emits the log event, while
  // getLastLog() is typically read from the test thread.
  private volatile String lastLog = null;

  /**
   * Creates an appender with no filter, no layout and no properties that ignores exceptions
   * raised while appending.
   *
   * @param name the appender name
   */
  public CustomAppender(String name) {
    super(name, null, null, true, null);
  }

  /**
   * Returns the formatted message of the most recent log event.
   *
   * @return the last captured message, or {@code null} if no event has been appended yet
   */
  public String getLastLog() {
    return lastLog;
  }

  /**
   * Records the formatted message of {@code event}, replacing any previously captured message.
   *
   * @param event the log event delivered by log4j
   */
  @Override
  public void append(LogEvent event) {
    lastLog = event.getMessage().getFormattedMessage();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ public void setUp() throws Exception {

@Override
void checkDocument(SourceDocument doc, Map<String, String> expected) {
// System.out.println(((Iso19115Collection.Document) doc).getThesaurusName());
assertTrue(doc.indexable());
assertEquals(expected.get("id"), doc.id());
assertEquals(expected.get("title"), ((Iso19115Collection.Document) doc).getTitle());
Expand Down
78 changes: 78 additions & 0 deletions src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

import io.anserini.CustomAppender;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.config.Configurator;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import java.util.LinkedList;
import java.util.List;

import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link IndexHnswDenseVectors}.
 *
 * <p>The test attaches a {@link CustomAppender} to the logger named after
 * {@link IndexHnswDenseVectors}, runs the indexer's {@code main} and verifies the last
 * message captured by the appender.
 */
public class IndexHnswDenseVectorsTest {
  private static final Logger LOGGER = LogManager.getLogger(IndexHnswDenseVectors.class);
  private static CustomAppender APPENDER;

  /**
   * Installs the capturing appender and sets the logger level to INFO so that INFO-level
   * messages reach it.
   */
  @BeforeClass
  public static void setupClass() {
    APPENDER = new CustomAppender("CustomAppender");
    APPENDER.start();

    // The log4j API Logger has no addAppender(); the core implementation does.
    ((org.apache.logging.log4j.core.Logger) LOGGER).addAppender(APPENDER);

    Configurator.setLevel(IndexHnswDenseVectors.class.getName(), Level.INFO);
  }

  /**
   * Indexes the sample dense-vector collection into a fresh directory under {@code target/}
   * and checks that the last logged message reports 100 indexed documents.
   */
  @Test
  public void test1() throws Exception {
    List<String> args = new LinkedList<>();
    args.add("-collection");
    args.add("JsonDenseVectorCollection");
    args.add("-input");
    args.add("src/test/resources/sample_docs/openai_ada2/json_vector");
    args.add("-index");
    // Timestamp suffix keeps repeated runs from writing into the same index directory.
    args.add("target/idx-sample-hnsw" + System.currentTimeMillis());
    args.add("-generator");
    args.add("LuceneDenseVectorDocumentGenerator");
    args.add("-threads");
    args.add("1");
    args.add("-M");
    args.add("16");
    args.add("-efC");
    args.add("100");

    IndexHnswDenseVectors.main(args.toArray(new String[0]));

    // Fail with the captured text (or null) instead of a bare NullPointerException when
    // nothing was logged.
    String lastLog = APPENDER.getLastLog();
    assertTrue("Expected last log message to report 100 indexed documents, but was: " + lastLog,
        lastLog != null && lastLog.contains("Total 100 documents indexed"));
  }

  /**
   * Detaches and stops the capturing appender installed by {@link #setupClass()}.
   */
  @AfterClass
  public static void teardownClass() {
    if (APPENDER != null) {
      ((org.apache.logging.log4j.core.Logger) LOGGER).removeAppender(APPENDER);
      APPENDER.stop();
    }
  }
}
Loading

0 comments on commit 34b5437

Please sign in to comment.