Skip to content

Commit

Permalink
Make date parsing more flexible for linedocsfile (europarl, enwiki) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
dweiss authored Feb 5, 2024
1 parent 9ab84f4 commit 635d090
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,10 @@

import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import java.io.IOException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Random;
import java.util.TimeZone;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
Expand Down Expand Up @@ -164,26 +161,17 @@ protected void createIndex(Directory directory) throws IOException {
conf.setIndexSort(new Sort(new SortField("dateDV", SortField.Type.LONG, true)));
IndexWriter writer = new IndexWriter(directory, conf);
LineFileDocs docs = new LineFileDocs(new Random(0));
SimpleDateFormat parser = new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT);
parser.setTimeZone(TimeZone.getTimeZone("UTC"));
ParsePosition position = new ParsePosition(0);

for (int i = 0; i < 50; i++) {
Document doc = TestUtil.cloneDocument(docs.nextDoc());
String dateString = doc.get("date");
position.setIndex(0);
Date date = parser.parse(dateString, position);
if (position.getErrorIndex() != -1) {
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
}
if (position.getIndex() != dateString.length()) {
throw new AssertionError("failed to parse \"" + dateString + "\" as date");
}
LocalDateTime date = LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply(dateString);
doc.add(
new NumericDocValuesField(
"docid_intDV", doc.getField("docid_int").numericValue().longValue()));
doc.add(
new SortedDocValuesField("titleDV", new BytesRef(doc.getField("title").stringValue())));
doc.add(new NumericDocValuesField("dateDV", date.getTime()));
doc.add(new NumericDocValuesField("dateDV", date.toInstant(ZoneOffset.UTC).toEpochMilli()));
if (i % 10 == 0) { // commit every 10 documents
writer.commit();
}
Expand All @@ -206,9 +194,6 @@ public static void searchExampleIndex(DirectoryReader reader) throws IOException
topDocs = searcher.search(new FieldExistsQuery("titleDV"), 10);
assertEquals(50, topDocs.totalHits.value);

topDocs = searcher.search(new TermQuery(new Term("body", "ja")), 10);
assertTrue(topDocs.totalHits.value > 0);

topDocs =
searcher.search(
IntPoint.newRangeQuery("docid_int", 42, 44),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,16 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
Expand All @@ -53,6 +59,35 @@
* created by benchmark's WriteLineDoc task
*/
public class LineFileDocs implements Closeable {
/**
 * Converts the raw {@code date} field value of a line-file document into a {@link
 * LocalDateTime}.
 *
 * <p>Two input shapes are recognized:
 *
 * <ul>
 *   <li>europarl, date only, e.g. {@code "2023-02-23"}; the result is the start of that day.
 *   <li>enwiki, date and time with an optional millisecond part, e.g. {@code "12-JAN-2010
 *       12:32:45.000"}.
 * </ul>
 *
 * <p>Parsing is strict, month names are matched case-insensitively and {@link Locale#ROOT} is
 * used for both formats. Any value that is not shaped like a europarl date is handed to the
 * enwiki formatter; input that the selected formatter cannot parse fails with a {@link
 * java.time.format.DateTimeParseException}.
 */
public static final Function<String, LocalDateTime> DATE_FIELD_VALUE_TO_LOCALDATETIME =
    new Function<>() {
      // Compiled once: String#matches would recompile the expression for every document.
      // Matcher#matches already requires the whole input to match, so no ^/$ anchors are needed.
      final Pattern euroParlShape = Pattern.compile("[0-9]{4}-[0-9]{2}-[0-9]{2}");

      final DateTimeFormatter euroParl =
          new DateTimeFormatterBuilder()
              .parseStrict()
              .parseCaseInsensitive()
              .appendPattern("uuuu-MM-dd")
              .toFormatter(Locale.ROOT);

      // The bracketed section makes the ".SSS" millisecond suffix optional.
      final DateTimeFormatter enwiki =
          new DateTimeFormatterBuilder()
              .parseStrict()
              .parseCaseInsensitive()
              .appendPattern("dd-MMM-uuuu HH:mm:ss['.'SSS]")
              .toFormatter(Locale.ROOT);

      @Override
      public LocalDateTime apply(String s) {
        if (euroParlShape.matcher(s).matches()) {
          // Date-only input: normalize to midnight so both corpora yield a LocalDateTime.
          return euroParl.parse(s, LocalDate::from).atStartOfDay();
        } else {
          return enwiki.parse(s, LocalDateTime::from);
        }
      }
    };

private BufferedReader reader;
private static final int BUFFER_SIZE = 1 << 16; // 64K
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.tests.util;

import java.time.LocalDateTime;

/** Tests for the helpers exposed by {@link LineFileDocs}. */
public class TestLineFileDocs extends LuceneTestCase {
  /**
   * Tests that {@link LineFileDocs#DATE_FIELD_VALUE_TO_LOCALDATETIME} converts both supported
   * date formats (date-only europarl values and date-time enwiki values) into the expected
   * {@link LocalDateTime}.
   */
  public void testDateFieldNormalization() {
    // europarl corpus uses this date format; date-only values map to the start of the day.
    assertEquals(
        LocalDateTime.of(2023, 2, 23, 0, 0),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("2023-02-23"));
    // enwiki uses this date format.
    assertEquals(
        LocalDateTime.of(2010, 1, 12, 12, 32, 45),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-JAN-2010 12:32:45.000"));
    // the millisecond part of the enwiki format is optional.
    assertEquals(
        LocalDateTime.of(2010, 1, 12, 12, 32, 45),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-JAN-2010 12:32:45"));
    // month names are matched case-insensitively.
    assertEquals(
        LocalDateTime.of(2010, 1, 12, 12, 32, 45),
        LineFileDocs.DATE_FIELD_VALUE_TO_LOCALDATETIME.apply("12-Jan-2010 12:32:45.000"));
  }
}

0 comments on commit 635d090

Please sign in to comment.