Skip to content

Commit

Permalink
#4 special support for char class search
Browse files Browse the repository at this point in the history
  • Loading branch information
almondtools committed Nov 26, 2016
1 parent 5e956ca commit 6e22179
Show file tree
Hide file tree
Showing 12 changed files with 491 additions and 101 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ public StringMatch findNext() {
if (j <= 0) {
bytes.next();
} else {
bytes.forward(j + 1);
bytes.forward(j + 2);
}
}
return null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package net.amygdalum.stringsearchalgorithms.search.bytes;

import java.util.LinkedHashSet;
import java.util.Set;

import net.amygdalum.util.map.ByteObjectMap;

public class TrieNode<T> {
Expand All @@ -17,6 +20,12 @@ public TrieNode() {
this.max = 3;
}

/**
 * Discards all successor links of this node by installing a fresh
 * four-slot successor array and restoring the index bounds to 0 and 3.
 */
public void reset() {
    min = 0;
    max = 3;
    nexts = trieNodes(4);
}

@SuppressWarnings("unchecked")
private static <T> TrieNode<T>[] trieNodes(int len) {
return new TrieNode[len];
Expand Down Expand Up @@ -137,6 +146,23 @@ public TrieNode<T> nextNode(byte[] bytes) {
return current;
}

/**
 * Collects this node together with every node reachable from it through
 * successor links.
 *
 * @return the reachable nodes in depth-first discovery order, beginning with this node
 */
public Set<TrieNode<T>> nodes() {
    Set<TrieNode<T>> nodes = new LinkedHashSet<>();
    collectNodes(nodes);
    return nodes;
}

/**
 * Adds this node to {@code nodes} and then recurses into each successor.
 *
 * @param nodes accumulator of the nodes seen so far; it doubles as the visited
 *              marker, so a node reachable along several paths is expanded only once
 */
private void collectNodes(Set<TrieNode<T>> nodes) {
    // Set.add reports false for an element that is already present, so a
    // single call both tests for and records the visit.
    if (!nodes.add(this)) {
        return;
    }
    for (ByteObjectMap<TrieNode<T>>.Entry entry : getNexts().cursor()) {
        entry.value.collectNodes(nodes);
    }
}

@Override
public String toString() {
if (attached != null) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package net.amygdalum.stringsearchalgorithms.search.chars;

import static java.lang.Math.max;
import static java.util.Arrays.fill;
import static net.amygdalum.util.text.CharUtils.computeMaxChar;
import static net.amygdalum.util.text.CharUtils.computeMinChar;

import java.util.Arrays;

Expand All @@ -12,6 +11,8 @@
import net.amygdalum.stringsearchalgorithms.search.StringFinderOption;
import net.amygdalum.stringsearchalgorithms.search.StringMatch;
import net.amygdalum.util.map.CharLongMap;
import net.amygdalum.util.text.CharAlphabet;
import net.amygdalum.util.text.CharMapping;

/**
* An implementation of the String Search Algorithm BNDM (Backward Nondeterministic Dawg Matching).
Expand All @@ -24,32 +25,32 @@ public class BNDM implements StringSearchAlgorithm {
private BitMapStates states;

/**
 * Creates a matcher for {@code pattern} without character classes by
 * delegating to the two-argument constructor with {@link CharMapping#IDENTITY}.
 *
 * @param pattern the pattern to search for
 */
public BNDM(String pattern) {
this(pattern, CharMapping.IDENTITY);
}

/**
 * Creates a matcher for {@code pattern} whose characters are expanded through
 * {@code mapping} when the state tables are built.
 *
 * @param pattern the pattern to search for
 * @param mapping the character mapping handed to {@code computeStates}
 */
public BNDM(String pattern, CharMapping mapping) {
this.patternLength = pattern.length();
// NOTE(review): this text is a commit diff rendered without +/- markers; the
// one-argument call below looks like the removed line and the two-argument
// call after it like its replacement - confirm against the repository.
this.states = computeStates(pattern.toCharArray());
this.states = computeStates(pattern.toCharArray(), mapping);
}

/**
 * Selects the state-table implementation for {@code pattern}.
 *
 * The "Quick" variants are chosen when the alphabet range is below
 * {@code max(256, pattern.length * 2)}, the "Smart" variants otherwise; the
 * "Multi" variants are chosen for patterns longer than 64 characters.
 *
 * NOTE(review): old and new diff lines are interleaved here (two signatures,
 * paired return statements); the variants taking a {@code CharMapping} appear
 * to be the post-commit code - confirm against the repository.
 */
private static BitMapStates computeStates(char[] pattern) {
if (isCompactRange(pattern)) {
private static BitMapStates computeStates(char[] pattern, CharMapping mapping) {
CharAlphabet alphabet = CharAlphabet.ranged(pattern, mapping);
// threshold below which the alphabet range counts as compact
int compactSize = max(256, pattern.length * 2);
if (alphabet.getRange() < compactSize) {
// a single long offers 64 bits, one per pattern position
if (pattern.length > 64) {
return new QuickMultiLongStates(pattern);
return new QuickMultiLongStates(pattern, alphabet, mapping);
} else {
return new QuickSingleLongStates(pattern);
return new QuickSingleLongStates(pattern, alphabet, mapping);
}
} else {
if (pattern.length > 64) {
return new SmartMultiLongStates(pattern);
return new SmartMultiLongStates(pattern, mapping);
} else {
return new SmartSingleLongStates(pattern);
return new SmartSingleLongStates(pattern, mapping);
}
}
}

/**
 * Decides whether the characters of {@code pattern} lie close enough together
 * to be treated as a compact range.
 *
 * @param pattern the pattern characters to inspect
 * @return {@code true} if the distance between the largest and the smallest
 *         character is below 256 or below twice the pattern length
 */
private static boolean isCompactRange(char[] pattern) {
    char lowest = computeMinChar(pattern);
    char highest = computeMaxChar(pattern);
    int spread = highest - lowest;
    if (spread < 256) {
        return true;
    }
    return spread < pattern.length * 2;
}

@Override
public int getPatternLength() {
return patternLength;
Expand Down Expand Up @@ -237,11 +238,23 @@ private int max(int[] values, int last) {

}

/**
 * Factory producing {@link BNDM} instances, optionally configured with a
 * character mapping through {@code withCharClasses}.
 *
 * NOTE(review): two class headers and two return paths appear because old and
 * new diff lines are interleaved; the header implementing
 * {@code SupportsCharClasses} looks like the post-commit one - confirm.
 */
public static class Factory implements StringSearchAlgorithmFactory {
public static class Factory implements StringSearchAlgorithmFactory, SupportsCharClasses<Factory> {

// mapping passed to created algorithms; stays null until withCharClasses is called
private CharMapping mapping;

/**
 * Stores {@code mapping} for subsequently created algorithms.
 *
 * @param mapping the character mapping to use
 * @return this factory
 */
@Override
public Factory withCharClasses(CharMapping mapping) {
this.mapping = mapping;
return this;
}

/**
 * Creates a BNDM algorithm for {@code pattern}, passing the stored mapping
 * along when one has been set.
 *
 * @param pattern the pattern to search for
 * @return the new algorithm instance
 */
@Override
public StringSearchAlgorithm of(String pattern) {
return new BNDM(pattern);
if (mapping == null) {
return new BNDM(pattern);
} else {
return new BNDM(pattern, mapping);
}
}

}
Expand Down Expand Up @@ -279,18 +292,19 @@ private static class QuickSingleLongStates extends SingleLongBitMapStates {
private char maxChar;
private long[] characters;

/**
 * Records the smallest and largest character and builds the bit-mask table
 * indexed by {@code c - minChar}.
 *
 * NOTE(review): the header and three assignments using
 * {@code computeMinChar}/{@code computeMaxChar} look like the removed diff
 * lines, the ones reading from {@code alphabet} like their replacements - confirm.
 */
public QuickSingleLongStates(char[] pattern) {
this.minChar = computeMinChar(pattern);
this.maxChar = computeMaxChar(pattern);
this.characters = computeStates(pattern, this.minChar, this.maxChar);
public QuickSingleLongStates(char[] pattern, CharAlphabet alphabet, CharMapping mapping) {
this.minChar = alphabet.minChar();
this.maxChar = alphabet.maxChar();
this.characters = computeStates(pattern, mapping, this.minChar, this.maxChar);
}

/**
 * Builds one bit mask per character in {@code [min, max]}: bit {@code j} of
 * the entry at index {@code c - min} is set when {@code c} is accepted at
 * pattern index {@code i}, where {@code j = pattern.length - i - 1} (the last
 * pattern position owns bit 0).
 *
 * NOTE(review): old and new diff lines are interleaved; the signature with
 * {@code mapping} and the inner loop over {@code mapping.map(...)} look like
 * the post-commit code - confirm.
 */
private static long[] computeStates(char[] pattern, char min, char max) {
private static long[] computeStates(char[] pattern, CharMapping mapping, char min, char max) {
long[] characters = new long[max - min + 1];
for (int i = 0; i < pattern.length; i++) {
char c = pattern[i];
int j = pattern.length - i - 1;
characters[c - min] |= 1l << j;
// presumably every mapped character lies within [min, max]; verify against CharAlphabet.ranged
for (char c : mapping.map(pattern[i])) {
characters[c - min] |= 1l << j;
}
}
return characters;
}
Expand All @@ -309,17 +323,18 @@ private static class SmartSingleLongStates extends SingleLongBitMapStates {

private CharLongMap states;

/**
 * Builds the map-backed bit-mask table for {@code pattern}.
 *
 * NOTE(review): the one-argument header and call look like the removed diff
 * lines, the versions taking {@code mapping} like their replacements - confirm.
 */
public SmartSingleLongStates(char[] pattern) {
this.states = computeStates(pattern);
public SmartSingleLongStates(char[] pattern, CharMapping mapping) {
this.states = computeStates(pattern, mapping);
}

/**
 * Builds a map from character to bit mask: bit {@code j} of a character's
 * mask is set when that character is accepted at pattern index {@code i},
 * where {@code j = pattern.length - i - 1}.
 *
 * NOTE(review): old and new diff lines are interleaved; the signature with
 * {@code mapping} and the inner loop over {@code mapping.map(...)} look like
 * the post-commit code - confirm.
 */
private static CharLongMap computeStates(char[] pattern) {
private static CharLongMap computeStates(char[] pattern, CharMapping mapping) {
// presumably 0l is the value returned for characters never put - confirm in CharLongMap
CharLongMap map = new CharLongMap(0l);
for (int i = 0; i < pattern.length; i++) {
char c = pattern[i];
int j = pattern.length - i - 1;
long newState = map.get(c) | (1l << j);
map.put(c, newState);
for (char c : mapping.map(pattern[i])) {
// merge the new position bit into whatever mask the character already has
long newState = map.get(c) | (1l << j);
map.put(c, newState);
}
}
return map;
}
Expand Down Expand Up @@ -351,30 +366,31 @@ private static class QuickMultiLongStates extends MultiLongBitMapStates {
private char maxChar;
private long[][] characters;

/**
 * Records the smallest and largest character and builds one bit-mask table
 * per 64-character slice of {@code pattern}.
 *
 * NOTE(review): the header and three assignments using
 * {@code computeMinChar}/{@code computeMaxChar} look like the removed diff
 * lines, the ones reading from {@code alphabet} like their replacements - confirm.
 */
public QuickMultiLongStates(char[] pattern) {
this.minChar = computeMinChar(pattern);
this.maxChar = computeMaxChar(pattern);
this.characters = computeStates(pattern, this.minChar, this.maxChar);
public QuickMultiLongStates(char[] pattern, CharAlphabet alphabet, CharMapping mapping) {
this.minChar = alphabet.minChar();
this.maxChar = alphabet.maxChar();
this.characters = computeStates(pattern, mapping, this.minChar, this.maxChar);
}

/**
 * Splits {@code pattern} into consecutive slices of 64 characters (the last
 * slice takes the remainder) and builds one bit-mask table per slice.
 *
 * NOTE(review): old and new diff lines are interleaved; the signature and the
 * {@code computeSubStates} call that carry {@code mapping} look like the
 * post-commit code - confirm.
 */
private static long[][] computeStates(char[] pattern, char min, char max) {
private static long[][] computeStates(char[] pattern, CharMapping mapping, char min, char max) {
// number of 64-character slices, rounded up
int numberOfSubpatterns = ((pattern.length - 1) / 64) + 1;
long[][] characters = new long[numberOfSubpatterns][];
for (int i = 0; i < characters.length; i++) {
int start = i * 64;
// the final slice ends at the pattern end instead of a 64 boundary
int end = i == characters.length - 1 ? pattern.length : (i + 1) * 64;
char[] subpattern = Arrays.copyOfRange(pattern, start, end);
characters[i] = computeSubStates(subpattern, min, max);
characters[i] = computeSubStates(subpattern, mapping, min, max);
}
return characters;
}

/**
 * Builds the bit-mask table for one slice: bit {@code j} of the entry at
 * index {@code c - min} is set when {@code c} is accepted at slice index
 * {@code i}, where {@code j = pattern.length - i - 1}.
 *
 * NOTE(review): old and new diff lines are interleaved; the signature with
 * {@code mapping} and the inner loop over {@code mapping.map(...)} look like
 * the post-commit code - confirm.
 */
private static long[] computeSubStates(char[] pattern, char min, char max) {
private static long[] computeSubStates(char[] pattern, CharMapping mapping, char min, char max) {
long[] characters = new long[max - min + 1];
for (int i = 0; i < pattern.length; i++) {
char c = pattern[i];
int j = pattern.length - i - 1;
characters[c - min] |= 1l << j;
// presumably every mapped character lies within [min, max]; verify against CharAlphabet.ranged
for (char c : mapping.map(pattern[i])) {
characters[c - min] |= 1l << j;
}
}
return characters;
}
Expand All @@ -393,29 +409,30 @@ private static class SmartMultiLongStates extends MultiLongBitMapStates {

private CharLongMap[] states;

/**
 * Builds one map-backed bit-mask table per 64-character slice of {@code pattern}.
 *
 * NOTE(review): the one-argument header and call look like the removed diff
 * lines, the versions taking {@code mapping} like their replacements - confirm.
 */
public SmartMultiLongStates(char[] pattern) {
this.states = computeStates(pattern);
public SmartMultiLongStates(char[] pattern, CharMapping mapping) {
this.states = computeStates(pattern, mapping);
}

/**
 * Splits {@code pattern} into consecutive slices of 64 characters (the last
 * slice takes the remainder) and builds one character-to-mask map per slice.
 *
 * NOTE(review): old and new diff lines are interleaved; the signature and the
 * {@code computeSubStates} call that carry {@code mapping} look like the
 * post-commit code - confirm.
 */
private static CharLongMap[] computeStates(char[] pattern) {
private static CharLongMap[] computeStates(char[] pattern, CharMapping mapping) {
// number of 64-character slices, rounded up
int numberOfSubpatterns = ((pattern.length - 1) / 64) + 1;
CharLongMap[] characters = new CharLongMap[numberOfSubpatterns];
for (int i = 0; i < characters.length; i++) {
int start = i * 64;
// the final slice ends at the pattern end instead of a 64 boundary
int end = i == characters.length - 1 ? pattern.length : (i + 1) * 64;
char[] subpattern = Arrays.copyOfRange(pattern, start, end);
characters[i] = computeSubStates(subpattern);
characters[i] = computeSubStates(subpattern, mapping);
}
return characters;
}

/**
 * Builds the character-to-mask map for one slice: bit {@code j} of a
 * character's mask is set when that character is accepted at slice index
 * {@code i}, where {@code j = pattern.length - i - 1}.
 *
 * NOTE(review): old and new diff lines are interleaved; the signature with
 * {@code mapping} and the inner loop over {@code mapping.map(...)} look like
 * the post-commit code - confirm.
 */
private static CharLongMap computeSubStates(char[] pattern) {
private static CharLongMap computeSubStates(char[] pattern, CharMapping mapping) {
// presumably 0l is the value returned for characters never put - confirm in CharLongMap
CharLongMap map = new CharLongMap(0l);
for (int i = 0; i < pattern.length; i++) {
char c = pattern[i];
int j = pattern.length - i - 1;
long newState = map.get(c) | (1l << j);
map.put(c, newState);
for (char c : mapping.map(pattern[i])) {
// merge the new position bit into whatever mask the character already has
long newState = map.get(c) | (1l << j);
map.put(c, newState);
}
}
return map;
}
Expand All @@ -426,4 +443,5 @@ public long select(int i, char c) {
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import net.amygdalum.stringsearchalgorithms.io.CharProvider;
import net.amygdalum.stringsearchalgorithms.search.AbstractStringFinder;
import net.amygdalum.stringsearchalgorithms.search.StringFinder;
import net.amygdalum.stringsearchalgorithms.search.StringFinderOption;
import net.amygdalum.stringsearchalgorithms.search.StringMatch;
import net.amygdalum.util.map.CharObjectMap;
import net.amygdalum.util.text.CharMapping;

/**
* An implementation of the String Search Algorithm BOM (Backward Oracle Matching).
Expand All @@ -27,8 +29,34 @@ public class BOM implements StringSearchAlgorithm {
private int patternLength;

/**
 * Creates a matcher for {@code pattern} without character classes by
 * delegating to the two-argument constructor with {@link CharMapping#IDENTITY}.
 *
 * @param pattern the pattern to search for
 */
public BOM(String pattern) {
this(pattern, CharMapping.IDENTITY);
}

/**
 * Creates a matcher for {@code pattern}: the trie is computed from the
 * pattern characters passed through {@code mapping.normalized}, and for any
 * mapping other than {@link CharMapping#IDENTITY} the trie transitions are
 * then rewritten by {@code applyMapping}.
 *
 * @param pattern the pattern to search for
 * @param mapping the character mapping to apply
 */
public BOM(String pattern, CharMapping mapping) {
this.patternLength = pattern.length();
// NOTE(review): this text is a commit diff rendered without +/- markers; the
// computeTrie call on the raw characters looks like the removed line and the
// call on the normalized characters like its replacement - confirm.
this.trie = computeTrie(pattern.toCharArray(), patternLength);
this.trie = computeTrie(mapping.normalized(pattern.toCharArray()), patternLength);
// reference comparison: only the shared IDENTITY instance skips the rewrite
if (mapping != CharMapping.IDENTITY) {
applyMapping(mapping);
}
}

/**
 * Rewrites the outgoing transitions of every node returned by
 * {@code trie.nodes()} according to {@code mapping}.
 *
 * @param mapping the character mapping to apply to each node
 */
private void applyMapping(CharMapping mapping) {
    for (TrieNode<String> current : trie.nodes()) {
        applyMapping(current, mapping);
    }
}

/**
 * Replaces the transitions of {@code node}: each existing transition on a
 * character is re-added once for every character that {@code mapping.map}
 * yields for it, all pointing at the original target node.
 *
 * @param node    the node whose transitions are rewritten
 * @param mapping the character mapping supplying the replacement characters
 */
private void applyMapping(TrieNode<String> node, CharMapping mapping) {
    // Capture the transitions before reset() clears the node.
    // NOTE(review): relies on getNexts() returning a map that is unaffected by reset() - confirm.
    CharObjectMap<TrieNode<String>> previous = node.getNexts();
    node.reset();
    for (CharObjectMap<TrieNode<String>>.Entry transition : previous.cursor()) {
        char original = transition.key;
        TrieNode<String> target = transition.value;
        for (char mapped : mapping.map(original)) {
            node.addNext(mapped, target);
        }
    }
}

private static TrieNode<String> computeTrie(char[] pattern, int length) {
Expand Down Expand Up @@ -128,7 +156,7 @@ public StringMatch findNext() {
if (j <= 0) {
chars.next();
} else {
chars.forward(j + 1);
chars.forward(j + 2);
}
}
return null;
Expand All @@ -141,11 +169,23 @@ private StringMatch createMatch(long start, long end) {

}

/**
 * Factory producing {@link BOM} instances, optionally configured with a
 * character mapping through {@code withCharClasses}.
 *
 * NOTE(review): two class headers and two return paths appear because old and
 * new diff lines are interleaved; the header implementing
 * {@code SupportsCharClasses} looks like the post-commit one - confirm.
 */
public static class Factory implements StringSearchAlgorithmFactory {
public static class Factory implements StringSearchAlgorithmFactory, SupportsCharClasses<Factory> {

// mapping passed to created algorithms; stays null until withCharClasses is called
private CharMapping mapping;

/**
 * Stores {@code mapping} for subsequently created algorithms.
 *
 * @param mapping the character mapping to use
 * @return this factory
 */
@Override
public Factory withCharClasses(CharMapping mapping) {
this.mapping = mapping;
return this;
}

/**
 * Creates a BOM algorithm for {@code pattern}, passing the stored mapping
 * along when one has been set.
 *
 * @param pattern the pattern to search for
 * @return the new algorithm instance
 */
@Override
public StringSearchAlgorithm of(String pattern) {
return new BOM(pattern);
if (mapping == null) {
return new BOM(pattern);
} else {
return new BOM(pattern, mapping);
}
}

}
Expand Down
Loading

0 comments on commit 6e22179

Please sign in to comment.