Skip to content

Commit

Permalink
131 Adding option to email filter for just email addresses with valid…
Browse files Browse the repository at this point in the history
… TLDs.
  • Loading branch information
jzonthemtn committed Aug 25, 2024
1 parent c2d32c0 commit 4d8abac
Show file tree
Hide file tree
Showing 6 changed files with 2,783 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,10 @@ public List<Filter> getFiltersForPolicy(final Policy policy, final Map<String, M
.withWindowSize(phileasConfiguration.spanWindowSize())
.build();

final Filter filter = new EmailAddressFilter(filterConfiguration);
final boolean isStrict = policy.getIdentifiers().getEmailAddress().isOnlyStrictMatches();
final boolean onlyValidTLDs = policy.getIdentifiers().getEmailAddress().isOnlyValidTLDs();

final Filter filter = new EmailAddressFilter(filterConfiguration, isStrict, onlyValidTLDs);
enabledFilters.add(filter);
filterCache.get(policy.getName()).put(FilterType.EMAIL_ADDRESS, filter);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,43 @@
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

public class EmailAddressFilter extends RegexFilter {

public EmailAddressFilter(FilterConfiguration filterConfiguration) {
private final boolean onlyValidTLDs;

private Collection<String> tlds = null;

public EmailAddressFilter(FilterConfiguration filterConfiguration, boolean onlyStrictMatches, boolean onlyValidTLDs) throws IOException {
super(FilterType.EMAIL_ADDRESS, filterConfiguration);

final Pattern emailAddressPattern = Pattern.compile("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", Pattern.CASE_INSENSITIVE);
final Pattern emailAddressPattern = onlyStrictMatches
? Pattern.compile("\\b(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\b])", Pattern.CASE_INSENSITIVE)
: Pattern.compile("\\b[\\w.-]+?@(?:([a-zA-Z\\d\\-])+?\\.)+(?:[a-zA-Z\\d]{2,4})+\\b");

final FilterPattern email1 = new FilterPattern.FilterPatternBuilder(emailAddressPattern, 0.90).build();

this.contextualTerms = new HashSet<>();
this.contextualTerms.add("email");
this.contextualTerms.add("e-mail");

this.analyzer = new Analyzer(contextualTerms, email1);
this.onlyValidTLDs = onlyValidTLDs;

if(onlyValidTLDs) {
final File file = new File(getClass().getClassLoader().getResource("tlds-alpha-by-domain.txt").getFile());
final List<String> rawTlds = Files.readAllLines(file.toPath(), Charset.defaultCharset());
this.tlds = rawTlds.stream().filter(str->!str.startsWith("#")).map(String::toLowerCase).map(s -> "." + s).toList();
}

}

Expand All @@ -50,6 +69,10 @@ public FilterResult filter(Policy policy, String context, String documentId, int

final List<Span> spans = findSpans(policy, analyzer, input, context, documentId, attributes);

if(onlyValidTLDs) {
spans.removeIf(str -> tlds.stream().noneMatch(str.getText()::endsWith));
}

return new FilterResult(context, documentId, spans);

}
Expand Down
Loading

0 comments on commit 4d8abac

Please sign in to comment.