Skip to content

Commit

Permalink
better author-email matching
Browse files: browse the repository at this point in the history
  • Loading branch information
Dominika Tkaczyk committed Oct 17, 2016
1 parent 237cc75 commit b8b2a1f
Showing 1 changed file with 22 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import pl.edu.icm.cermine.metadata.model.DocumentAuthor;
import pl.edu.icm.cermine.metadata.model.DocumentMetadata;
import pl.edu.icm.cermine.structure.model.BxZone;
Expand Down Expand Up @@ -55,7 +56,9 @@ protected boolean enhanceMetadata(BxZone zone, DocumentMetadata metadata) {
String domain = matcher.group(2);
String[] names = emails.split("[\\|, ]+");
for (String name : names) {
addEmail(metadata, name+"@"+domain);
if (!name.isEmpty()) {
addEmail(metadata, name+"@"+domain);
}
}
}
matcher = PATTERN.matcher(zone.toText());
Expand All @@ -74,17 +77,27 @@ private void addEmail(DocumentMetadata metadata, String email) {

for (DocumentAuthor a : metadata.getAuthors()) {
String[] names = a.getName().split(" ");
for (String namePart : names) {
if (namePart.length() > 2 && email.toLowerCase().contains(namePart.toLowerCase())) {
if (author == null) {
author = a;
break;
} else {
one = false;
String fname = StringUtils.join(names, "");
if (fname.toLowerCase().contains(email.toLowerCase().replaceFirst("@.*", ""))) {
if (author == null) {
author = a;
break;
} else {
one = false;
}
} else {
for (String namePart : names) {
if (namePart.length() > 2 && email.toLowerCase().contains(namePart.toLowerCase())) {
if (author == null) {
author = a;
break;
} else {
one = false;
}
}
}
}
}
}

if (author != null && one) {
author.addEmail(email);
Expand Down

0 comments on commit b8b2a1f

Please sign in to comment.