Skip to content

Commit

Permalink
#470: Faster algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
berndmoos committed May 6, 2024
1 parent cf80762 commit 3e3f99a
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 29 deletions.
8 changes: 4 additions & 4 deletions src/org/exmaralda/partitureditor/fsm/FSMException.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ public FSMException(String message, String po) {
tierID = new String();
}

/**
 * Creates an exception that carries, in addition to the message, the processed
 * output, a timeline reference and a tier ID.
 *
 * @param message          description of the error, passed to the superclass
 * @param processedOutput  value stored in this exception's {@code processedOutput} field
 * @param tli              value stored in this exception's {@code tli} field
 *                         (presumably a timeline item ID; confirm against callers)
 * @param tierID           value stored in this exception's {@code tierID} field
 */
public FSMException(String message, String processedOutput, String tli, String tierID) {
    super(message);
    // The parameters shadow the fields of the same name, so "this." is required here.
    this.processedOutput = processedOutput;
    this.tli = tli;
    this.tierID = tierID;
}

public String getProcessedOutput(){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,29 @@
*/
package org.exmaralda.partitureditor.jexmaralda.segment;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import org.exmaralda.common.jdomutilities.IOUtilities;
import org.exmaralda.exakt.utilities.FileIO;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.tree.DefaultMutableTreeNode;
import javax.swing.tree.MutableTreeNode;
import org.exmaralda.partitureditor.fsm.FSMException;
import org.exmaralda.partitureditor.jexmaralda.AbstractSegment;
import org.exmaralda.partitureditor.jexmaralda.Annotation;
import org.exmaralda.partitureditor.jexmaralda.AtomicTimedSegment;
import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
import org.exmaralda.partitureditor.jexmaralda.Identifiable;
import org.exmaralda.partitureditor.jexmaralda.NonTimedSegment;
import org.exmaralda.partitureditor.jexmaralda.SegmentList;
import org.exmaralda.partitureditor.jexmaralda.Segmentation;
import org.exmaralda.partitureditor.jexmaralda.SegmentedTier;
import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
import org.exmaralda.partitureditor.jexmaralda.convert.StylesheetFactory;
import org.exmaralda.partitureditor.jexmaralda.sax.SegmentedTranscriptionSaxReader;
import org.jdom.Document;
import org.jdom.JDOMException;
import org.exmaralda.partitureditor.jexmaralda.TimedAnnotation;
import org.exmaralda.partitureditor.jexmaralda.TimedSegment;
import org.xml.sax.SAXException;

/**
Expand All @@ -29,6 +36,32 @@
public class InelEventBasedSegmentation extends AbstractSegmentation {

/** Internal path of the XSL stylesheet belonging to this segmentation (presumably resolved from the classpath). */
String XSL = "/org/exmaralda/partitureditor/jexmaralda/xsl/InelEventBasedSegmentation.xsl";
/**
 * Characters treated as punctuation external to a word. Each entry is written as a
 * regular-expression fragment, i.e. regex metacharacters are already escaped.
 * NOTE(review): the identifier misspells "PUNCTUATION"; it is left unchanged here
 * because renaming it would require touching every use.
 */
String[] WORD_EXTERNAL_PUNCUTATION = {
"\\s", // white space
"\\(", // opening round parenthesis
"\\)", // closing round parenthesis
"\\[", // opening square bracket
"\\]", // closing square bracket
"\\.", // period
"\\?", // question mark
"!", // exclamation mark
"…", // ellipsis (one symbol)
",", // comma
"–", // en dash
"—", // em dash
"‐", // U+2010 HYPHEN
"‑", // U+2011 NON-BREAKING HYPHEN
"=", // equals
"\"", // straight double quotation mark
"“", // left double quotation mark
"”", // right double quotation mark
"«", // left double angle quotation mark
"»", // right double angle quotation mark
";", // semicolon
":", // colon
};

/**
 * All entries of {@link #WORD_EXTERNAL_PUNCUTATION} concatenated without a separator.
 * This is not a complete pattern on its own; it is meant to be placed between
 * {@code [} and {@code ]} so that it forms a regex character class.
 */
String WORD_EXTERNAL_PUNCUTATION_REGEX = String.join("", WORD_EXTERNAL_PUNCUTATION);

@Override
public Vector getSegmentationErrors(BasicTranscription bt) throws SAXException {
Expand All @@ -38,21 +71,151 @@ public Vector getSegmentationErrors(BasicTranscription bt) throws SAXException {

@Override
public SegmentedTranscription BasicToSegmented(BasicTranscription bt) throws SAXException, FSMException {
try {
SegmentedTranscription plainSegmented = bt.toSegmentedTranscription();
String plainSegmentedXML = plainSegmented.toXML();
StylesheetFactory stylesheetFactory = new StylesheetFactory(true);
String inelSegmentedXML = stylesheetFactory.applyInternalStylesheetToString(XSL, plainSegmentedXML);
Document inelSegmentedDoc = IOUtilities.readDocumentFromString(inelSegmentedXML);
File tempFile = File.createTempFile("INEL_SEGMENTED", ".exs");
tempFile.deleteOnExit();
FileIO.writeDocumentToLocalFile(tempFile, inelSegmentedDoc);
SegmentedTranscription result = new SegmentedTranscriptionSaxReader().readFromFile(tempFile.getAbsolutePath());
tempFile.delete();
SegmentedTranscription plainSegmented = bt.toSegmentedTranscription();
for (int i=0; i<plainSegmented.getBody().getNumberOfTiers(); i++){
SegmentedTier segmentedTier = plainSegmented.getBody().getSegmentedTierAt(i);
String tierID = segmentedTier.getID();
Segmentation sourceSegmentation = segmentedTier.getSegmentationWithName("SpeakerContribution_Event");
Segmentation targetSegmentation = new Segmentation();
targetSegmentation.setName("SpeakerContribution_Utterance_Word");
targetSegmentation.setTierReference(sourceSegmentation.getTierReference());
segmentedTier.addSegmentation(targetSegmentation);

Annotation refAnnotation = segmentedTier.getAnnotationWithName("ref");
Set<String> utteranceEndPoints = new HashSet<>();
for (int j=0; j<refAnnotation.getNumberOfSegments(); j++){
utteranceEndPoints.add(((TimedAnnotation)(refAnnotation.get(j))).getEnd());
}

SegmentList allSegmentChains = sourceSegmentation.getAllSegmentsWithName("sc");
int uttCount = 1;
for (int j=0; j<allSegmentChains.size(); j++){
TimedSegment segmentChain = (TimedSegment)(allSegmentChains.get(j));

TimedSegment targetSegmentChain = new TimedSegment();
targetSegmentChain.setStart(segmentChain.getStart());
targetSegmentChain.setEnd(segmentChain.getEnd());
targetSegmentChain.setName("sc");
targetSegmentation.addSegment(targetSegmentChain);

TimedSegment currentUtterance = new TimedSegment();
currentUtterance.setID(tierID + ".u" + Integer.toString(uttCount));
currentUtterance.setName("INEL:u");
uttCount++;
currentUtterance.setStart(segmentChain.getStart());
targetSegmentChain.add(currentUtterance);
Vector sourceEvents = segmentChain.getAllSegmentsWithName("e");

for (int k=0; k<sourceEvents.size(); k++){
TimedSegment event = (TimedSegment) sourceEvents.elementAt(k);
try {
List<MutableTreeNode> parsedSegments = parseEvent(event);
for (MutableTreeNode as : parsedSegments){
currentUtterance.add(as);
}
if (utteranceEndPoints.contains(event.getEnd())){
currentUtterance.setEnd(event.getEnd());
currentUtterance = new TimedSegment();
currentUtterance.setID(tierID + ".u" + Integer.toString(uttCount));
currentUtterance.setName("INEL:u");
uttCount++;
currentUtterance.setStart(event.getEnd());
targetSegmentChain.add(currentUtterance);
}
} catch (FSMException ex){
ex.setTierID(tierID);
throw(ex);
}
}
if (!currentUtterance.children().hasMoreElements()){
targetSegmentChain.remove(currentUtterance);
}
}
}
return plainSegmented;

//*******************************
// old solution, using XSL transformation
// works, but too slow
/*String plainSegmentedXML = plainSegmented.toXML();
StylesheetFactory stylesheetFactory = new StylesheetFactory(true);
String inelSegmentedXML = stylesheetFactory.applyInternalStylesheetToString(XSL, plainSegmentedXML);
Document inelSegmentedDoc = IOUtilities.readDocumentFromString(inelSegmentedXML);
File tempFile = File.createTempFile("INEL_SEGMENTED", ".exs");
tempFile.deleteOnExit();
FileIO.writeDocumentToLocalFile(tempFile, inelSegmentedDoc);
SegmentedTranscription result = new SegmentedTranscriptionSaxReader().readFromFile(tempFile.getAbsolutePath());
tempFile.delete();
return result;*/
}

private List<MutableTreeNode> parseEvent(TimedSegment event) throws FSMException {
List<MutableTreeNode> result = new ArrayList<>();
String text = event.getDescription();
if (text.startsWith("((")){
result.addAll(makeNonTimedSegments("((", event.getID() + ".1"));
int endIndex = text.lastIndexOf("))");
if (endIndex<0){
FSMException ex = new FSMException("Unclosed double round brackets", text, event.getStart(), null);
throw(ex);
}
AtomicTimedSegment ats = new AtomicTimedSegment();
ats.setStart(event.getStart());
ats.setEnd(event.getEnd());
ats.setName(("INEL:non-pho"));
ats.setDescription(text.substring(2, endIndex));
ats.setID(event.getID() + ".ats");
result.add(ats);
result.addAll(makeNonTimedSegments(text.substring(endIndex), event.getID() + ".2"));
return result;
} catch (ParserConfigurationException | IOException | TransformerException | JDOMException ex) {
throw new SAXException(ex);
}
Pattern p = Pattern.compile("^[" + WORD_EXTERNAL_PUNCUTATION_REGEX + "]+");
Matcher m = p.matcher(text);
if (m.find()){
int end = m.end();
String punctuation = text.substring(0, end);
text = text.substring(end);
result.addAll(makeNonTimedSegments(punctuation, event.getID() + ".1"));
}
p = Pattern.compile("[" + WORD_EXTERNAL_PUNCUTATION_REGEX + "]+$");
m = p.matcher(text);
if (m.find()){
int start = m.start();
String word = text.substring(0, start);
TimedSegment ts = new TimedSegment();
ts.setName("INEL:w");
ts.setStart(event.getStart());
ts.setEnd(event.getEnd());
ts.setDescription(word);
ts.setID(event.getID() + ".w");
result.add(ts);
String punctuation = text.substring(start);
result.addAll(makeNonTimedSegments(punctuation, event.getID() + ".2"));
} else {
TimedSegment ts = new TimedSegment();
ts.setName("INEL:w");
ts.setStart(event.getStart());
ts.setEnd(event.getEnd());
ts.setDescription(text);
ts.setID(event.getID() + ".w");
result.add(ts);
}

return result;
}

/**
 * Turns every {@code char} of a text into its own "INEL:ip" non-timed segment.
 *
 * @param text   the text to split; an empty text yields an empty list
 * @param baseID prefix for the segment IDs; the n-th segment (counting from 1)
 *               gets the ID {@code baseID + "." + n}
 * @return the segments in the order of the characters in {@code text}
 */
private List<NonTimedSegment> makeNonTimedSegments(String text, String baseID) {
    List<NonTimedSegment> segments = new ArrayList<>();
    for (int index = 0; index < text.length(); index++) {
        NonTimedSegment segment = new NonTimedSegment();
        segment.setName("INEL:ip");
        // IDs are numbered from 1, hence the offset against the zero-based index.
        segment.setID(baseID + "." + (index + 1));
        segment.setDescription(String.valueOf(text.charAt(index)));
        segments.add(segment);
    }
    return segments;
}

}
41 changes: 41 additions & 0 deletions src/org/exmaralda/partitureditor/jexmaralda/segment/TestINEL.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package org.exmaralda.partitureditor.jexmaralda.segment;

import java.io.IOException;
import java.util.Vector;
import org.exmaralda.partitureditor.fsm.FSMException;
import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
import org.jdom.Element;
import org.xml.sax.SAXException;

/**
 * Manual driver for {@link InelEventBasedSegmentation}: segments one basic
 * transcription, writes the segmented transcription to a file and prints its word list.
 *
 * @author thomas
 */
public class TestINEL {

    /** Input transcription used when no argument is given (path on the original developer machine). */
    private static final String DEFAULT_INPUT_PATH =
            "G:\\Meine Ablage\\INEL_DOLGAN\\flk\\AkEE_19900810_GirlAnys_flk\\AkEE_19900810_GirlAnys_flk.exb";

    /** Output file used when no second argument is given (path on the original developer machine). */
    private static final String DEFAULT_OUTPUT_PATH =
            "C:\\Users\\bernd\\Dropbox\\work\\EXMARaLDA_Support\\2024_04_24_INEL_SEGMENTATION\\AkEE_19900810_GirlAnys_flk.exs";

    /**
     * Runs the segmentation once.
     *
     * @param args optional: {@code args[0]} is the path of the basic transcription to
     *             read, {@code args[1]} the path of the segmented transcription to
     *             write; missing arguments fall back to the built-in default paths
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;
        try {
            BasicTranscription bt = new BasicTranscription(inputPath);
            InelEventBasedSegmentation segmenter = new InelEventBasedSegmentation();
            SegmentedTranscription st = segmenter.BasicToSegmented(bt);
            st.writeXMLToFile(outputPath, "none");
            Vector<Element> wordList = st.getBody().getWordList();
            for (Element e : wordList) {
                System.out.println(e);
            }
        } catch (IOException | FSMException | JexmaraldaException | SAXException ex) {
            // Manual test driver: report the failure on the console.
            ex.printStackTrace();
        }
    }

}

0 comments on commit 3e3f99a

Please sign in to comment.