From 3e3f99a513639906f3b6ab44d7a7f389825e1c92 Mon Sep 17 00:00:00 2001 From: berndmoos Date: Mon, 6 May 2024 07:55:40 +0200 Subject: [PATCH] #470: Faster algorithm --- .../partitureditor/fsm/FSMException.java | 8 +- .../segment/InelEventBasedSegmentation.java | 213 ++++++++++++++++-- .../jexmaralda/segment/TestINEL.java | 41 ++++ 3 files changed, 233 insertions(+), 29 deletions(-) create mode 100644 src/org/exmaralda/partitureditor/jexmaralda/segment/TestINEL.java diff --git a/src/org/exmaralda/partitureditor/fsm/FSMException.java b/src/org/exmaralda/partitureditor/fsm/FSMException.java index 2f6f2caf..c5411a7b 100644 --- a/src/org/exmaralda/partitureditor/fsm/FSMException.java +++ b/src/org/exmaralda/partitureditor/fsm/FSMException.java @@ -25,11 +25,11 @@ public FSMException(String message, String po) { tierID = new String(); } - public FSMException(String message, String po, String tl, String ti) { + public FSMException(String message, String processedOutput, String tli, String tierID) { super(message); - processedOutput = po; - tli = tl; - tierID = ti; + this.processedOutput = processedOutput; + this.tli = tli; + this.tierID = tierID; } public String getProcessedOutput(){ diff --git a/src/org/exmaralda/partitureditor/jexmaralda/segment/InelEventBasedSegmentation.java b/src/org/exmaralda/partitureditor/jexmaralda/segment/InelEventBasedSegmentation.java index feb05155..34806bf7 100644 --- a/src/org/exmaralda/partitureditor/jexmaralda/segment/InelEventBasedSegmentation.java +++ b/src/org/exmaralda/partitureditor/jexmaralda/segment/InelEventBasedSegmentation.java @@ -4,22 +4,29 @@ */ package org.exmaralda.partitureditor.jexmaralda.segment; -import java.io.File; -import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import java.util.Vector; -import java.util.logging.Level; -import java.util.logging.Logger; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.TransformerException; -import org.exmaralda.common.jdomutilities.IOUtilities; -import org.exmaralda.exakt.utilities.FileIO; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javax.swing.tree.DefaultMutableTreeNode; +import javax.swing.tree.MutableTreeNode; import org.exmaralda.partitureditor.fsm.FSMException; +import org.exmaralda.partitureditor.jexmaralda.AbstractSegment; +import org.exmaralda.partitureditor.jexmaralda.Annotation; +import org.exmaralda.partitureditor.jexmaralda.AtomicTimedSegment; import org.exmaralda.partitureditor.jexmaralda.BasicTranscription; +import org.exmaralda.partitureditor.jexmaralda.Identifiable; +import org.exmaralda.partitureditor.jexmaralda.NonTimedSegment; +import org.exmaralda.partitureditor.jexmaralda.SegmentList; +import org.exmaralda.partitureditor.jexmaralda.Segmentation; +import org.exmaralda.partitureditor.jexmaralda.SegmentedTier; import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription; -import org.exmaralda.partitureditor.jexmaralda.convert.StylesheetFactory; -import org.exmaralda.partitureditor.jexmaralda.sax.SegmentedTranscriptionSaxReader; -import org.jdom.Document; -import org.jdom.JDOMException; +import org.exmaralda.partitureditor.jexmaralda.TimedAnnotation; +import org.exmaralda.partitureditor.jexmaralda.TimedSegment; import org.xml.sax.SAXException; /** @@ -29,6 +36,32 @@ public class InelEventBasedSegmentation extends AbstractSegmentation { String XSL = "/org/exmaralda/partitureditor/jexmaralda/xsl/InelEventBasedSegmentation.xsl"; + String[] WORD_EXTERNAL_PUNCUTATION = { + "\\s", // white space + "\\(", // opening round parenthesis + "\\)", // closing round parenthesis + "\\[", // opening square bracket + "\\]", // closing square bracket + "\\.", // period + "\\?", // question mark + "!", // exclamation mark + "…", // ellipsis (one symbol) + ",", // comma + "–", // n-dash + "—", // m-dash + "‐", // U+2010 HYPHEN + "‑", // U+2011 NON-BREAKING HYPHEN + "=", // equals + "\"", // straight double quotation mark + "“", // left double quotation mark + "”", // right double quotation mark + "«", // left double angle quotation mark + "»", // right double angle quotation mark + ";", // semicolon + ":", // colon + }; + + String WORD_EXTERNAL_PUNCUTATION_REGEX = String.join("", WORD_EXTERNAL_PUNCUTATION); @Override public Vector getSegmentationErrors(BasicTranscription bt) throws SAXException { @@ -38,21 +71,151 @@ public Vector getSegmentationErrors(BasicTranscription bt) throws SAXException { @Override public SegmentedTranscription BasicToSegmented(BasicTranscription bt) throws SAXException, FSMException { - try { - SegmentedTranscription plainSegmented = bt.toSegmentedTranscription(); - String plainSegmentedXML = plainSegmented.toXML(); - StylesheetFactory stylesheetFactory = new StylesheetFactory(true); - String inelSegmentedXML = stylesheetFactory.applyInternalStylesheetToString(XSL, plainSegmentedXML); - Document inelSegmentedDoc = IOUtilities.readDocumentFromString(inelSegmentedXML); - File tempFile = File.createTempFile("INEL_SEGMENTED", ".exs"); - tempFile.deleteOnExit(); - FileIO.writeDocumentToLocalFile(tempFile, inelSegmentedDoc); - SegmentedTranscription result = new SegmentedTranscriptionSaxReader().readFromFile(tempFile.getAbsolutePath()); - tempFile.delete(); + SegmentedTranscription plainSegmented = bt.toSegmentedTranscription(); + for (int i=0; i utteranceEndPoints = new HashSet<>(); + for (int j=0; j parsedSegments = parseEvent(event); + for (MutableTreeNode as : parsedSegments){ + currentUtterance.add(as); + } + if (utteranceEndPoints.contains(event.getEnd())){ + currentUtterance.setEnd(event.getEnd()); + currentUtterance = new TimedSegment(); + currentUtterance.setID(tierID + ".u" + Integer.toString(uttCount)); + currentUtterance.setName("INEL:u"); + uttCount++; + currentUtterance.setStart(event.getEnd()); + targetSegmentChain.add(currentUtterance); + } + } catch (FSMException ex){ + ex.setTierID(tierID); + throw(ex); + } + } + if (!currentUtterance.children().hasMoreElements()){ + targetSegmentChain.remove(currentUtterance); + } + } + } + return plainSegmented; + + //******************************* + // old solution, using XSL transformation + // works, but too slow + /*String plainSegmentedXML = plainSegmented.toXML(); + StylesheetFactory stylesheetFactory = new StylesheetFactory(true); + String inelSegmentedXML = stylesheetFactory.applyInternalStylesheetToString(XSL, plainSegmentedXML); + Document inelSegmentedDoc = IOUtilities.readDocumentFromString(inelSegmentedXML); + File tempFile = File.createTempFile("INEL_SEGMENTED", ".exs"); + tempFile.deleteOnExit(); + FileIO.writeDocumentToLocalFile(tempFile, inelSegmentedDoc); + SegmentedTranscription result = new SegmentedTranscriptionSaxReader().readFromFile(tempFile.getAbsolutePath()); + tempFile.delete(); + return result;*/ + } + + private List parseEvent(TimedSegment event) throws FSMException { + List result = new ArrayList<>(); + String text = event.getDescription(); + if (text.startsWith("((")){ + result.addAll(makeNonTimedSegments("((", event.getID() + ".1")); + int endIndex = text.lastIndexOf("))"); + if (endIndex<0){ + FSMException ex = new FSMException("Unclosed double round brackets", text, event.getStart(), null); + throw(ex); + } + AtomicTimedSegment ats = new AtomicTimedSegment(); + ats.setStart(event.getStart()); + ats.setEnd(event.getEnd()); + ats.setName(("INEL:non-pho")); + ats.setDescription(text.substring(2, endIndex)); + ats.setID(event.getID() + ".ats"); + result.add(ats); + result.addAll(makeNonTimedSegments(text.substring(endIndex), event.getID() + ".2")); return result; - } catch (ParserConfigurationException | IOException | TransformerException | JDOMException ex) { - throw new SAXException(ex); } + Pattern p = Pattern.compile("^[" + WORD_EXTERNAL_PUNCUTATION_REGEX + "]+"); + Matcher m = p.matcher(text); + if (m.find()){ + int end = m.end(); + String punctuation = text.substring(0, end); + text = text.substring(end); + result.addAll(makeNonTimedSegments(punctuation, event.getID() + ".1")); + } + p = Pattern.compile("[" + WORD_EXTERNAL_PUNCUTATION_REGEX + "]+$"); + m = p.matcher(text); + if (m.find()){ + int start = m.start(); + String word = text.substring(0, start); + TimedSegment ts = new TimedSegment(); + ts.setName("INEL:w"); + ts.setStart(event.getStart()); + ts.setEnd(event.getEnd()); + ts.setDescription(word); + ts.setID(event.getID() + ".w"); + result.add(ts); + String punctuation = text.substring(start); + result.addAll(makeNonTimedSegments(punctuation, event.getID() + ".2")); + } else { + TimedSegment ts = new TimedSegment(); + ts.setName("INEL:w"); + ts.setStart(event.getStart()); + ts.setEnd(event.getEnd()); + ts.setDescription(text); + ts.setID(event.getID() + ".w"); + result.add(ts); + } + + return result; + } + + private List makeNonTimedSegments(String text, String baseID) { + List result = new ArrayList<>(); + int i=1; + for (char c : text.toCharArray()){ + NonTimedSegment nts = new NonTimedSegment(); + nts.setName("INEL:ip"); + nts.setID(baseID + "." + Integer.toString(i)); + i++; + nts.setDescription(Character.toString(c)); + result.add(nts); + } + return result; } } diff --git a/src/org/exmaralda/partitureditor/jexmaralda/segment/TestINEL.java b/src/org/exmaralda/partitureditor/jexmaralda/segment/TestINEL.java new file mode 100644 index 00000000..e6a7625a --- /dev/null +++ b/src/org/exmaralda/partitureditor/jexmaralda/segment/TestINEL.java @@ -0,0 +1,41 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package org.exmaralda.partitureditor.jexmaralda.segment; + +import java.io.IOException; +import java.util.Vector; +import org.exmaralda.partitureditor.fsm.FSMException; +import org.exmaralda.partitureditor.jexmaralda.BasicTranscription; +import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException; +import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription; +import org.jdom.Element; +import org.xml.sax.SAXException; + +/** + * + * @author thomas + */ +public class TestINEL { + + /** + * @param args the command line arguments + */ + public static void main(String[] args) { + try { + BasicTranscription bt = new BasicTranscription("G:\\Meine Ablage\\INEL_DOLGAN\\flk\\AkEE_19900810_GirlAnys_flk\\AkEE_19900810_GirlAnys_flk.exb"); + InelEventBasedSegmentation segmenter = new org.exmaralda.partitureditor.jexmaralda.segment.InelEventBasedSegmentation(); + SegmentedTranscription st = segmenter.BasicToSegmented(bt); + st.writeXMLToFile("C:\\Users\\bernd\\Dropbox\\work\\EXMARaLDA_Support\\2024_04_24_INEL_SEGMENTATION\\AkEE_19900810_GirlAnys_flk.exs", "none"); + Vector wordList = st.getBody().getWordList(); + for (Element e : wordList){ + System.out.println(e); + } + } catch (IOException | FSMException | JexmaraldaException | SAXException ex) { + ex.printStackTrace(); + } + } + +}