Skip to content

Commit

Permalink
Fix 'up to 1,700 kilograms' in SD and UD, even with RP error
Browse files Browse the repository at this point in the history
  • Loading branch information
manning committed Jul 31, 2023
1 parent a000fe3 commit 6e14527
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 57 deletions.
63 changes: 31 additions & 32 deletions src/edu/stanford/nlp/trees/CoordinationTransformer.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
package edu.stanford.nlp.trees;
import edu.stanford.nlp.util.logging.Redwood;


import edu.stanford.nlp.ling.LabelFactory;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.BufferedReader;
import java.io.FileInputStream;
Expand Down Expand Up @@ -44,7 +43,7 @@
public class CoordinationTransformer implements TreeTransformer {

/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(CoordinationTransformer.class);
private static final Redwood.RedwoodChannels log = Redwood.channels(CoordinationTransformer.class);

private static final boolean VERBOSE = System.getProperty("CoordinationTransformer", null) != null;
private final TreeTransformer tn = new DependencyTreeTransformer(); //to get rid of unwanted nodes and tag
Expand Down Expand Up @@ -156,10 +155,10 @@ public Tree transformTree(Tree t) {
return t;
}

private static TregexPattern rearrangeNowThatTregex =
private static final TregexPattern rearrangeNowThatTregex =
TregexPattern.compile("ADVP=advp <1 (RB < /^(?i:now)$/) <2 (SBAR=sbar <1 (IN < /^(?i:that)$/))");

private static TsurgeonPattern rearrangeNowThatTsurgeon =
private static final TsurgeonPattern rearrangeNowThatTsurgeon =
Tsurgeon.parseOperation("[relabel advp SBAR] [excise sbar sbar]");

private static Tree rearrangeNowThat(Tree t) {
Expand All @@ -170,10 +169,10 @@ private static Tree rearrangeNowThat(Tree t) {
}


private static TregexPattern changeSbarToPPTregex =
private static final TregexPattern changeSbarToPPTregex =
TregexPattern.compile("NP < (NP $++ (SBAR=sbar < (IN < /^(?i:after|before|until|since|during)$/ $++ S)))");

private static TsurgeonPattern changeSbarToPPTsurgeon =
private static final TsurgeonPattern changeSbarToPPTsurgeon =
Tsurgeon.parseOperation("relabel sbar PP");

/**
Expand All @@ -191,7 +190,7 @@ private static Tree changeSbarToPP(Tree t) {
return Tsurgeon.processPattern(changeSbarToPPTregex, changeSbarToPPTsurgeon, t);
}

private static TregexPattern findFlatConjpTregex =
private static final TregexPattern findFlatConjpTregex =
// TODO: add more patterns, perhaps ignore case
// for example, what should we do with "and not"? Is it right to
// generally add the "not" to the following tree with moveRB, or
Expand All @@ -202,7 +201,7 @@ private static Tree changeSbarToPP(Tree t) {
" (< and $+ (RB=end < so)) | " +
" (< and $+ (ADVP=end < (RB|IN < so))) ] ))"); // TODO: this structure needs a dependency

private static TsurgeonPattern addConjpTsurgeon =
private static final TsurgeonPattern addConjpTsurgeon =
Tsurgeon.parseOperation("createSubtree CONJP start end");

private static Tree combineConjp(Tree t) {
Expand All @@ -212,13 +211,13 @@ private static Tree combineConjp(Tree t) {
return Tsurgeon.processPattern(findFlatConjpTregex, addConjpTsurgeon, t);
}

private static TregexPattern[] moveRBTregex = {
private static final TregexPattern[] moveRBTregex = {
TregexPattern.compile("/^S|PP|VP|NP/ < (/^(S|PP|VP|NP)/ $++ (/^(,|CC|CONJP)$/ [ $+ (RB=adv [ < not | < then ]) | $+ (ADVP=adv <: RB) ])) : (=adv $+ /^(S(?!YM)|PP|VP|NP)/=dest) "),
TregexPattern.compile("/^ADVP/ < (/^ADVP/ $++ (/^(,|CC|CONJP)$/ [$+ (RB=adv [ < not | < then ]) | $+ (ADVP=adv <: RB)])) : (=adv $+ /^NP-ADV|ADVP|PP/=dest)"),
TregexPattern.compile("/^FRAG/ < (ADVP|RB=adv $+ VP=dest)"),
};

private static TsurgeonPattern moveRBTsurgeon =
private static final TsurgeonPattern moveRBTsurgeon =
Tsurgeon.parseOperation("move adv >0 dest");

static Tree moveRB(Tree t) {
Expand All @@ -236,7 +235,7 @@ static Tree moveRB(Tree t) {
//
// TODO: maybe we want to catch more complicated tree structures
// with something in between the WH and the actual question.
private static TregexPattern flattenSQTregex =
private static final TregexPattern flattenSQTregex =
TregexPattern.compile("SBARQ < ((WHNP=what < WP) $+ (SQ=sq < (/^VB/=verb < " + EnglishPatterns.copularWordRegex + ") " +
// match against "is running" if the verb is under just a VBG
" !< (/^VB/ < !" + EnglishPatterns.copularWordRegex + ") " +
Expand All @@ -249,7 +248,7 @@ static Tree moveRB(Tree t) {
// match against "good at"
" !< (ADJP < (PP <: IN|TO))))");

private static TsurgeonPattern flattenSQTsurgeon = Tsurgeon.parseOperation("excise sq sq");
private static final TsurgeonPattern flattenSQTsurgeon = Tsurgeon.parseOperation("excise sq sq");

/**
* Removes the SQ structure under a WHNP question, such as "Who am I
Expand All @@ -271,10 +270,10 @@ public Tree SQflatten(Tree t) {
return Tsurgeon.processPattern(flattenSQTregex, flattenSQTsurgeon, t);
}

private static TregexPattern removeXOverXTregex =
private static final TregexPattern removeXOverXTregex =
TregexPattern.compile("__=repeat <: (~repeat < __)");

private static TsurgeonPattern removeXOverXTsurgeon = Tsurgeon.parseOperation("excise repeat repeat");
private static final TsurgeonPattern removeXOverXTsurgeon = Tsurgeon.parseOperation("excise repeat repeat");

public static Tree removeXOverX(Tree t) {
return Tsurgeon.processPattern(removeXOverXTregex, removeXOverXTsurgeon, t);
Expand Down Expand Up @@ -660,7 +659,7 @@ private static Tree findCCparent(Tree t, Tree root) {
/**
* Multi-word expression patterns
*/
private static TregexPattern[] MWE_PATTERNS = {
private static final TregexPattern[] MWE_PATTERNS = {
TregexPattern.compile("@CONJP <1 (RB=node1 < /^(?i)as$/) <2 (RB=node2 < /^(?i)well$/) <- (IN=node3 < /^(?i)as$/)"), //as well as
TregexPattern.compile("@ADVP|CONJP <1 (RB=node1 < /^(?i)as$/) <- (IN|RB=node2 < /^(?i)well$/)"), //as well
TregexPattern.compile("@PP < ((JJ=node1 < /^(?i)such$/) $+ (IN=node2 < /^(?i)as$/))"), //such as
Expand All @@ -686,36 +685,37 @@ private static Tree findCCparent(Tree t, Tree root) {
TregexPattern.compile("@WHADVP < ((WRB=node1 < /^(?i:how)$/) $+ (VB=node2 < /^(?i)come$/))"), //how come
TregexPattern.compile("@VP < ((VBD=node1 < had|'d) $+ (@PRT|ADVP=node2 <: (RBR < /^(?i)better$/)))"), //had better
TregexPattern.compile("@QP|XS < ((JJR|RBR|IN=node1 < /^(?i)(more|less)$/) $+ (IN=node2 < /^(?i)than$/))"), //more/less than
TregexPattern.compile("@QP < ((JJR|RBR|RP|IN=node1 < /^(?i)up$/) $+ (IN|TO=node2 < /^(?i)to$/))"), //up to
TregexPattern.compile("@QP|XS < ((JJR|RBR||RB|RP|IN=node1 < /^(?i)(up)$/) $+ (IN|TO=node2 < /^(?i)to$/))"), // up to
TregexPattern.compile("@QP < ((JJR|RBR|RB|RP|IN=node1 < /^(?i)up$/) $+ (IN|TO=node2 < /^(?i)to$/))"), //up to
TregexPattern.compile("@S|SQ|VP|ADVP|PP < (@ADVP < ((IN|RB=node1 < /^(?i)at$/) $+ (JJS|RBS=node2 < /^(?i)least$/)) !$+ (RB < /(?i)(once|twice)/))"), //at least

};

private static TsurgeonPattern MWE_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1 node2] [if exists node3 move node3 $- node2]");
private static final TsurgeonPattern MWE_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1 node2] [if exists node3 move node3 $- node2]");

private static TregexPattern ACCORDING_TO_PATTERN = TregexPattern.compile("PP=pp1 < (VBG=node1 < /^(?i)according$/ $+ (PP=pp2 < (TO|IN=node2 < to)))");
private static TsurgeonPattern ACCORDING_TO_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1] [move node2 $- node1] [excise pp2 pp2]");
private static final TregexPattern ACCORDING_TO_PATTERN = TregexPattern.compile("PP=pp1 < (VBG=node1 < /^(?i)according$/ $+ (PP=pp2 < (TO|IN=node2 < to)))");
private static final TsurgeonPattern ACCORDING_TO_OPERATION = Tsurgeon.parseOperation("[createSubtree MWE node1] [move node2 $- node1] [excise pp2 pp2]");

/* "but also" is not a MWE, so break up the CONJP. */
private static TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
private static TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");
private static final TregexPattern BUT_ALSO_PATTERN = TregexPattern.compile("CONJP=conjp < (CC=cc < but) < (RB=rb < also) ?$+ (__=nextNode < (__ < __))");
private static final TsurgeonPattern BUT_ALSO_OPERATION = Tsurgeon.parseOperation("[move cc $- conjp] [move rb $- cc] [if exists nextNode move rb >1 nextNode] [createSubtree ADVP rb] [delete conjp]");

/* at least / at most / at best / at worst / ... should be treated as if "at"
was a preposition and the RBS was a noun. Assumes that the MWE "at least"
has already been extracted. */
private static TregexPattern AT_RBS_PATTERN = TregexPattern.compile("@ADVP|QP < ((IN|RB=node1 < /^(?i)at$/) $+ (JJS|RBS=node2))");
private static TsurgeonPattern AT_RBS_OPERATION = Tsurgeon.parseOperation("[relabel node1 IN] [createSubtree ADVP node1] [move node2 $- node1] [createSubtree NP node2]");
private static final TregexPattern AT_RBS_PATTERN = TregexPattern.compile("@ADVP|QP < ((IN|RB=node1 < /^(?i)at$/) $+ (JJS|RBS=node2))");
private static final TsurgeonPattern AT_RBS_OPERATION = Tsurgeon.parseOperation("[relabel node1 IN] [createSubtree ADVP node1] [move node2 $- node1] [createSubtree NP node2]");

/* at all should be treated like a PP. */
private static TregexPattern AT_ALL_PATTERN = TregexPattern.compile("@ADVP=head < (RB|IN=node1 < /^(?i)at$/ $+ (RB|DT=node2 < /^(?i)all$/))");
private static TsurgeonPattern AT_ALL_OPERATION = Tsurgeon.parseOperation("[relabel head PP] [relabel node1 IN] [createSubtree NP node2]");
private static final TregexPattern AT_ALL_PATTERN = TregexPattern.compile("@ADVP=head < (RB|IN=node1 < /^(?i)at$/ $+ (RB|DT=node2 < /^(?i)all$/))");
private static final TsurgeonPattern AT_ALL_OPERATION = Tsurgeon.parseOperation("[relabel head PP] [relabel node1 IN] [createSubtree NP node2]");

/**
* Puts all multi-word expressions below a single constituent labeled "MWE".
* Patterns for multi-word expressions are defined in MWE_PATTERNS.
*/
public static Tree MWETransform(Tree t) {
for (TregexPattern p: MWE_PATTERNS) {
for (TregexPattern p : MWE_PATTERNS) {
Tsurgeon.processPattern(p, MWE_OPERATION, t);
}

Expand All @@ -728,8 +728,8 @@ public static Tree MWETransform(Tree t) {
}


private static TregexPattern FLAT_PREP_CC_PATTERN = TregexPattern.compile("PP <, (/^(IN|TO)$/=p1 $+ (CC=cc $+ /^(IN|TO)$/=p2))");
private static TsurgeonPattern FLAT_PREP_CC_OPERATION = Tsurgeon.parseOperation("[createSubtree PCONJP p1 cc] [move p2 $- cc]");
private static final TregexPattern FLAT_PREP_CC_PATTERN = TregexPattern.compile("PP <, (/^(IN|TO)$/=p1 $+ (CC=cc $+ /^(IN|TO)$/=p2))");
private static final TsurgeonPattern FLAT_PREP_CC_OPERATION = Tsurgeon.parseOperation("[createSubtree PCONJP p1 cc] [move p2 $- cc]");

public static Tree prepCCTransform(Tree t) {

Expand All @@ -738,16 +738,15 @@ public static Tree prepCCTransform(Tree t) {
return t;
}

private static TregexPattern GAPPING_PATTERN = TregexPattern.compile("/^[^G].*/=gphrase < (/^[^V].*-ORPH.*/ $ /^[^V].*-ORPH.*/)");
private static TsurgeonPattern GAPPING_OPERATION = Tsurgeon.parseOperation("[adjoinH (GP (GAPPINGP@ )) gphrase] ");
private static final TregexPattern GAPPING_PATTERN = TregexPattern.compile("/^[^G].*/=gphrase < (/^[^V].*-ORPH.*/ $ /^[^V].*-ORPH.*/)");
private static final TsurgeonPattern GAPPING_OPERATION = Tsurgeon.parseOperation("[adjoinH (GP (GAPPINGP@ )) gphrase] ");


/**
* Runs {@code Tsurgeon.processPattern} with {@code GAPPING_PATTERN} and
* {@code GAPPING_OPERATION} on the given tree.
*
* @param t the tree to process
* @return the same {@code t} reference that was passed in
*/
public static Tree gappingTransform(Tree t) {

// The value returned by processPattern is not used; the caller gets back
// the same reference it passed in.
Tsurgeon.processPattern(GAPPING_PATTERN, GAPPING_OPERATION, t);

return t;

}

public static void main(String[] args) {
Expand Down
6 changes: 3 additions & 3 deletions src/edu/stanford/nlp/trees/EnglishGrammaticalRelations.java
Original file line number Diff line number Diff line change
Expand Up @@ -984,9 +984,9 @@ private EnglishGrammaticalRelations() {}
* The "quantifier phrase modifier" grammatical relation. A quantifier
* modifier is an element modifying the head of a QP constituent.
* <br>
* Example: <br>
* "About 200 people came to the party" &rarr;
* {@code quantmod}(200, About)
* Examples: <br>
* "About 200 people came to the party" &rarr; {@code quantmod}(200, About) <br>
* "They weigh up to 200 kilograms" &rarr; {@code quantmod}(200, to)
*/
public static final GrammaticalRelation QUANTIFIER_MODIFIER =
new GrammaticalRelation(Language.English, "quantmod", "quantifier modifier",
Expand Down
28 changes: 13 additions & 15 deletions src/edu/stanford/nlp/trees/QPTreeTransformer.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package edu.stanford.nlp.trees;



import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
Expand Down Expand Up @@ -31,7 +29,7 @@
public class QPTreeTransformer implements TreeTransformer {


private boolean universalDependencies = false;
private boolean universalDependencies; // = false;

public QPTreeTransformer() {
this(false);
Expand All @@ -46,7 +44,7 @@ public QPTreeTransformer(boolean universalDependencies) {
* Right now (Jan 2013) we only deal with the following QP structures:
* <ul>
* <li> NP (QP ...) (QP (CC and/or) ...)
* <li> QP (RB IN CD|DT ...) well over, more than
* <li> QP (RB|RP IN CD|DT ...) well over, more than, up to
* <li> QP (JJR IN CD|DT ...) fewer than
* <li> QP (IN JJS CD|DT ...) at least
* <li> QP (... CC ...) between 5 and 10
Expand All @@ -61,21 +59,21 @@ public Tree transformTree(Tree t) {
}


private static TregexPattern flattenNPoverQPTregex =
private static final TregexPattern flattenNPoverQPTregex =
TregexPattern.compile("NP < (QP=left $+ (QP=right < CC))");

private static TsurgeonPattern flattenNPoverQPTsurgeon =
private static final TsurgeonPattern flattenNPoverQPTsurgeon =
Tsurgeon.parseOperation("[createSubtree QP left right] [excise left left] [excise right right]");

private static TregexPattern multiwordXSLTregex =
private static final TregexPattern multiwordXSLTregex =
// captures "up to"
// once "up to" is captured in the XSL, the following XS operation won't accidentally grab it
TregexPattern.compile("QP < ( /^RB|IN|RP/=left < /^(?:up)$/ ) < ( /^IN|TO/=right < /^(?:to)$/ $- =left )");
TregexPattern.compile("QP < ( RB|IN|RP=left < /^(?i:up)$/ $+ ( IN|TO=right < /^(?i:to)$/ ))");

private static TsurgeonPattern multiwordXSLTsurgeon =
private static final TsurgeonPattern multiwordXSLTsurgeon =
Tsurgeon.parseOperation("createSubtree XSL left right");

private static TregexPattern multiwordXSTregex =
private static final TregexPattern multiwordXSTregex =
// TODO: should add NN and $ to the numeric expressions captured
// NN is for words such as "half" which are probably misparsed
// TODO: <3 (IN < as|than) is to avoid one weird case in PTB,
Expand All @@ -84,22 +82,22 @@ public Tree transformTree(Tree t) {
// TODO: "all but about X"
TregexPattern.compile("QP <1 /^RB|JJ|IN/=left [ ( <2 /^JJ|IN/=right <3 /^CD|DT/ ) | ( <2 /^JJ|IN/ <3 ( IN=right < /^(?i:as|than)$/ ) <4 /^CD|DT/ ) ] ");

private static TsurgeonPattern multiwordXSTsurgeon =
private static final TsurgeonPattern multiwordXSTsurgeon =
Tsurgeon.parseOperation("createSubtree XS left right");

// the old style split any flat QP with a CC in the middle
// TODO: there should be some allowances for phrases such as "or more", "or so", etc.
private static TregexPattern splitCCTregex =
private static final TregexPattern splitCCTregex =
TregexPattern.compile("QP < (CC $- __=r1 $+ __=l2 ?$-- /^[$]|CC$/=lnum ?$++ /^[$]|CC$/=rnum) <1 __=l1 <- __=r2 !< (__ < (__ < __))");

private static TsurgeonPattern splitCCTsurgeon =
private static final TsurgeonPattern splitCCTsurgeon =
Tsurgeon.parseOperation("[if exists lnum createSubtree QP l1 r1] [if not exists lnum createSubtree NP l1 r1] " +
"[if exists rnum createSubtree QP l2 r2] [if not exists rnum createSubtree NP l2 r2]");

private static TregexPattern splitMoneyTregex =
private static final TregexPattern splitMoneyTregex =
TregexPattern.compile("QP < (/^[$]$/ !$++ /^(?!([$]|CD)).*$/ !$++ (__ < (__ < __)) $+ __=left) <- __=right");

private static TsurgeonPattern splitMoneyTsurgeon =
private static final TsurgeonPattern splitMoneyTsurgeon =
Tsurgeon.parseOperation("createSubtree QP left right");

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1222,8 +1222,10 @@ private UniversalEnglishGrammaticalRelations() {}
* "fewer than 700 bottles" &rarr;
* {@code mwe}(fewer, than)
*
* TODO: Fix variable names etc. The right output relation ("fixed") is already used, but the name "mwe" is from UDv1; it should now be "fixed".
*
* @see CoordinationTransformer#MWETransform(Tree)
* @see <a href="http://universaldependencies.github.io/docs/en/dep/mwe.html">List of multi-word expressions</a>
* @see <a href="https://universaldependencies.org/en/dep/fixed.html">List of multi-word expressions</a>
*/
public static final GrammaticalRelation MULTI_WORD_EXPRESSION =
new GrammaticalRelation(Language.UniversalEnglish, "fixed", "multi-word expression",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* @author John Bauer
*/
public class CoordinationTransformerTest extends TestCase {

static final String SYM_DONT_MOVE_RB = "(ROOT (S (NP (NP (NN fire) (NN gear)) (, ,) (ADVP (RB annually)) (SYM fy) (: -)) (VP (NN fy) (: :))))";

public void testMoveRB() {
Expand Down
Loading

0 comments on commit 6e14527

Please sign in to comment.