Merge pull request #3 from andrewresearch/develop

develop
nlytx · Oct 29, 2017 · 3c86593 · 3c86593
2 parents 5b8a190 + fd58e24
commit 3c86593
Show file tree

Hide file tree

Showing 378 changed files with 4,680 additions and 28,168 deletions.
diff --git a/.gitignore b/.gitignore
@@ -51,3 +51,4 @@ crashlytics-build.properties
 fabric.properties
 
 /.idea/*
+/src/main/scala/worksheet.sc
diff --git a/build.sbt b/build.sbt
@@ -11,6 +11,7 @@ val scalaParserV = "1.0.6"
 val jblasV = "1.2.4"
 val apacheComsCompressV = "1.15"
 val apacheComsLangV = "3.6"
+val factorieV = "1.2"
 
 val scalaLangDeps = Seq(
   "org.scala-lang.modules" %% "scala-parser-combinators" % scalaParserV,
@@ -19,7 +20,8 @@ val scalaLangDeps = Seq(
 )
 
 val scalaDeps = Seq(
-  "org.json4s" %% "json4s-jackson" % "3.5.3"
+  "org.json4s" %% "json4s-jackson" % "3.5.3",
+  // single % : the artifact name is used as-is, no Scala binary-version suffix is appended
+  "cc.factorie.app.nlp" % "all-models" % factorieV
 )
 
 val javaDeps = Seq(
@@ -50,4 +52,19 @@ sourceGenerators in Compile += {
 
 //Enable this only for local builds - disabled for Travis
 enablePlugins(JavaAppPackaging) // sbt universal:packageZipTarball
-//dockerExposedPorts := Seq(9000) // sbt docker:publishLocal
+//dockerExposedPorts := Seq(9000) // sbt docker:publishLocal
+
+// Launch options for the application packaged through the Universal scope
+// (JavaAppPackaging is enabled above).
+javaOptions in Universal ++= Seq(
+  // -J params will be added as jvm parameters
+  // -Xmx6g caps the JVM heap at 6 GB; -Xms3g sets the initial heap to 3 GB.
+  "-J-Xmx6g",
+  "-J-Xms3g"
+
+  // others will be added as app parameters
+  //  "-Dproperty=true",
+  //  "-port=8080",
+
+  // you can access any build setting/task here
+  //s"-version=${version.value}"
+)
+
+// NOTE(review): this repository is addressed over plain http, so artifacts are fetched without
+// transport security - switch to https if the host serves it (TODO confirm).
+resolvers += "IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public"
diff --git a/src/main/scala/cc/factorie/app/chain/Lexicons.scala b/src/main/scala/cc/factorie/app/chain/Lexicons.scala
@@ -12,14 +12,15 @@
    limitations under the License. */
 
 package cc.factorie.app.chain
+
 import cc.factorie.app.nlp.{Token, TokenSpan}
 
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 import scala.io.BufferedSource
 
-/** Methods of retrieving the lexicons that a token in a document (using the window around the token) or a span matches into
-  * returns the lexicons names, and the location the token matches into the lexicon (like B-label, I-label, U-label, or L-label)
+/** Methods of retrieving the lexicon that a token in a document (using the window around the token) or a span matches into
+  * returns the lexicon names, and the location the token matches into the lexicon (like B-label, I-label, U-label, or L-label)
     @author anzaroot */
 class Lexicons( val sources : List[(String,BufferedSource)]) {
   val lexiconMap = mutable.HashMap[String, List[String]]()
@@ -50,7 +51,7 @@ class Lexicons( val sources : List[(String,BufferedSource)]) {
       val key = removeTrail(keyPre.map(_.string).mkString(" "))
       if(lexiconMap.contains(key) && (removeTrail(token.string) != "" || (keyPre.head.position < token.position && keyPre.last.position > token.position ))) {
         lexes = lexiconMap(key).map(locate(token, keyPre) + _) ::: lexes
-        //println("Found for token: " + token.string + " with key: " + keyPre + " the lexicons: " + lexiconMap(key).mkString(" , "))
+        //println("Found for token: " + token.string + " with key: " + keyPre + " the lexicon: " + lexiconMap(key).mkString(" , "))
         //println("And phrase: " + phrase.map( _.string ).mkString(" "))
       }
     }

diff --git a/src/main/scala/cc/factorie/app/nlp/BasicSection.scala b/src/main/scala/cc/factorie/app/nlp/BasicSection.scala
@@ -0,0 +1,8 @@
+package cc.factorie.app.nlp
+
+/**
+  * Created by [email protected] on 27/10/17.
+  */
+
+/** A minimal concrete Section whose document and string bounds are plain constructor fields.
+  *
+  * @param document    the Document this section belongs to
+  * @param stringStart start of the section (presumably a character offset into the document string - confirm against Section)
+  * @param stringEnd   end of the section (presumably an exclusive character offset - confirm against Section)
+  */
+class BasicSection(
+  val document: Document,
+  val stringStart: Int,
+  val stringEnd: Int
+) extends Section
diff --git a/src/main/scala/cc/factorie/app/nlp/Document.scala b/src/main/scala/cc/factorie/app/nlp/Document.scala
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala
@@ -0,0 +1,36 @@
+package cc.factorie.app.nlp
+
+/**User: apassos
+  * Date: 8/7/13
+  * Time: 2:48 PM
+  */
+
+/** A sequence of DocumentAnnotators packaged as a single DocumentAnnotator.
+  * Annotators run in the given order; an annotator is skipped when the document already has
+  * any of the annotations listed in its postAttrs. After each annotator runs, Document.annotators
+  * is populated with a record of which DocumentAnnotator class provided which annotation class.
+  *
+  * @param annotators  the annotators to apply, in order
+  * @param prereqAttrs annotation classes this pipeline declares as prerequisites (empty by default)
+  */
+class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator {
+  /** When true, process() accumulates the timing and token counters below. */
+  var profile = false
+  /** Running total of doc.tokenCount over all documents processed while profiling. */
+  var tokensProcessed = 0
+  /** Running total of wall-clock milliseconds spent in process() while profiling. */
+  var msProcessed = 0L
+  /** Cumulative milliseconds spent inside each annotator while profiling, in first-use order. */
+  val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]()
+  /** The distinct annotation classes provided by the annotators of this pipeline. */
+  def postAttrs: Seq[Class[_]] = annotators.flatMap(_.postAttrs).distinct
+  /** Applies the annotators in order and returns the document produced by the last one that ran
+    * (the input document itself when every annotator was skipped). */
+  def process(document: Document): Document = {
+    var doc = document
+    val t00 = System.currentTimeMillis()
+    for (annotator <- annotators; if annotator.postAttrs.forall(!doc.hasAnnotation(_))) {
+      val t0 = System.currentTimeMillis()
+      doc = annotator.process(doc)
+      if (profile) timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - t0
+      // Record provenance on the document the annotator returned: that is the instance the
+      // guard above inspects for later annotators and the one handed back to the caller.
+      annotator.postAttrs.foreach(a => doc.annotators(a) = annotator.getClass)
+    }
+    if (profile) {
+      msProcessed += System.currentTimeMillis() - t00
+      tokensProcessed += doc.tokenCount
+    }
+    doc
+  }
+  /** A human-readable summary of the profiling counters: one overall line, then one line per annotator.
+    * Rates are floating-point divisions, so they print as NaN/Infinity while the elapsed time is still 0. */
+  def profileReport: String = {
+    val perAnnotator = timePerAnnotator.map { case (annotator, ms) =>
+      f"   ${annotator.getClass.getSimpleName}%30s: ${tokensProcessed.toDouble*1000.0/ms}%4.4f tokens/sec "
+    }.mkString("\n")
+    // The summary line ends with a line break so the component heading starts on its own line.
+    s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokensProcessed.toDouble*1000.0/msProcessed} tokens / second\n" +
+      "Speeds of individual components:\n" + perAnnotator
+  }
+  /** The per-token annotation strings of all annotators, joined with tabs.
+    * NOTE(review): an annotator that returns null here shows up as the text "null" in its column - confirm that is intended. */
+  def tokenAnnotationString(token: Token): String = annotators.map(_.tokenAnnotationString(token)).mkString("\t")
+}
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala
@@ -1,17 +1,5 @@
-/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
-   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
-   http://factorie.cs.umass.edu, http://github.com/factorie
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
 package cc.factorie.app.nlp
+
 import cc.factorie.app.nlp.coref.Mention
 import cc.factorie.app.nlp.phrase.Phrase
 import cc.factorie.util.Threading
@@ -28,46 +16,10 @@ trait DocumentAnnotator {
   /** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format.
       If there is no per-token annotation, return null.  Used in Document.owplString. */
   def tokenAnnotationString(token:Token): String
-  
+
   /** How the annotation of this DocumentAnnotator should be printed as extra information after a one-word-per-line (OWPL) format.
       If there is no document annotation, return the empty string.  Used in Document.owplString. */
   def documentAnnotationString(document:Document): String = ""
   def phraseAnnotationString(phrase:Phrase): String = ""
   def mentionAnnotationString(mention:Mention): String = ""
 }
-
-/** Used as a stand-in dummy DocumentAnnotator in the DocumentAnnotatorMap when an annotation was added but not by a real DocumentAnnotator. */
-object UnknownDocumentAnnotator extends DocumentAnnotator {
-  def process(document: Document): Document = document
-  def prereqAttrs: Iterable[Class[_]] = Nil
-  def postAttrs: Iterable[Class[_]] = Nil
-  def tokenAnnotationString(token: Token) = null
-}
-
-object NoopDocumentAnnotator extends DocumentAnnotator {
-  def process(document: Document): Document = document
-  def prereqAttrs: Iterable[Class[_]] = Nil
-  def postAttrs: Iterable[Class[_]] = Nil
-  def tokenAnnotationString(token: Token) = null
-}
-
-class CompoundDocumentAnnotator(val annos:Seq[DocumentAnnotator]) extends DocumentAnnotator {
-  // for java compat
-  def this(annoArr:Array[DocumentAnnotator]) = this(annoArr.toSeq)
-  def tokenAnnotationString(token: Token) = annos.map(anno => Option(anno.tokenAnnotationString(token))).mkString("\t")
-
-  lazy val prereqAttrs = annos.flatMap(_.prereqAttrs).toSet diff postAttrs
-  lazy val postAttrs = annos.flatMap(_.postAttrs).toSet
-
-  def process(document: Document) = {
-    // left fold, but faster, thanks scala
-    var doc = document
-    val iter = annos.iterator
-    while(iter.hasNext) {
-      val anno = iter.next()
-      //println(s"annotating document ${doc.name} with ${anno.getClass.getName}")
-      doc = anno.process(doc)
-    }
-    doc
-  }
-}
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala
@@ -1,92 +1,37 @@
-/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
-   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
-   http://factorie.cs.umass.edu, http://github.com/factorie
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 package cc.factorie.app.nlp
 
 import cc.factorie.util.FastLogging
 
 import scala.reflect.ClassTag
 
-/**User: apassos
- * Date: 8/7/13
- * Time: 2:48 PM
- */
 
-/** A sequence of DocumentAnnotators packaged as a single DocumentAnnotator.
-    This class also properly populates the Document.annotators with a record of which DocumentAnnotator classes provided which annotation classes. */
-class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator {
-  var profile = false
-  var tokensProcessed = 0
-  var msProcessed = 0L
-  val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]()
-  def postAttrs = annotators.flatMap(_.postAttrs).distinct
-  def process(document: Document) = {
-    var doc = document
-    val t00 = System.currentTimeMillis()
-    for (annotator <- annotators; if annotator.postAttrs.forall(!doc.hasAnnotation(_))) {
-      val t0 = System.currentTimeMillis()
-      doc = annotator.process(doc)
-      if (profile) timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - t0
-      annotator.postAttrs.foreach(a => document.annotators(a) = annotator.getClass)
-    }
-    if (profile) {
-      msProcessed += System.currentTimeMillis() - t00
-      tokensProcessed += doc.tokenCount
-    }
-    doc
-  }
-  def profileReport: String = {
-    s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokensProcessed.toDouble*1000.0/msProcessed} tokens / second " +
-    "Speeds of individual components:\n" + timePerAnnotator.map(i => f"   ${i._1.getClass.getSimpleName}%30s: ${tokensProcessed.toDouble*1000.0/i._2}%4.4f tokens/sec ").mkString("\n")
-  }
-  def tokenAnnotationString(token: Token): String = annotators.map(_.tokenAnnotationString(token)).mkString("\t")
-}
 
-/** A Map from annotation class to DocumentAnnotator that provides that annotation. 
-    Used to store default ways of getting certain prerequisite annotations. */
-class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] {
-  def +=(annotator: DocumentAnnotator) = annotator.postAttrs.foreach(a => this(a) = () => annotator)
-}
 
 /** A factory for creating DocumentAnnotatorPipelines given requirements about which annotations or which DocumentAnnotators are desired. */
 object DocumentAnnotatorPipeline extends FastLogging  {
+
   val defaultDocumentAnnotationMap: DocumentAnnotatorMap = new collection.immutable.ListMap ++ Seq(
     // Note that order matters here
-    classOf[pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger),
-    classOf[parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser),
-    classOf[segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer),
+    classOf[cc.factorie.app.nlp.pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger),
+    classOf[cc.factorie.app.nlp.parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser),
+    classOf[cc.factorie.app.nlp.segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer),
     classOf[Token] -> (() => segment.DeterministicNormalizingTokenizer),
     classOf[Sentence] -> (() => segment.DeterministicSentenceSegmenter),
-    classOf[lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer),
-    //classOf[lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer),
-    //classOf[lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer),
-    //classOf[lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer),
-    //classOf[lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer),
-    classOf[ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default?
-    //classOf[ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer),
-    classOf[ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer),
-    //classOf[ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator),
-    classOf[ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator),
-    //classOf[coref.mention.NerMentionList] -> (() => coref.mention.NerAndPronounMentionFinder),
-    //classOf[phrase.GenderLabel[coref.Mention]] -> (() => phrase.GenderLabeler[]),
-    classOf[phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler),
-    classOf[phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler),
-    classOf[phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder),
-    classOf[coref.WithinDocCoref] -> (() => coref.NerForwardCoref),
-    classOf[relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder)
-    //classOf[phrase.NumberLabel[phrase.NounPhrase]] -> (() => phrase.NounPhraseNumberLabeler),
-    //classOf[MentionEntityType] ->  (() => coref.mention.MentionEntityTypeLabeler),
-    //classOf[cc.factorie.util.coref.GenericEntityMap[coref.mention.Mention]] -> (() => coref.NerForwardCoref)
-
+    classOf[cc.factorie.app.nlp.lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer),
+    classOf[cc.factorie.app.nlp.lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer),
+    classOf[cc.factorie.app.nlp.lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer),
+    classOf[cc.factorie.app.nlp.lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer),
+    classOf[cc.factorie.app.nlp.lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer),
+    classOf[cc.factorie.app.nlp.ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default?
+    classOf[cc.factorie.app.nlp.ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer),
+    classOf[cc.factorie.app.nlp.ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer),
+    classOf[cc.factorie.app.nlp.ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator),
+    classOf[cc.factorie.app.nlp.ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator),
+    classOf[cc.factorie.app.nlp.phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler),
+    classOf[cc.factorie.app.nlp.phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler),
+    classOf[cc.factorie.app.nlp.phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder),
+    classOf[cc.factorie.app.nlp.coref.WithinDocCoref] -> (() => coref.NerForwardCoref),
+    classOf[cc.factorie.app.nlp.relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder)
   )
 
   //def apply(goal: Class[_]): DocumentAnnotationPipeline = apply(Seq(goal), defaultDocumentAnnotationMap)
@@ -153,5 +98,4 @@ object DocumentAnnotatorPipeline extends FastLogging  {
       }
     }
   }
-}
-
+}
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentName.scala b/src/main/scala/cc/factorie/app/nlp/DocumentName.scala
@@ -0,0 +1,6 @@
+package cc.factorie.app.nlp
+
+/** Wrapper for a document's name, meant to be stored as an attribute on a Document.
+  * Its string form is the bare name rather than the default case-class rendering.
+  */
+case class DocumentName(string: String) {
+  override def toString: String = {
+    string
+  }
+}
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala b/src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala
@@ -0,0 +1,16 @@
+package cc.factorie.app.nlp
+
+/** A portion of the string contents of a Document, identified by the half-open
+  * character range [stringStart, stringEnd) within Document.string.
+  *
+  * @author Andrew McCallum
+  */
+trait DocumentSubstring {
+  /** The Document of which this DocumentSubstring is a part. */
+  def document: Document
+  /** The character offset into the Document.string at which this DocumentSubstring begins. */
+  def stringStart: Int
+  /** The character offset into the Document.string at which this DocumentSubstring is over.
+    * In other words, the last character of the DocumentSubstring is Document.string(this.stringEnd-1). */
+  def stringEnd: Int
+  /** The substring of the Document encompassed by this DocumentSubstring. */
+  def string: String
+}
diff --git a/src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala b/src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala
@@ -0,0 +1,7 @@
+package cc.factorie.app.nlp
+
+/** A Map from annotation class to a factory for the DocumentAnnotator that provides that annotation.
+  * Used to store default ways of getting certain prerequisite annotations.
+  */
+class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] {
+  /** Registers `annotator` as the provider of every annotation class in its postAttrs,
+    * replacing any provider previously stored for those classes. */
+  def +=(annotator: DocumentAnnotator): Unit =
+    for (attr <- annotator.postAttrs) update(attr, () => annotator)
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -51,3 +51,4 @@ crashlytics-build.properties
		fabric.properties

		/.idea/*
		/src/main/scala/worksheet.sc