Skip to content

Commit

Permalink
Merge pull request #3 from andrewresearch/develop
Browse files Browse the repository at this point in the history
develop
  • Loading branch information
andrewresearch authored Oct 29, 2017
2 parents 5b8a190 + fd58e24 commit 3c86593
Show file tree
Hide file tree
Showing 378 changed files with 4,680 additions and 28,168 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,4 @@ crashlytics-build.properties
fabric.properties

/.idea/*
/src/main/scala/worksheet.sc
21 changes: 19 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ val scalaParserV = "1.0.6"
val jblasV = "1.2.4"
val apacheComsCompressV = "1.15"
val apacheComsLangV = "3.6"
val factorieV = "1.2"

val scalaLangDeps = Seq(
"org.scala-lang.modules" %% "scala-parser-combinators" % scalaParserV,
Expand All @@ -19,7 +20,8 @@ val scalaLangDeps = Seq(
)

val scalaDeps = Seq(
"org.json4s" %% "json4s-jackson" % "3.5.3"
"org.json4s" %% "json4s-jackson" % "3.5.3",
"cc.factorie.app.nlp" % "all-models" % factorieV
)

val javaDeps = Seq(
Expand Down Expand Up @@ -50,4 +52,19 @@ sourceGenerators in Compile += {

//Enable this only for local builds - disabled for Travis
enablePlugins(JavaAppPackaging) // sbt universal:packageZipTarball
//dockerExposedPorts := Seq(9000) // sbt docker:publishLocal
//dockerExposedPorts := Seq(9000) // sbt docker:publishLocal

// Runtime options appended to the Universal packaging configuration.
javaOptions in Universal ++= Seq(
// Options prefixed with -J are passed to the JVM itself:
// -Xmx6g sets the maximum heap size, -Xms3g the initial heap size.
"-J-Xmx6g",
"-J-Xms3g"

// Any other option would be passed to the application as a parameter, e.g.:
// "-Dproperty=true",
// "-port=8080",

// Build settings and tasks can also be referenced here, e.g.:
//s"-version=${version.value}"
)

resolvers += "IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public"
7 changes: 4 additions & 3 deletions src/main/scala/cc/factorie/app/chain/Lexicons.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
limitations under the License. */

package cc.factorie.app.chain

import cc.factorie.app.nlp.{Token, TokenSpan}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.io.BufferedSource

/** Methods of retrieving the lexicons that a token in a document (using the window around the token) or a span matches into
* returns the lexicons names, and the location the token matches into the lexicon (like B-label, I-label, U-label, or L-label)
/** Methods for retrieving the lexicons that a token in a document (using the window around the token) or a span matches into.
* Returns the lexicon names and the location at which the token matches into the lexicon (like B-label, I-label, U-label, or L-label).
@author anzaroot */
class Lexicons( val sources : List[(String,BufferedSource)]) {
val lexiconMap = mutable.HashMap[String, List[String]]()
Expand Down Expand Up @@ -50,7 +51,7 @@ class Lexicons( val sources : List[(String,BufferedSource)]) {
val key = removeTrail(keyPre.map(_.string).mkString(" "))
if(lexiconMap.contains(key) && (removeTrail(token.string) != "" || (keyPre.head.position < token.position && keyPre.last.position > token.position ))) {
lexes = lexiconMap(key).map(locate(token, keyPre) + _) ::: lexes
//println("Found for token: " + token.string + " with key: " + keyPre + " the lexicons: " + lexiconMap(key).mkString(" , "))
//println("Found for token: " + token.string + " with key: " + keyPre + " the lexicon: " + lexiconMap(key).mkString(" , "))
//println("And phrase: " + phrase.map( _.string ).mkString(" "))
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/main/scala/cc/factorie/app/nlp/BasicSection.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package cc.factorie.app.nlp

/**
* Created by [email protected] on 27/10/17.
*/

/** A simple concrete implementation of Section.
*
* @param document the Document this section is constructed with
* @param stringStart start offset of the section (presumably a character offset into the document's string; confirm against Section)
* @param stringEnd end offset of the section (presumably exclusive; confirm against Section)
*/
class BasicSection(val document:Document, val stringStart:Int, val stringEnd:Int) extends Section
159 changes: 66 additions & 93 deletions src/main/scala/cc/factorie/app/nlp/Document.scala

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package cc.factorie.app.nlp

/**User: apassos
* Date: 8/7/13
* Time: 2:48 PM
*/

/** Runs a sequence of DocumentAnnotators as if they were a single DocumentAnnotator.
  *
  * An annotator is skipped when any of its `postAttrs` is already present on the
  * current document. After an annotator runs, each of its `postAttrs` is recorded in
  * the `annotators` map of the Document passed to `process`, mapped to the annotator's class.
  * When `profile` is true, wall-clock time per annotator and total tokens are accumulated.
  *
  * @param annotators  the annotators to apply, in order
  * @param prereqAttrs the prerequisite annotation classes reported by this pipeline (empty by default)
  */
class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator {
  /** When true, `process` accumulates timing and token statistics. */
  var profile = false
  /** Tokens counted by `process` while profiling was enabled. */
  var tokensProcessed = 0
  /** Milliseconds spent in `process` while profiling was enabled. */
  var msProcessed = 0L
  /** Accumulated milliseconds per annotator, kept in insertion order. */
  val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]()

  /** The distinct annotation classes provided by all annotators in this pipeline. */
  def postAttrs = annotators.flatMap(_.postAttrs).distinct

  /** Applies each applicable annotator in order and returns the resulting document. */
  def process(document: Document) = {
    val pipelineStart = System.currentTimeMillis()
    var current = document
    annotators.foreach { annotator =>
      // The check is made against the current document immediately before each annotator runs.
      val alreadyAnnotated = annotator.postAttrs.exists(attr => current.hasAnnotation(attr))
      if (!alreadyAnnotated) {
        val annotatorStart = System.currentTimeMillis()
        current = annotator.process(current)
        if (profile) {
          timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - annotatorStart
        }
        // NOTE(review): the record is written to the `document` argument while the skip check reads
        // `current`; these differ only if an annotator returns a different Document instance. Confirm intended.
        for (attr <- annotator.postAttrs) document.annotators(attr) = annotator.getClass
      }
    }
    if (profile) {
      msProcessed += System.currentTimeMillis() - pipelineStart
      tokensProcessed += current.tokenCount
    }
    current
  }

  /** A human-readable summary of overall throughput followed by one line per profiled annotator. */
  def profileReport: String = {
    val tokens = tokensProcessed.toDouble
    val summary = s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokens*1000.0/msProcessed} tokens / second "
    val perAnnotator = timePerAnnotator.map { case (annotator, ms) =>
      f" ${annotator.getClass.getSimpleName}%30s: ${tokens*1000.0/ms}%4.4f tokens/sec "
    }
    summary + "Speeds of individual components:\n" + perAnnotator.mkString("\n")
  }

  /** Tab-separated per-token annotation strings, one column per annotator. */
  def tokenAnnotationString(token: Token): String = {
    val columns = annotators.map(annotator => annotator.tokenAnnotationString(token))
    columns.mkString("\t")
  }
}
52 changes: 2 additions & 50 deletions src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala
Original file line number Diff line number Diff line change
@@ -1,17 +1,5 @@
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
http://factorie.cs.umass.edu, http://github.com/factorie
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

package cc.factorie.app.nlp

import cc.factorie.app.nlp.coref.Mention
import cc.factorie.app.nlp.phrase.Phrase
import cc.factorie.util.Threading
Expand All @@ -28,46 +16,10 @@ trait DocumentAnnotator {
/** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format.
If there is no per-token annotation, return null. Used in Document.owplString. */
def tokenAnnotationString(token:Token): String

/** How the annotation of this DocumentAnnotator should be printed as extra information after a one-word-per-line (OWPL) format.
If there is no document annotation, return the empty string. Used in Document.owplString. */
def documentAnnotationString(document:Document): String = ""
def phraseAnnotationString(phrase:Phrase): String = ""
def mentionAnnotationString(mention:Mention): String = ""
}

/** Placeholder DocumentAnnotator used in the DocumentAnnotatorMap when an annotation was added, but not by a real DocumentAnnotator.
  * It declares no prerequisite or provided annotations and returns documents unchanged. */
object UnknownDocumentAnnotator extends DocumentAnnotator {
  def prereqAttrs: Iterable[Class[_]] = Nil
  def postAttrs: Iterable[Class[_]] = Nil

  /** Returns the given document as-is. */
  def process(document: Document): Document = document

  /** Always null: this annotator has no per-token annotation to print. */
  def tokenAnnotationString(token: Token) = null
}

/** A DocumentAnnotator that does nothing: it declares no prerequisite or provided
  * annotations and returns documents unchanged. */
object NoopDocumentAnnotator extends DocumentAnnotator {
  def prereqAttrs: Iterable[Class[_]] = Nil
  def postAttrs: Iterable[Class[_]] = Nil

  /** Returns the given document as-is. */
  def process(document: Document): Document = document

  /** Always null: this annotator has no per-token annotation to print. */
  def tokenAnnotationString(token: Token) = null
}

/** Applies several DocumentAnnotators in sequence, presenting them as one DocumentAnnotator.
  *
  * @param annos the annotators to apply, in order
  */
class CompoundDocumentAnnotator(val annos:Seq[DocumentAnnotator]) extends DocumentAnnotator {
  /** Auxiliary constructor taking an Array, for Java compatibility. */
  def this(annoArr:Array[DocumentAnnotator]) = this(annoArr.toSeq)

  /** Tab-separated per-token annotation strings, one column per annotator.
    * An annotator that returns null (no per-token annotation) contributes an empty column.
    * The Option is unwrapped before joining so that its own rendering ("Some(...)" / "None")
    * does not leak into the output. */
  def tokenAnnotationString(token: Token) =
    annos.map(anno => Option(anno.tokenAnnotationString(token)).getOrElse("")).mkString("\t")

  /** Prerequisites of all member annotators, minus those the members themselves provide. */
  lazy val prereqAttrs = annos.flatMap(_.prereqAttrs).toSet diff postAttrs
  /** The set of annotation classes provided by all member annotators. */
  lazy val postAttrs = annos.flatMap(_.postAttrs).toSet

  /** Threads the document through every annotator in order and returns the final result. */
  def process(document: Document) = {
    // Equivalent to a left fold over annos; written as a while loop deliberately for speed.
    var doc = document
    val iter = annos.iterator
    while (iter.hasNext) {
      doc = iter.next().process(doc)
    }
    doc
  }
}
96 changes: 20 additions & 76 deletions src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala
Original file line number Diff line number Diff line change
@@ -1,92 +1,37 @@
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
http://factorie.cs.umass.edu, http://github.com/factorie
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
package cc.factorie.app.nlp

import cc.factorie.util.FastLogging

import scala.reflect.ClassTag

/**User: apassos
* Date: 8/7/13
* Time: 2:48 PM
*/

/** Runs a sequence of DocumentAnnotators as if they were a single DocumentAnnotator.
  *
  * An annotator is skipped when any of its `postAttrs` is already present on the
  * current document. After an annotator runs, each of its `postAttrs` is recorded in
  * the `annotators` map of the Document passed to `process`, mapped to the annotator's class.
  * When `profile` is true, wall-clock time per annotator and total tokens are accumulated.
  *
  * @param annotators  the annotators to apply, in order
  * @param prereqAttrs the prerequisite annotation classes reported by this pipeline (empty by default)
  */
class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator {
  /** When true, `process` accumulates timing and token statistics. */
  var profile = false
  /** Tokens counted by `process` while profiling was enabled. */
  var tokensProcessed = 0
  /** Milliseconds spent in `process` while profiling was enabled. */
  var msProcessed = 0L
  /** Accumulated milliseconds per annotator, kept in insertion order. */
  val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]()

  /** The distinct annotation classes provided by all annotators in this pipeline. */
  def postAttrs = annotators.flatMap(_.postAttrs).distinct

  /** Applies each applicable annotator in order and returns the resulting document. */
  def process(document: Document) = {
    val pipelineStart = System.currentTimeMillis()
    var current = document
    annotators.foreach { annotator =>
      // The check is made against the current document immediately before each annotator runs.
      val alreadyAnnotated = annotator.postAttrs.exists(attr => current.hasAnnotation(attr))
      if (!alreadyAnnotated) {
        val annotatorStart = System.currentTimeMillis()
        current = annotator.process(current)
        if (profile) {
          timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - annotatorStart
        }
        // NOTE(review): the record is written to the `document` argument while the skip check reads
        // `current`; these differ only if an annotator returns a different Document instance. Confirm intended.
        for (attr <- annotator.postAttrs) document.annotators(attr) = annotator.getClass
      }
    }
    if (profile) {
      msProcessed += System.currentTimeMillis() - pipelineStart
      tokensProcessed += current.tokenCount
    }
    current
  }

  /** A human-readable summary of overall throughput followed by one line per profiled annotator. */
  def profileReport: String = {
    val tokens = tokensProcessed.toDouble
    val summary = s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokens*1000.0/msProcessed} tokens / second "
    val perAnnotator = timePerAnnotator.map { case (annotator, ms) =>
      f" ${annotator.getClass.getSimpleName}%30s: ${tokens*1000.0/ms}%4.4f tokens/sec "
    }
    summary + "Speeds of individual components:\n" + perAnnotator.mkString("\n")
  }

  /** Tab-separated per-token annotation strings, one column per annotator. */
  def tokenAnnotationString(token: Token): String = {
    val columns = annotators.map(annotator => annotator.tokenAnnotationString(token))
    columns.mkString("\t")
  }
}

/** Maps an annotation class to a factory for the DocumentAnnotator that provides that annotation.
  * Used to store default ways of getting certain prerequisite annotations. */
class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] {
  /** Registers `annotator` as the provider of every annotation class in its `postAttrs`,
    * overwriting any entry already stored for those classes. */
  def +=(annotator: DocumentAnnotator) = {
    for (attr <- annotator.postAttrs) {
      update(attr, () => annotator)
    }
  }
}

/** A factory for creating DocumentAnnotatorPipelines given requirements about which annotations or which DocumentAnnotators are desired. */
object DocumentAnnotatorPipeline extends FastLogging {

val defaultDocumentAnnotationMap: DocumentAnnotatorMap = new collection.immutable.ListMap ++ Seq(
// Note that order matters here
classOf[pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger),
classOf[parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser),
classOf[segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer),
classOf[cc.factorie.app.nlp.pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger),
classOf[cc.factorie.app.nlp.parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser),
classOf[cc.factorie.app.nlp.segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer),
classOf[Token] -> (() => segment.DeterministicNormalizingTokenizer),
classOf[Sentence] -> (() => segment.DeterministicSentenceSegmenter),
classOf[lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer),
//classOf[lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer),
//classOf[lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer),
//classOf[lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer),
//classOf[lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer),
classOf[ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default?
//classOf[ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer),
classOf[ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer),
//classOf[ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator),
classOf[ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator),
//classOf[coref.mention.NerMentionList] -> (() => coref.mention.NerAndPronounMentionFinder),
//classOf[phrase.GenderLabel[coref.Mention]] -> (() => phrase.GenderLabeler[]),
classOf[phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler),
classOf[phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler),
classOf[phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder),
classOf[coref.WithinDocCoref] -> (() => coref.NerForwardCoref),
classOf[relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder)
//classOf[phrase.NumberLabel[phrase.NounPhrase]] -> (() => phrase.NounPhraseNumberLabeler),
//classOf[MentionEntityType] -> (() => coref.mention.MentionEntityTypeLabeler),
//classOf[cc.factorie.util.coref.GenericEntityMap[coref.mention.Mention]] -> (() => coref.NerForwardCoref)

classOf[cc.factorie.app.nlp.lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer),
classOf[cc.factorie.app.nlp.lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer),
classOf[cc.factorie.app.nlp.lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer),
classOf[cc.factorie.app.nlp.lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer),
classOf[cc.factorie.app.nlp.lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer),
classOf[cc.factorie.app.nlp.ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default?
classOf[cc.factorie.app.nlp.ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer),
classOf[cc.factorie.app.nlp.ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer),
classOf[cc.factorie.app.nlp.ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator),
classOf[cc.factorie.app.nlp.ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator),
classOf[cc.factorie.app.nlp.phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler),
classOf[cc.factorie.app.nlp.phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler),
classOf[cc.factorie.app.nlp.phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder),
classOf[cc.factorie.app.nlp.coref.WithinDocCoref] -> (() => coref.NerForwardCoref),
classOf[cc.factorie.app.nlp.relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder)
)

//def apply(goal: Class[_]): DocumentAnnotationPipeline = apply(Seq(goal), defaultDocumentAnnotationMap)
Expand Down Expand Up @@ -153,5 +98,4 @@ object DocumentAnnotatorPipeline extends FastLogging {
}
}
}
}

}
6 changes: 6 additions & 0 deletions src/main/scala/cc/factorie/app/nlp/DocumentName.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package cc.factorie.app.nlp

/** Attribute attached to a Document to hold the document's name.
  *
  * @param string the name itself
  */
case class DocumentName(string:String) {
  /** Renders as the bare name instead of the default case-class form. */
  override def toString: String = {
    this.string
  }
}
16 changes: 16 additions & 0 deletions src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package cc.factorie.app.nlp

/** A portion of the string contents of a Document, identified by a pair of character offsets.
*
* @author Andrew McCallum */
trait DocumentSubstring {
/** The Document of which this DocumentSubstring is a part. */
def document: Document
/** The character offset into Document.string at which this DocumentSubstring begins. */
def stringStart: Int
/** The character offset into Document.string just past the end of this DocumentSubstring.
In other words, the offset is exclusive: the last character of the DocumentSubstring is Document.string(this.stringEnd-1). */
def stringEnd: Int
/** The substring of the Document encompassed by this DocumentSubstring. */
def string: String
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package cc.factorie.app.nlp

/** Maps an annotation class to a factory for the DocumentAnnotator that provides that annotation.
  * Used to store default ways of getting certain prerequisite annotations. */
class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] {
  /** Registers `annotator` as the provider of every annotation class in its `postAttrs`,
    * overwriting any entry already stored for those classes. */
  def +=(annotator: DocumentAnnotator) = {
    for (attr <- annotator.postAttrs) {
      update(attr, () => annotator)
    }
  }
}
Loading

0 comments on commit 3c86593

Please sign in to comment.