diff --git a/.travis.yml b/.travis.yml
index a222ea6..6908934 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,10 +3,10 @@ language: scala
 jdk: oraclejdk8
 
 scala:
-  - 2.12.4
+  - 2.12.3
 
 script:
-  - sbt ++$TRAVIS_SCALA_VERSION -J-Xmx6G test
+  - sbt ++$TRAVIS_SCALA_VERSION -J-Xmx2000m test
 
 # Use container-based infrastructure
 sudo: false
diff --git a/README.md b/README.md
index 96a3e60..d556860 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[![Build Status](https://travis-ci.org/nlytx/factorie-nlp-api.svg?branch=master)](https://travis-ci.org/nlytx/factorie-nlp-api) ![scalaVersion](https://img.shields.io/badge/scala-2.12.4-blue.svg) ![license](https://img.shields.io/badge/license-Apache%202-blue.svg) [ ![Download](https://api.bintray.com/packages/nlytx-io/factorie-nlp-api/factorie-nlp-api/images/download.svg?version=0.3) ](https://bintray.com/nlytx-io/factorie-nlp-api/factorie-nlp-api/0.3/link)
+[![Build Status](https://travis-ci.org/nlytx/factorie-nlp-api.svg?branch=master)](https://travis-ci.org/nlytx/factorie-nlp-api) ![scalaVersion](https://img.shields.io/badge/scala-2.12.4-blue.svg) ![license](https://img.shields.io/badge/license-Apache%202-blue.svg) [ ![Download](https://api.bintray.com/packages/nlytx-io/factorie-nlp-api/factorie-nlp-api/images/download.svg?version=0.5.0) ](https://bintray.com/nlytx-io/factorie-nlp-api/factorie-nlp-api/0.5.0/link)
 
 # factorie-nlp-api
 
diff --git a/build.sbt b/build.sbt
index ff27b92..755566e 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,12 +1,12 @@
 name := "factorie-nlp-api"
 
-version := "0.3"
+version := "0.5.1"
 
-scalaVersion := "2.12.4"
+scalaVersion := "2.12.3"
 
 organization := "io.nlytx"
 
-val scalaLangV = "2.12.4"
+val scalaLangV = "2.12.3"
 val scalaParserV = "1.0.6"
 val jblasV = "1.2.4"
 val apacheComsCompressV = "1.15"
@@ -22,18 +22,22 @@ val scalaLangDeps = Seq(
 
 val scalaDeps = Seq(
   "com.typesafe.akka" %% "akka-stream" % akkaStreamV,
+  "com.typesafe.akka" %% "akka-slf4j" % akkaStreamV,
   "org.json4s" %% "json4s-jackson" % "3.5.3",
-"cc.factorie.app.nlp" % "all-models" % factorieV
+  "cc.factorie.app.nlp" % "all-models" % factorieV
 )
 
 val javaDeps = Seq(
   "org.jblas" % "jblas" % jblasV,
   "org.apache.commons" % "commons-compress" % apacheComsCompressV,
-  "org.apache.commons" % "commons-lang3" % apacheComsLangV
+  "org.apache.commons" % "commons-lang3" % apacheComsLangV,
+"ch.qos.logback" % "logback-classic" % "1.2.3"
 )
 
 libraryDependencies ++= (scalaLangDeps ++ scalaDeps ++ javaDeps)
 
+resolvers += "IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public"
+
 //"junit" % "junit" % "4.12",
 //"org.scalatest" %% "scalatest" % "3.0.4" % Test,
 //"org.slf4j" % "slf4j-log4j12" % "1.7.25" % Test,
@@ -50,10 +54,10 @@ libraryDependencies ++= (scalaLangDeps ++ scalaDeps ++ javaDeps)
 enablePlugins(JavaAppPackaging) // sbt universal:packageZipTarball
 //dockerExposedPorts := Seq(9000) // sbt docker:publishLocal
 
-javaOptions in Universal ++= Seq(
+//javaOptions in Universal ++= Seq(
   // -J params will be added as jvm parameters
-  "-J-Xmx6g",
-  "-J-Xms3g"
+  //"-J-Xmx6g",
+  //"-J-Xms3g"
 
   // others will be added as app parameters
   // "-Dproperty=true",
@@ -61,6 +65,5 @@
 
   // you can access any build setting/task here
   //s"-version=${version.value}"
-)
+//)
 
-resolvers += "IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public"
\ No newline at end of file
diff --git a/src/main/resources/application.conf b/src/main/resources/application.conf
new file mode 100644
index 0000000..050cf21
--- /dev/null
+++ b/src/main/resources/application.conf
@@ -0,0 +1,6 @@
+akka {
+  loglevel = "DEBUG"
+  loggers = ["akka.event.slf4j.Slf4jLogger"]
+  logging-filter = "akka.event.slf4j.Slf4jLoggingFilter"
+  logger-startup-timeout=30s
+}
\ No newline at end of file
diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml
new file mode 100644
index 0000000..1cd2824
--- /dev/null
+++ b/src/main/resources/logback.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%green(%date{yyyyMMdd_HH:mm:ss.SSS, Australia/Sydney}) %highlight(%-5level) %cyan(%logger{36}) %yellow(%X{akkaSource}) - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="DEBUG">
+        <appender-ref ref="STDOUT"/>
+    </root>
+
+</configuration>
+
diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala
index 7397c56..e5091fd 100644
--- a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala
+++ b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala
@@ -1,10 +1,13 @@
 package cc.factorie.app.nlp
 
+import akka.event.jul.Logger
 import cc.factorie.app.nlp.coref.Mention
 import cc.factorie.app.nlp.phrase.Phrase
 import cc.factorie.util.Threading
 
 trait DocumentAnnotator {
+
+
   def process(document: Document): Document // NOTE: this method may mutate and return the same document that was passed in
   def prereqAttrs: Iterable[Class[_]]
   def postAttrs: Iterable[Class[_]]
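[Editor's note: the DocumentAnnotator trait above is the contract every pipeline stage implements: prereqAttrs declares the annotations a stage requires, postAttrs the annotations it adds, and process does the work, possibly mutating the document in place. A minimal sketch of a custom annotator, with a hypothetical SentenceCount attribute class; the full trait also has abstract members this hunk does not show (e.g. tokenAnnotationString), which the sketch stubs out:

    package cc.factorie.app.nlp

    // Hypothetical attribute class, for illustration only.
    class SentenceCount(val value: Int)

    // Sketch of a custom annotator: counts sentences and attaches the result.
    object SentenceCounter extends DocumentAnnotator {
      def prereqAttrs: Iterable[Class[_]] = Seq(classOf[Sentence])      // needs sentence segmentation first
      def postAttrs: Iterable[Class[_]] = Seq(classOf[SentenceCount])   // declares what it adds
      def process(document: Document): Document = {
        document.attr += new SentenceCount(document.sentences.size)     // mutates and returns the same document
        document
      }
      def tokenAnnotationString(token: Token): String = ""              // stub for the abstract member not shown above
    }
]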
diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala
index de67395..6971a06 100644
--- a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala
+++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala
@@ -3,6 +3,7 @@ package cc.factorie.app.nlp.segment
 import java.io.StringReader
 
 import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token}
+import cc.factorie.app.nlp.segment.EnglishLexer
diff --git a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala b/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala
deleted file mode 100644
index 5af36c7..0000000
--- a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala
+++ /dev/null
@@ -1,132 +0,0 @@
-package io.nlytx.factorie.nlp.api
-
-import akka.NotUsed
-import akka.actor.ActorSystem
-import akka.stream.ActorMaterializer
-import akka.stream.scaladsl.{Flow, Keep, Sink, Source}
-import cc.factorie.app.nlp.Document
-import cc.factorie.app.nlp.parse.OntonotesTransitionBasedParser
-import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger
-import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter, PlainTokenNormalizer}
-import cc.factorie.app.nlp.wordnet.WordNet
-
-import scala.concurrent.ExecutionContext.Implicits.global
-import scala.concurrent.duration._
-import scala.concurrent.{Await, Future}
-import scala.reflect.runtime.universe._
-
-/**
-  * Created by andrew@andrewresearch.net on 24/10/17.
-  */
-
-class DocumentBuilder {
-
-
-  implicit val system: ActorSystem = ActorSystem("factorie-nlp-api-as")
-  implicit val materializer: ActorMaterializer = ActorMaterializer()
-
-
-  //Document Annotators - no models required
-  private val tokeniser = (doc: Document) => DeterministicNormalizingTokenizer.process(doc)
-  private val segmenter = (doc: Document) => DeterministicSentenceSegmenter.process(doc)
-  private val normaliser = (doc: Document) => PlainTokenNormalizer.process(doc)
-
-  //Document Annotators - load models
-  private val postagger = (doc: Document) => OntonotesForwardPosTagger.process(doc)
-  private lazy val wordNetLemmatizer = wordnet.wnLemmatizer
-  private val lemmatiser = (doc: Document) => wordNetLemmatizer.process(doc)
-  private val parser = (doc: Document) => OntonotesTransitionBasedParser.process(doc)
-
-  //Very slow model loading - returns a future
-  private val nerTagger = (doc: Document) => Future(SlowLoad.nerTagger.process(doc))
-
-  //Pipeline types
-  type TokenSegment = Flow[String, Document, NotUsed]
-  type Complete = Flow[String, Document, NotUsed]
-  type PosTag = Flow[Document, Document, NotUsed]
-  type Lemma = Flow[Document, Document, NotUsed]
-  type Parse = Flow[Document, Document, NotUsed]
-  type NerTag = Flow[Document, Document, NotUsed]
-
-  //Pipeline segments
-  private val doc = Flow[String].map(new Document(_))
-  private val tokenise = Flow[Document].map(tokeniser).map(segmenter)
-  private val posTag = Flow[Document].map(normaliser).map(postagger)
-  private val lemma = Flow[Document].map(lemmatiser)
-  private val parse = Flow[Document].map(parser)
-  private val ner = Flow[Document].mapAsync(2)(nerTagger)
-
-  //Pipelines
-  private val tsPipe: TokenSegment = doc via tokenise
-  private val tokenSegmentPipeline = (s:String) => Source.single(s).via(tsPipe).toMat(Sink.head[Document])(Keep.right)
-
-  private val ptPipe: PosTag = posTag
-  private val posTagPipeline = (d:Document) => Source.single(d).via(ptPipe).toMat(Sink.head[Document])(Keep.right)
-
-  private val lemPipe: Lemma = lemma
-  private val lemmaPipeline = (d:Document) => Source.single(d).via(lemPipe).toMat(Sink.head[Document])(Keep.right)
-
-  private val parsePipe: Parse = parse
-  private val parsePipeline = (d:Document) => Source.single(d).via(parsePipe).toMat(Sink.head[Document])(Keep.right)
-
-  private val nerPipe: NerTag = ner
-  private val nerPipeline = (d:Document) => Source.single(d).via(nerPipe).toMat(Sink.head[Document])(Keep.right)
-
-  private val completePipe: Complete = doc via tokenise via posTag via lemma via parse via ner
-  private val completePipeline = (s:String) => Source.single(s).via(completePipe).toMat(Sink.head[Document])(Keep.right)
-
-
-  def process[T: TypeTag](input: Any): Future[Document] = input match {
-    case text: String if typeOf[T] <:< typeOf[Complete] => completePipeline(text).run
-    case text: String if typeOf[T] <:< typeOf[TokenSegment] => tokenSegmentPipeline(text).run
-    case doc: Document if typeOf[T] <:< typeOf[PosTag] => posTagPipeline(doc).run
-    case doc: Document if typeOf[T] <:< typeOf[Lemma] => lemmaPipeline(doc).run
-    case doc: Document if typeOf[T] <:< typeOf[Parse] => parsePipeline(doc).run
-    case doc: Document if typeOf[T] <:< typeOf[NerTag] => parsePipeline(doc).run
-    case _ => {
-      println("Unknown format")
-      Future(new Document(""))
-    }
-  }
-
-  def show(fdoc:Future[Document]) = {
-    val doc = Await.result(fdoc, 120 second)
-    doc.sentences.foreach { s =>
-      println(s"Sentence index: ${s.indexInSection}")
-      println(s"Sentence parse: ${s.parse.toString}")
-      s.tokens.foreach { t =>
-        println(s"Token: ${t.toString}")
-        println(s"Position: ${t.positionInSentence}")
-        println(s"PosTag: ${t.posTag.toString}")
-        println(s"Lemma: ${t.lemmaString}")
-        if (!t.nerTag.isEmpty) println(s"NerTag: ${t.nerTag.baseCategoryValue}")
-      }
-    }
-  }
-
-  def wordnet:WordNet = {
-    val streamFactory = (file:String) => this.getClass.getResourceAsStream("/cc/factorie/app/nlp/wordnet/"+file)
-    new WordNet(streamFactory)
-  }
-
-}
-
-
-
-
-
-
-
-// Example usages:
-// token.sentence.attr[ParseTree].parent(token)
-// sentence.attr[ParseTree].children(token)
-// sentence.attr[ParseTree].setParent(token, parentToken)
-// sentence.attr[ParseTree].label(token)
-// sentence.attr[ParseTree].label(token).set("SUBJ")
-
-// Methods also created in Token supporting:
-// token.parseParent
-// token.setParseParent(parentToken)
-// token.parseChildren
-// token.parseLabel
-// token.leftChildren
\ No newline at end of file
diff --git a/src/main/scala/io/nlytx/factorie_nlp_api/AnnotatorPipelines.scala b/src/main/scala/io/nlytx/factorie_nlp_api/AnnotatorPipelines.scala
new file mode 100644
index 0000000..d0e4c85
--- /dev/null
+++ b/src/main/scala/io/nlytx/factorie_nlp_api/AnnotatorPipelines.scala
@@ -0,0 +1,142 @@
+package io.nlytx.factorie_nlp_api
+
+import akka.NotUsed
+import akka.actor.ActorSystem
+import akka.event.Logging
+import akka.event.slf4j.Logger
+import akka.stream.ActorMaterializer
+import akka.stream.scaladsl.{Flow, Keep, RunnableGraph, Sink, Source}
+import cc.factorie.app.nlp.Document
+import cc.factorie.app.nlp.parse.OntonotesTransitionBasedParser
+import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger
+import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter, PlainTokenNormalizer}
+import cc.factorie.app.nlp.wordnet.WordNet
+
+import scala.concurrent.ExecutionContext.Implicits.global
+import scala.concurrent.duration._
+import scala.concurrent.{Await, Future}
+import scala.reflect.runtime.universe._
+
+/**
+  * Created by andrew@andrewresearch.net on 24/10/17.
+  */
+
+object AnnotatorPipelines {
+
+  implicit val system: ActorSystem = ActorSystem("factorie-nlp-api-as")
+  implicit val materializer: ActorMaterializer = ActorMaterializer()
+
+  val logger = Logging(system.eventStream, "factorie-nlp-api")
+
+  type Pipeline = String => RunnableGraph[Future[Document]]
+  type DocPipeline = Document => RunnableGraph[Future[Document]]
+
+  //Make Document
+  private lazy val doc = Flow[String].map(new Document(_))
+
+  //Document Annotators - no models required
+  private lazy val tokeniser = (doc: Document) => DeterministicNormalizingTokenizer.process(doc)
+  private lazy val segmenter = (doc: Document) => DeterministicSentenceSegmenter.process(doc)
+  private lazy val normaliser = (doc: Document) => PlainTokenNormalizer.process(doc)
+
+  //Document Annotators - load models
+  private lazy val postagger = (doc: Document) => OntonotesForwardPosTagger.process(doc)
+  private lazy val wordNetLemmatizer = wordnet.wnLemmatizer
+  private lazy val lemmatiser = (doc: Document) => wordNetLemmatizer.process(doc)
+  private lazy val parser = (doc: Document) => OntonotesTransitionBasedParser.process(doc)
+
+  //Very slow model loading - returns a future
+  private lazy val nerTagger = (doc: Document) => Future(SlowLoad.nerTagger.process(doc))
+
+
+  //Pipelines in order of complexity
+
+  val tokenPipeline = (s:String) =>
+    Source.single(s)
+      .via(doc.map(tokeniser))
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val segmentPipeline = (s:String) =>
+    Source.single(s)
+      .via(doc.map(tokeniser).map(segmenter))
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val postagPipeline = (s:String) =>
+    Source.single(s)
+      .via(doc.map(tokeniser).map(segmenter).map(normaliser).map(postagger))
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val fastPipeline = postagPipeline
+
+  val lemmaPipeline = (s:String) =>
+    Source.single(s)
+      .via(doc.map(tokeniser).map(segmenter).map(normaliser).map(postagger).map(lemmatiser))
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val defaultPipeline = lemmaPipeline
+
+  val parsePipeline = (d:Document) =>
+    Source.single(d)
+      .map(parser)
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val nerPipeline = (d:Document) =>
+    Source.single(d)
+      .mapAsync(2)(nerTagger)
+      .toMat(Sink.head[Document])(Keep.right)
+
+  val completePipeline = nerPipeline
+
+  /* The main method for running a pipeline */
+  def process(text:String, pipeline:Pipeline = defaultPipeline):Future[Document] = pipeline(text).run
+
+  def processDoc(doc:Document, pipeline:DocPipeline):Future[Document] = pipeline(doc).run
+
+  def profile(text:String, pipeline:Pipeline = defaultPipeline, wait:Int = 180):Document = {
+    logger.info(s"Profiling pipeline...")
+    val start = System.currentTimeMillis()
+    val doc = Await.result(process(text,pipeline), wait seconds)
+    val time = System.currentTimeMillis() - start
+    logger.info(s"Completed in ${time} ms")
+    doc
+  }
+
+  def wordnet:WordNet = {
+    val streamFactory = (file:String) => this.getClass.getResourceAsStream("/cc/factorie/app/nlp/wordnet/"+file)
+    new WordNet(streamFactory)
+  }
+
+}
+
+
+//def show(fdoc:Future[Document]) = {
+//  val doc = Await.result(fdoc, 120 second)
+//  doc.sentences.foreach { s =>
+//    println(s"Sentence index: ${s.indexInSection}")
+//    println(s"Sentence parse: ${s.parse.toString}")
+//    s.tokens.foreach { t =>
+//      println(s"Token: ${t.toString}")
+//      println(s"Position: ${t.positionInSentence}")
+//      println(s"PosTag: ${t.posTag.toString}")
+//      println(s"Lemma: ${t.lemmaString}")
+//      if (!t.nerTag.isEmpty) println(s"NerTag: ${t.nerTag.baseCategoryValue}")
+//    }
+//  }
+//}
+
+
+
+
+// Example usages:
+// token.sentence.attr[ParseTree].parent(token)
+// sentence.attr[ParseTree].children(token)
+// sentence.attr[ParseTree].setParent(token, parentToken)
+// sentence.attr[ParseTree].label(token)
+// sentence.attr[ParseTree].label(token).set("SUBJ")
+
+// Methods also created in Token supporting:
+// token.parseParent
+// token.setParseParent(parentToken)
+// token.parseChildren
+// token.parseLabel
+// token.leftChildren
\ No newline at end of file
diff --git a/src/main/scala/io/nlytx/factorie/nlp/api/SlowLoad.scala b/src/main/scala/io/nlytx/factorie_nlp_api/SlowLoad.scala
similarity index 97%
rename from src/main/scala/io/nlytx/factorie/nlp/api/SlowLoad.scala
rename to src/main/scala/io/nlytx/factorie_nlp_api/SlowLoad.scala
index 583aaf0..bdd4a54 100644
--- a/src/main/scala/io/nlytx/factorie/nlp/api/SlowLoad.scala
+++ b/src/main/scala/io/nlytx/factorie_nlp_api/SlowLoad.scala
@@ -1,4 +1,4 @@
-package io.nlytx.factorie.nlp.api
+package io.nlytx.factorie_nlp_api
 
 import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons}
 import cc.factorie.app.nlp.ner.StaticLexiconFeatures
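[Editor's note: with DocumentBuilder removed, AnnotatorPipelines is the public entry point of this changeset. A usage sketch against the new API as shown in the diff above; the sample text, timeout, and the names PipelineExample/text/parsedDoc are illustrative, not part of the change:

    import scala.concurrent.Await
    import scala.concurrent.duration._
    import io.nlytx.factorie_nlp_api.AnnotatorPipelines

    object PipelineExample extends App {
      val ap = AnnotatorPipelines
      // defaultPipeline = tokenise + segment + normalise + POS-tag + lemmatise
      val doc = Await.result(ap.process("The quick brown fox jumps over the lazy dog."), 60.seconds)
      doc.sentences.foreach(_.tokens.foreach(t => println(s"${t.string}\t${t.posTag}\t${t.lemmaString}")))

      // Lighter or heavier alternatives:
      // ap.process(text, ap.fastPipeline)         // stops after POS tagging
      // ap.profile(text, ap.fastPipeline)         // blocks and logs the elapsed time
      // ap.processDoc(parsedDoc, ap.nerPipeline)  // NER over an already-annotated Document
    }
]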