Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
#18 progress: Added AnalysisEngine, which can process multiple texts …
Browse files Browse the repository at this point in the history
…asynchronously.
  • Loading branch information
jasonbaldridge committed Jul 21, 2013
1 parent 2edd8c5 commit 8dd6388
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 19 deletions.
72 changes: 54 additions & 18 deletions src/main/scala/chalk/slab/AnalysisEngine.scala
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package chalk.slab

import akka.actor.{Actor,ActorLogging,ActorSystem,Props}
import akka.pattern.ask
import akka.pattern.{ask,pipe}
import akka.util.Timeout
import scala.collection.mutable.ListBuffer
import scala.concurrent.duration._
import scala.concurrent.Future

/**
* An actor that mixes-in an AnalysisFunction and hands Slabs contained in Process messages over
Expand Down Expand Up @@ -35,43 +36,78 @@ class SentenceSegmenterActor extends SentenceSegmenter
*/
// Actor form of Tokenizer: AnalysisEngine instantiates it with Props[TokenizerActor] and sends it
// Process messages, mapping the reply to Slab[String,StringAnnotation,Sentence with Token].
// NOTE(review): presumably the four type arguments are content type, base annotation type,
// required input annotation and produced annotation -- confirm against AnalysisComponent.
class TokenizerActor extends AnalysisComponent[String, StringAnnotation, Sentence, Token] with Tokenizer


/**
 * An analysis engine that runs Slabs through a pipeline of AnalysisComponents: a sentence
 * segmenter followed by a tokenizer.
 *
 * The analyzers are declared explicitly here; ideally this would be done through
 * configuration. No compile-time consistency check for the types in the pipeline is
 * performed, so the result types are asserted with `mapTo` at each stage.
 *
 * Messages handled:
 *  - `Process(slab)`: asks the sentence segmenter, then the tokenizer, and pipes the slab
 *    returned by the tokenizer back to the sender.
 *  - `ProcessCorpus(texts)`: wraps every text in a Slab, asks this same actor to `Process`
 *    each one, and pipes the collected results back to the sender.
 *
 * Every ask uses the 10 second timeout declared below.
 */
class AnalysisEngine extends Actor with ActorLogging {

  import AnalysisComponent._
  import AnalysisEngine._
  import StringAnnotation._

  implicit val ec = context.dispatcher
  implicit val timeout: Timeout = Timeout(10.seconds)

  // Created through `context` (not `context.system`) so that both stages are children of
  // this engine: they are supervised by it and are stopped when the engine stops, instead
  // of lingering as unrelated top-level actors.
  val sentenceSegmenter = context.actorOf(Props[SentenceSegmenterActor])
  val tokenizer = context.actorOf(Props[TokenizerActor])

  def receive = {
    case Process(slab) =>
      log.info("Processing slab:\n " + slab.content)
      // The two stages are sequential: the tokenizer is only asked once the segmenter
      // has answered.
      val analyzed =
        for {
          slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]
          slab2 <- (tokenizer ? Process(slab1)).mapTo[Slab[String,StringAnnotation,Sentence with Token]]
        } yield slab2
      // `sender` is read here, synchronously inside receive, not inside a future callback.
      analyzed pipeTo sender

    case ProcessCorpus(texts) =>
      // Each text is sent back to this actor as its own Process message; traverse gathers
      // the individual replies into a single future.
      Future.traverse(texts)(text => self ? Process(Slab(text))) pipeTo sender
  }
}

/**
 * Companion of the AnalysisEngine actor. It declares the ProcessCorpus message and hosts an
 * example application doing actor based Slab processing on three sample texts.
 */
object AnalysisEngine {

  /** Request to analyze every text produced by `corpus`. */
  case class ProcessCorpus(corpus: Iterator[String])

  import AnalysisComponent._
  import StringAnnotation._

  val text1 = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

  // Earlier name of `text1`; kept (and defined after it, so it is initialized) so that
  // existing references to `AnalysisEngine.text` keep resolving.
  val text = text1

  val text2 = "How much wood can a woodchuck chuck? Can a woodchuck chuck wood?"

  val text3 = "The Association for Computational Linguistics is proud to present its first Lifetime Achievement Award to Prof. Aravind Joshi of the University of Pennsylvania. Aravind Joshi was born in 1929 in Pune, India, where he completed his secondary education as well as his first degree in Mechanical and Electrical Engineering, the latter in 1950. He worked as a research assistant in Linguistics at Penn from 1958-60, while completing his Ph.D. in Electrical Engineering, in 1960. Joshi's work and the work of his Penn colleagues at the frontiers of Cognitive Science was rewarded in 1991 by the establishment of a National Science Foundation Science and Technology Center for Research in Cognitive Science, which Aravind Joshi co-directed until 2001. Dr. Joshi has supervised thirty-six Ph.D. theses to-date, on topics including information and coding theory, and also pure linguistics."

  /**
   * Starts an actor system, sends the three sample texts to an AnalysisEngine as one
   * ProcessCorpus request, prints the sentences and tokens of every returned slab, and
   * shuts the actor system down once the request has completed (successfully or not).
   *
   * @param args command line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import scala.util.{Failure, Success}

    val system = ActorSystem("ChalkSystem")

    implicit val ec = system.dispatcher
    implicit val timeout: Timeout = Timeout(10.seconds)

    val engine = system.actorOf(Props[AnalysisEngine])
    val corpus = Iterator(text1, text2, text3)

    val analyzed =
      (engine ? ProcessCorpus(corpus)).mapTo[Iterator[Slab[String,StringAnnotation,Sentence with Token]]]

    // Shut down when the answer (or the failure) actually arrives rather than after a
    // fixed sleep, which could cut off slow runs and hid failed requests.
    analyzed.onComplete { outcome =>
      try {
        outcome match {
          case Success(slabs) => slabs.foreach(printAnalysis)
          case Failure(cause) => System.err.println("Corpus analysis failed: " + cause)
        }
      } finally {
        system.shutdown()
      }
    }
  }

  /** Prints the sentences and then the tokens annotated on `slab`, one per line. */
  private def printAnalysis(slab: Slab[String,StringAnnotation,Sentence with Token]): Unit = {
    // Notice that the last sentence (lacking EOS char) is missing.
    val sentences = slab.iterator[Sentence].toList
    println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))

    val tokens = slab.iterator[Token].toList
    println("\nTOKENS\n\n" + tokens.map(_.in(slab).content).mkString("\n"))
  }

}
2 changes: 1 addition & 1 deletion src/main/scala/chalk/slab/AnalysisFunction.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ object AnalysisPipeline {
val sentenceSegmenter = new SentenceSegmenter{}
val tokenizer = new Tokenizer {}
val pipeline = StringIdentityAnalyzer andThen sentenceSegmenter andThen tokenizer
val slab = pipeline(Slab(AnalysisEngine.text))
val slab = pipeline(Slab(AnalysisEngine.text1))
// Notice that the last sentence (lacking EOS char) is missing.
val sentences = slab.iterator[Sentence].toList
println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))
Expand Down

0 comments on commit 8dd6388

Please sign in to comment.